## 1. Import Modules

In [1]:
import pyspark as ps
# from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import SQLContext

In [2]:
# spark = SparkSession\
#         .builder\
#         .appName("PythonPi")\
#         .getOrCreate()

In [3]:
sc = ps.SparkContext('local[4]')
sqlContext = SQLContext(sc)

## 2. Load the train and test data

**Load and inspect train data**

In [4]:
df = sqlContext.read.csv(
    "./input/train.csv", header=True, mode="DROPMALFORMED")

In [5]:
df.count()

18047

In [6]:
df.show()

+-------+--------------------+------+
|     id|                text|author|
+-------+--------------------+------+
|id26305|This process, how...|   EAP|
|id17569|It never once occ...|   HPL|
|id11008|In his left hand ...|   EAP|
|id27763|How lovely is spr...|   MWS|
|id12958|Finding nothing e...|   HPL|
|id22965|A youth passed in...|   MWS|
|id09674|The astronomer, p...|   EAP|
|id13515|The surcingle hun...|   EAP|
|id19322|I knew that you c...|   EAP|
|id00912|I confess that ne...|   MWS|
|id16737|"He shall find th...|   MWS|
|id16607|Here we barricade...|   EAP|
|id19764|Herbert West need...|   HPL|
|id18886|The farm like gro...|   HPL|
|id17189|But a glance will...|   EAP|
|id12799|He had escaped me...|   MWS|
|id08441|To these speeches...|   EAP|
|id13117|Her native sprigh...|   MWS|
|id14862|I even went so fa...|   EAP|
|id20836|His facial aspect...|   HPL|
+-------+--------------------+------+
only showing top 20 rows



In [7]:
df.groupBy("author").count().show()

+------+-----+
|author|count|
+------+-----+
|   MWS| 5552|
|   HPL| 5451|
|   EAP| 7044|
+------+-----+



**Load and inspect test data**

In [8]:
test_df =  sqlContext.read.csv(
    "./input/test.csv", header=True, mode="DROPMALFORMED")

In [9]:
test_df.count()

7811

In [10]:
test_df.show()

+-------+--------------------+
|     id|                text|
+-------+--------------------+
|id02310|Still, as I urged...|
|id24541|If a fire wanted ...|
|id00134|And when they had...|
|id27757|While I was think...|
|id04081|I am not sure to ...|
|id24265|That which is not...|
|id25917|I sought for repo...|
|id04951|Upon the fourth d...|
|id14549|"""The tone metap...|
|id22505|These, the offspr...|
|id15181|When I arose trem...|
|id21888|And by the shores...|
|id12035|Idris heard of he...|
|id17991|I say this proudl...|
|id00345|At his nod I took...|
|id05912|No one doubted no...|
|id13443|But although, in ...|
|id09248|Festivity, and ev...|
|id17542|For I am Iranon, ...|
|id25159|I am serious in a...|
+-------+--------------------+
only showing top 20 rows



## 3. Preprocess Text Data

**Import nltk modules**

In [11]:
import nltk
import nltk, re, time
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

**Extract stopwords and create stemmer object**

In [12]:
sw = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")

**A function for text processing**

In [13]:
def text_precessing(text):
    
    '''a function for preprocessing'''
    import string
    
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    
    text = str(text)
    
    # replacing the punctuations with no space,which in effect deletes the punctuation marks 
    text = text.translate(translator)
    
    # remove stop word
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    text = " ".join(text)
    
    # stemming
    text = [stemmer.stem(word) for word in text.split()]
    text = " ".join(text) 
    
    # Clean the text
    text = re.sub(r"<br />", " ", text)
    text = re.sub(r"[^a-z]", " ", text)
    text = re.sub(r"   ", " ", text) # Remove any extra spaces
    text = re.sub(r"  ", " ", text)
    
    return text

**RDD mapping function**

In [14]:
def data_transformation(row):
    
    iD = row['id']
    text = row['text']
    if len(row) == 3:    
        auth = row['author']
    
    text = text_precessing(text)
    
    if len(row) == 3:   
        values = Row(id = iD, text = text, author = auth)
    else:
        values = Row(id = iD, text = text)
    
    return values    

**Extract RDD from dataframe and apply the mapping function**

In [15]:
data_rdd = df.rdd
test_data_rdd = test_df.rdd

In [16]:
processed_rdd = data_rdd.map(data_transformation)
processed_test_rdd = test_data_rdd.map(data_transformation)

In [17]:
processed_rdd.take(5)

[Row(author='EAP', id='id26305', text='process howev afford mean ascertain dimens dungeon might make circuit return point whenc set without awar fact perfect uniform seem wall'),
 Row(author='HPL', id='id17569', text='never occur fumbl might mere mistak'),
 Row(author='EAP', id='id11008', text='left hand gold snuff box caper hill cut manner fantast step took snuff incess air greatest possibl self satisfact'),
 Row(author='MWS', id='id27763', text='love spring look windsor terrac sixteen fertil counti spread beneath speckl happi cottag wealthier town look former year heart cheer fair'),
 Row(author='HPL', id='id12958', text='find noth els even gold superintend abandon attempt perplex look occasion steal counten sit think desk')]

In [18]:
processed_test_rdd.take(5)

[Row(id='id02310', text='still urg leav ireland inquietud impati father thought best yield'),
 Row(id='id24541', text='fire want fan could readili fan newspap govern grew weaker doubt leather iron acquir durabl proport short time pair bellow rotterdam ever stood need stitch requir assist hammer'),
 Row(id='id00134', text='broken frail door found two clean pick human skeleton earthen floor number singular beetl crawl shadowi corner'),
 Row(id='id27757', text='think possibl manag without one actual tumbl head roll steep side steepl lodg rain gutter ran along eav main build'),
 Row(id='id04081', text='sure limit knowledg may extend')]

**Turn the processed data back to dataframe**

In [19]:
processed_df = sqlContext.createDataFrame(processed_rdd)
processed_test_df = sqlContext.createDataFrame(processed_test_rdd)

In [20]:
processed_df.show()

+------+-------+--------------------+
|author|     id|                text|
+------+-------+--------------------+
|   EAP|id26305|process howev aff...|
|   HPL|id17569|never occur fumbl...|
|   EAP|id11008|left hand gold sn...|
|   MWS|id27763|love spring look ...|
|   HPL|id12958|find noth els eve...|
|   MWS|id22965|youth pass solitu...|
|   EAP|id09674|astronom perhap p...|
|   EAP|id13515|surcingl hung rib...|
|   EAP|id19322|knew could say st...|
|   MWS|id00912|confess neither s...|
|   MWS|id16737|shall find feel i...|
|   EAP|id16607|barricad present ...|
|   HPL|id19764|herbert west need...|
|   HPL|id18886|farm like ground ...|
|   EAP|id17189|glanc show fallac...|
|   MWS|id12799|escap must commen...|
|   EAP|id08441|speech gave cours...|
|   MWS|id13117|nativ sprightli n...|
|   EAP|id14862|even went far spe...|
|   HPL|id20836|facial aspect rem...|
+------+-------+--------------------+
only showing top 20 rows



In [21]:
processed_test_df.show()

+-------+--------------------+
|     id|                text|
+-------+--------------------+
|id02310|still urg leav ir...|
|id24541|fire want fan cou...|
|id00134|broken frail door...|
|id27757|think possibl man...|
|id04081|sure limit knowle...|
|id24265|matter unless qua...|
|id25917|sought repos alth...|
|id04951|upon fourth day a...|
|id14549|tone metaphys als...|
|id22505|offspr later peri...|
|id15181|aros trembl know ...|
|id21888|shore river zair ...|
|id12035|idri heard mother...|
|id17991|say proud tear ey...|
|id00345|nod took one latt...|
|id05912|one doubt mysteri...|
|id13443|although one two ...|
|id09248|festiv even liber...|
|id17542|   iranon princ aira|
|id25159|serious assert br...|
+-------+--------------------+
only showing top 20 rows



## 4. Create TF-IDF Matrix

**Import modules for creating TF-IDF matrix**

In [22]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import CountVectorizer, Tokenizer
from pyspark.ml.feature import IDF

**Create train validation split**

In [23]:
(train_set, val_set) = processed_df.randomSplit([0.80, 0.20], seed = 50)

**Create pipeline for train and validation set**

In [24]:
# create tokenizer object
tokenizer = Tokenizer(inputCol="text", outputCol="words")

In [25]:
# create count vectorizer object
cv = CountVectorizer(vocabSize=2**15, inputCol="words", outputCol='cv')

In [26]:
# create tf-idf object
idf = IDF(inputCol='cv', outputCol="features", minDocFreq=5)

In [27]:
# create string indexer object to transform the categorical output value
label_stringIdx = StringIndexer(inputCol = "author", outputCol = "label")

In [28]:
# create the pipeline
pipeline = Pipeline(stages=[tokenizer, cv, idf, label_stringIdx])

In [29]:
pipeline_fit = pipeline.fit(train_set)

In [30]:
# transform the train set
train_df = pipeline_fit.transform(train_set)

In [31]:
# transform the validation set
val_df = pipeline_fit.transform(val_set)

**Create pipeline for test set**

In [32]:
pipeline_test = Pipeline(stages=[tokenizer, cv, idf])
pipeline_test_fit = pipeline_test.fit(train_set)

In [33]:
test_df = pipeline_test_fit.transform(processed_test_df)

**Display the transformed train, validation and test set**

In [34]:
train_df.show()

+------+-------+--------------------+--------------------+--------------------+--------------------+-----+
|author|     id|                text|               words|                  cv|            features|label|
+------+-------+--------------------+--------------------+--------------------+--------------------+-----+
|   EAP|id00003|burn know incid p...|[burn, know, inci...|(13256,[70,82,239...|(13256,[70,82,239...|  0.0|
|   EAP|id00006|difficult given t...|[difficult, given...|(13256,[0,1,4,43,...|(13256,[0,1,4,43,...|  0.0|
|   EAP|id00007|cannot maintain e...|[cannot, maintain...|(13256,[3,5,25,14...|(13256,[3,5,25,14...|  0.0|
|   EAP|id00012|      deposit coffin|   [deposit, coffin]|(13256,[1143,1229...|(13256,[1143,1229...|  0.0|
|   EAP|id00015|ten reason believ...|[ten, reason, bel...|(13256,[182,206,3...|(13256,[182,206,3...|  0.0|
|   EAP|id00021|besid thing seen ...|[besid, thing, se...|(13256,[7,113,118...|(13256,[7,113,118...|  0.0|
|   EAP|id00024|wyatt parti arriv...|

In [35]:
val_df.show()

+------+-------+--------------------+--------------------+--------------------+--------------------+-----+
|author|     id|                text|               words|                  cv|            features|label|
+------+-------+--------------------+--------------------+--------------------+--------------------+-----+
|   EAP|id00004|might see perhap ...|[might, see, perh...|(13256,[0,13,49,5...|(13256,[0,13,49,5...|  0.0|
|   EAP|id00017|seen pp illustri ...|[seen, pp, illust...|(13256,[21,22,43,...|(13256,[21,22,43,...|  0.0|
|   EAP|id00027|      reason conceal|   [reason, conceal]|(13256,[182,647],...|(13256,[182,647],...|  0.0|
|   EAP|id00030|dun meantim left ...|[dun, meantim, le...|(13256,[33,71,112...|(13256,[33,71,112...|  0.0|
|   EAP|id00034|warehous resort p...|[warehous, resort...|(13256,[28,110,24...|(13256,[28,110,24...|  0.0|
|   EAP|id00051|observ although o...|[observ, although...|(13256,[1,9,10,12...|(13256,[1,9,10,12...|  0.0|
|   EAP|id00063|pest spirit plagu...|

In [36]:
test_df.show()

+-------+--------------------+--------------------+--------------------+--------------------+
|     id|                text|               words|                  cv|            features|
+-------+--------------------+--------------------+--------------------+--------------------+
|id02310|still urg leav ir...|[still, urg, leav...|(13256,[16,29,134...|(13256,[16,29,134...|
|id24541|fire want fan cou...|[fire, want, fan,...|(13256,[2,4,55,20...|(13256,[2,4,55,20...|
|id00134|broken frail door...|[broken, frail, d...|(13256,[18,49,64,...|(13256,[18,49,64,...|
|id27757|think possibl man...|[think, possibl, ...|(13256,[0,62,80,1...|(13256,[0,62,80,1...|
|id04081|sure limit knowle...|[sure, limit, kno...|(13256,[53,317,44...|(13256,[53,317,44...|
|id24265|matter unless qua...|[matter, unless, ...|(13256,[7,205,905...|(13256,[7,205,905...|
|id25917|sought repos alth...|[sought, repos, a...|(13256,[0,78,89,1...|(13256,[0,78,89,1...|
|id04951|upon fourth day a...|[upon, fourth, da...|(13256,[1

## 5. Fit various machine learning models

### Fit naive bayes classifier

**Create and fit model**

In [37]:
from pyspark.ml.classification import NaiveBayes

In [38]:
nb_model = NaiveBayes(featuresCol="features", labelCol="label",  modelType='multinomial')

In [39]:
nb_model_fitted = nb_model.fit(train_df)

**Evaluate Model and create submission**

In [40]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [41]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label", metricName="accuracy")

In [42]:
predictions1 = nb_model_fitted.transform(val_df)

In [43]:
evaluator.evaluate(predictions1)

0.798284449363586

In [44]:
submission1 = nb_model_fitted.transform(test_df)
submission1.select(["id", "prediction"]).toPandas().to_csv('submission1.csv', index = False)

### Fit logistic regression model

**Create and fit model**

In [45]:
from pyspark.ml.classification import LogisticRegression

In [46]:
lr = LogisticRegression(maxIter=1000,featuresCol="features", labelCol="label")

In [47]:
lr_fitted = lr.fit(train_df)

**Evaluate Model and create submission**

In [48]:
predictions2 = nb_model_fitted.transform(val_df)

In [49]:
evaluator.evaluate(predictions2)

0.798284449363586

In [50]:
submission2 = nb_model_fitted.transform(test_df)
submission2.select(["id", "prediction"]).toPandas().to_csv('submission2.csv', index = False)

### Fit Linear Support Vector Machine

**Create and fit model**

In [51]:
from pyspark.ml.classification import OneVsRest, LinearSVC

In [52]:
lsvc = LinearSVC(featuresCol='features', labelCol='label', maxIter=80, regParam=0.1)

In [53]:
lsvc_ovr = OneVsRest(classifier=lr)

In [54]:
lsvc_fitted = lsvc_ovr.fit(train_df)

**Evaluate Model and create submission**

In [55]:
predictions3 = lsvc_fitted.transform(val_df)

In [56]:
evaluator.evaluate(predictions3)

0.7230215827338129

In [57]:
submission3 = lsvc_fitted.transform(test_df)
submission3.select(["id", "prediction"]).toPandas().to_csv('submission3.csv', index = False)