In [2]:
from pyspark.sql.types import StructType, StructField, StringType, LongType
# Explicit schema for the tweet JSON records: a string label, a 64-bit tweet id
# and the raw tweet text. Every field is declared nullable.
jsonSchema = (
    StructType()
    .add('label', StringType(), True)
    .add('tweet_id', LongType(), True)
    .add('tweet_text', StringType(), True)
)

# Replace this glob with the location of the JSON data on your machine.
tweets_glob = "/Users/Pavel/Documents/KULeuven/Courses/AdvancedAnalyticsinBigDataWorld/spark/data/*"

# Read every file matched by the glob as JSON, applying the schema above
# instead of letting Spark infer one.
json_reader = spark.read.format("json").schema(jsonSchema)
df = json_reader.load(tweets_glob)

In [3]:
import pyspark.sql.functions as f
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import ltrim

# Normalise `tweet_text` ahead of tokenisation. The order of the steps matters.
whitespace_run = '[\r\n\t\f\v ]+'

# 1. Convert all letters to lowercase.
cleaned_text = f.lower(f.col("tweet_text"))

# 2. Turn every whitespace run (newlines, tabs, ...) into a single space BEFORE
#    stripping characters. The strip in step 3 deletes anything that is not a
#    letter, space or apostrophe, which includes newlines and tabs; if it ran
#    first, words separated only by a line break would be glued together
#    ("hello\nworld" -> "helloworld").
cleaned_text = f.regexp_replace(cleaned_text, whitespace_run, ' ')

# 3. Remove punctuation, digits and every other character that is not a
#    letter, a space or an apostrophe.
cleaned_text = f.regexp_replace(cleaned_text, '([^ a-zA-Z\'])', '')

# 4. Remove links. After step 3 a URL has collapsed into one run of letters
#    starting with "http", so the lazy match up to the next word boundary
#    swallows that whole run.
cleaned_text = f.regexp_replace(cleaned_text, 'http.*?\\b', ' ')

# 5. Collapse the space runs left behind by steps 3-4 and drop leading spaces.
cleaned_text = f.ltrim(f.regexp_replace(cleaned_text, whitespace_run, ' '))

df = df.withColumn("tweet_text", cleaned_text)

## Pipeline preparation

In [4]:
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression


# Stage 1 - split the cleaned tweet text into words (tweet_text -> words).
tokenizer = Tokenizer(
    inputCol="tweet_text",
    outputCol="words",
)

# Stage 2 - drop stop words (words -> filtered). A few tweet-specific tokens
# are placed in front of StopWordsRemover's default list.
stopwordList = ["u", "ur", "amp", "q"] + list(StopWordsRemover().getStopWords())
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
    stopWords=stopwordList,
)

# Stage 3 - turn the filtered words into count vectors (filtered -> features).
cv = CountVectorizer(
    inputCol="filtered",
    outputCol="features",
)

# Stage 4 - encode the string label as a numeric index (label -> labelIndex).
label_stringIdx = StringIndexer(
    inputCol="label",
    outputCol="labelIndex",
)

# Stage 5 - logistic regression on the count vectors.
# elasticNetParam=0 selects a pure L2 penalty, weighted by regParam.
lr_max_iter = 20
lr_reg_param = 0.3
lr_elastic_net = 0
lr = LogisticRegression(
    labelCol="labelIndex",
    featuresCol="features",
    maxIter=lr_max_iter,
    regParam=lr_reg_param,
    elasticNetParam=lr_elastic_net,
)

# Chain the stages in execution order.
pipeline_stages = [tokenizer, remover, cv, label_stringIdx, lr]
pipeline = Pipeline(stages=pipeline_stages)




In [9]:
# Fit every pipeline stage, in order, on the cleaned tweets.
pipelineFit = pipeline.fit(dataset=df)

In [11]:
# Persist the fitted pipeline under 'lr_model', replacing any existing copy.
model_writer = pipelineFit.write().overwrite()
model_writer.save('lr_model')