In [None]:
import os

from pyspark.sql.types import StructType, StructField, StringType, LongType

# Explicit schema for the tweet JSON records, so column names and types are
# fixed up front: a string label, a 64-bit tweet id and the tweet text
# (all three declared nullable).
jsonSchema = StructType([
    StructField('label', StringType(), True),
    StructField('tweet_id', LongType(), True),
    StructField('tweet_text', StringType(), True)
])

# Input location (glob pattern). The default is the original author's local
# directory; set the TWEETS_DATA_PATH environment variable to run the
# notebook on another machine without editing this cell.
DATA_PATH = os.environ.get(
    "TWEETS_DATA_PATH",
    "/Users/Pavel/Documents/KULeuven/Courses/AdvancedAnalyticsinBigDataWorld/spark/data/*",
)

# NOTE(review): `spark` is not defined in this cell — presumably the
# SparkSession provided by the kernel or an earlier cell; confirm.
df = spark.read.format("json").schema(jsonSchema).load(DATA_PATH)

In [None]:
# Preview the loaded tweets using show()'s default settings, spelled out
# explicitly: first 20 rows, long cell values truncated.
df.show(n=20, truncate=True)

### TF-IDF Hashing

In [None]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
# --- Declare the three TF-IDF stages -------------------------------------
# Tokenise the tweet text into a "words" column.
tokenizer = Tokenizer(
    inputCol="tweet_text",
    outputCol="words",
)
# Hash the tokens into a term-frequency vector ("rawFeatures").
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
)
# Rescale the raw term frequencies by inverse document frequency ("features").
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)

# --- Apply them in order --------------------------------------------------
wordsData = tokenizer.transform(df)
featurizedData = hashingTF.transform(wordsData)
# IDF is an estimator: it has to be fitted on the hashed data before it can
# transform it.
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Show the label next to its TF-IDF feature vector.
(
    rescaledData
    .select("label", "features")
    .show()
)


### CountVectorizer step by step

In [None]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
# Tokenizer stage for the CountVectorizer pipeline: reads "tweet_text" and
# writes the tokens to "words". (Same configuration as the tokenizer created
# in the TF-IDF section above.)
tokenizer = Tokenizer(
    inputCol="tweet_text",
    outputCol="words",
)

In [None]:
from pyspark.ml.feature import StopWordsRemover

# Stop-word filter: reads the tokens in "words", writes the result to "filtered".
# TODO: also remove informal abbreviations that StopWordsRemover does not
# catch, such as "ur" or "u".
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
)



In [None]:
from pyspark.ml.feature import CountVectorizer

# Count-vectoriser stage: reads the stop-word-filtered tokens in "filtered"
# and writes the resulting vector to "features".
# It is only declared here; fitting happens when the Pipeline cell below
# fits all of its stages together.
cv = CountVectorizer(
    inputCol="filtered",
    outputCol="features",
)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
# Encode the string "label" column as an index column, "labelIndex".
label_stringIdx = StringIndexer(inputCol="label", outputCol="labelIndex")

# Chain the stages: tokenise -> drop stop words -> count-vectorise -> index label.
pipeline = Pipeline(
    stages=[
        tokenizer,
        remover,
        cv,
        label_stringIdx,
    ]
)

# Fit every stage on the full DataFrame, then run that same DataFrame
# through the fitted pipeline.
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)

# Preview the result, cutting each cell off at 30 characters.
dataset.show(truncate=30)

In [None]:
# Same preview with show()'s default settings, spelled out explicitly:
# first 20 rows, long cell values truncated.
dataset.show(n=20, truncate=True)

In [None]:
# 70/30 train/test split; the seed is fixed for reproducibility.
trainingData, testData = dataset.randomSplit([0.7, 0.3], seed=100)

# Report how many rows landed in each split.
print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")

In [None]:

# Imported in this cell because none of the cells shown above bring
# LogisticRegression into scope (a fresh kernel would raise NameError).
from pyspark.ml.classification import LogisticRegression

# Regularised logistic regression (elasticNetParam=0, regParam=0.3), capped
# at 20 iterations.
# labelCol must be the numeric "labelIndex" produced by the StringIndexer
# stage: the raw "label" column is declared as a string in the schema, and
# LogisticRegression's default labelCol ("label") would point at it.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="labelIndex",
    maxIter=20,
    regParam=0.3,
    elasticNetParam=0,
)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

# Inspect the ten highest-ranked test rows that were predicted as class 0.
# The sort runs before the projection so that it does not rely on a column
# ("probability") that the select drops.
predictions.filter(predictions['prediction'] == 0) \
    .orderBy("probability", ascending=False) \
    .select("filtered", "label", "labelIndex", "prediction") \
    .show(n=10, truncate=30)