In [45]:
import pandas as pd 
import glob
import os 
import sys
from pyspark.ml.feature import Tokenizer, StopWordsRemover,StringIndexer,IDF, CountVectorizer
from pyspark.sql.functions import col,udf, when, isnan, count, length, lower
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, LinearSVC
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator




In [46]:
os.environ['PYSPARK_PYTHON']=sys.executable
os.environ['PYSPARK_DRIVER_PYTHON']=sys.executable

In [47]:
spark= SparkSession.builder.master('local[*]').appName("NLP").getOrCreate()
sc= spark.sparkContext

In [48]:
path=r'C:\Users\terli\Programs\Python\Projects\Advanced Business Analytics\Twitchchat\Chatlogs'

In [49]:
all_files= glob.glob(os.path.join(path,"*.csv"))
df=(pd.read_csv(f) for f in all_files)
all_chatlogs=pd.concat(df,ignore_index=True)

##Keeping only Xqc and amondgold chanels
all_chatlogs=all_chatlogs[all_chatlogs['channel'].str.contains('#xqcow|#asmongold')]


print(all_chatlogs['channel'].unique())
all_chatlogs.to_csv(r"C:\Users\terli\Programs\Python\Projects\Advanced Business Analytics\Twitchchat\all_chatlogs.csv", index=False,header=1)

['#xqcow' '#asmongold']


In [50]:
# chat= (spark.read.option("header",True).csv(r'C:\Users\terli\Programs\Python\Projects\Advanced Business Analytics\Twitchchat\all_chatlogs.csv')).sample(withReplacement=False,fraction=0.2,seed=7).withColumn('message',lower(col('message')))
chat= (spark.read.option("header",True).csv(r'C:\Users\terli\Programs\Python\Projects\Advanced Business Analytics\Twitchchat\all_chatlogs.csv')).withColumn('message',lower(col('message')))
chat=chat.na.drop()

In [51]:
labelEncoder=StringIndexer(inputCol='channel',outputCol='label').fit(chat)
labelEncoder.labels
chat=labelEncoder.transform(chat)
chat_vis=chat.sample(withReplacement=False,fraction=0.2,seed=7)

In [52]:
logs,test_logs=chat.randomSplit([0.7,0.3])
test_logs,val_logs=chat.randomSplit([0.2,0.1])

In [53]:
logs.select([count(when(isnan(c)| col(c).isNull(),c)).alias(c) for c in logs.columns]).show

<bound method DataFrame.show of DataFrame[datetime: bigint, username: bigint, channel: bigint, message: bigint, label: bigint]>

In [54]:
logs.show(10)

+--------------------+------------------+-------+--------------------+-----+
|            datetime|          username|channel|             message|label|
+--------------------+------------------+-------+--------------------+-----+
|2022-04-20T11:17:...|       thepepexdvv| #xqcow|"uhm not i litera...|  0.0|
|2022-04-20T11:17:...|         faramarz7| #xqcow|                lulw|  0.0|
|2022-04-20T11:17:...|           dc89427| #xqcow|gigachad i would:...|  0.0|
|2022-04-20T11:17:...|          accxrsed| #xqcow|u got boomer reac...|  0.0|
|2022-04-20T11:17:...|            rendoa| #xqcow|i wouldnt :):toos...|  0.0|
|2022-04-20T11:17:...|     masaru_kato15| #xqcow|how can you not r...|  0.0|
|2022-04-20T11:17:...|           versaw0| #xqcow|i have meningitis...|  0.0|
|2022-04-20T11:17:...|         nulle_dud| #xqcow|peepoglad 🌹 for ...|  0.0|
|2022-04-20T11:17:...|777777777777777464| #xqcow|  wutface 1 man spam|  0.0|
|2022-04-20T11:17:...|           iemeigh| #xqcow|elisdancy ✨ 󠀀:sc...|  0.0|
+

In [55]:
logs.count()

713924

In [56]:
logs.schema['label'].dataType

DoubleType

In [57]:
logs.agg({'label':'sum'}).show()

+----------+
|sum(label)|
+----------+
|  102762.0|
+----------+



In [58]:
chat_length=chat.withColumn('length', length('message'))
chat_length.show()

+--------------------+-------------+-------+--------------------+-----+------+
|            datetime|     username|channel|             message|label|length|
+--------------------+-------------+-------+--------------------+-----+------+
|2022-04-20T11:19:...|   depressy20| #xqcow|             lulw 💦|  0.0|     6|
|2022-04-20T11:19:...|      zynderz| #xqcow|omegalul  󠀀 󠀀 ?...|  0.0|    19|
|2022-04-20T11:19:...|     matthulm| #xqcow|                lulw|  0.0|     4|
|2022-04-20T11:19:...|      sez_you| #xqcow|lulw farm em x:xv...|  0.0|    76|
|2022-04-20T11:19:...|    nulle_dud| #xqcow|lulw 💦 farm it x...|  0.0|    78|
|2022-04-20T11:19:...| foxy_winters| #xqcow|                lulw|  0.0|     4|
|2022-04-20T11:19:...|       xuuaa_| #xqcow|                lulw|  0.0|     4|
|2022-04-20T11:19:...|      nakazzy| #xqcow|    what are u doing|  0.0|    16|
|2022-04-20T11:19:...|       reyoen| #xqcow|lul nice r:dexter...|  0.0|   167|
|2022-04-20T11:19:...|      svk_ron| #xqcow|      farm e

In [59]:

tokenizer= Tokenizer(inputCol="message",outputCol="words")
stopwords_remover= StopWordsRemover(inputCol='words',outputCol='filtered_words')
vectorizer= CountVectorizer(inputCol='filtered_words',outputCol='rawfeatures')
idf=IDF(inputCol='rawfeatures',outputCol="vectorisedFeatures")





In [60]:
#Visualise The different transformations 

# chat_tokenized=tokenizer.transform(chat_vis)
# chat_tokenized[['message','words',]].show(5,truncate=100,vertical=True)
# chat_stopwords=stopwords_remover.transform(chat_tokenized)
# chat_stopwords[['message','words','filtered_words']].show(5,truncate=100,vertical=True)

# model=vectorizer.fit(chat_stopwords)
# chat_vector=vectorizer.fit(chat_stopwords).transform(chat_stopwords)
# chat_vector[['filtered_words','rawfeatures']].show(5, truncate=100,vertical=True)
# chat_idf=idf.fit(chat_vector).transform(chat_vector)
# chat_idf[['filtered_words','vectorisedFeatures']].show(5, truncate=100,vertical=True)




In [61]:
logistic=LogisticRegression(featuresCol='vectorisedFeatures',labelCol='label')
lr=Pipeline(stages=[tokenizer,stopwords_remover,vectorizer,idf,logistic])
lr_model=lr.fit(logs)
predictions_lr=lr_model.transform(test_logs)
predictions_lr.select('rawPrediction','probability','prediction','channel','label').show(10)


+--------------------+--------------------+----------+-------+-----+
|       rawPrediction|         probability|prediction|channel|label|
+--------------------+--------------------+----------+-------+-----+
|[9.78685079321644...|[0.99994381759774...|       0.0| #xqcow|  0.0|
|[2.74411663590631...|[0.93958021827579...|       0.0| #xqcow|  0.0|
|[12.2804073398411...|[0.99999535821692...|       0.0| #xqcow|  0.0|
|[12.1389301130886...|[0.99999465278910...|       0.0| #xqcow|  0.0|
|[12.7328471630578...|[0.99999704747586...|       0.0| #xqcow|  0.0|
|[10.8843224052302...|[0.99998125045654...|       0.0| #xqcow|  0.0|
|[3.51771098321722...|[0.97118752111062...|       0.0| #xqcow|  0.0|
|[8.83471243492502...|[0.99985443065906...|       0.0| #xqcow|  0.0|
|[30.6140542694567...|[0.99999999999994...|       0.0| #xqcow|  0.0|
|[4.45364532387266...|[0.98849776848577...|       0.0| #xqcow|  0.0|
+--------------------+--------------------+----------+-------+-----+
only showing top 10 rows



In [62]:
evaluator_accuracy=MulticlassClassificationEvaluator(labelCol='label',predictionCol='prediction',metricName='accuracy')
evaluator_precision=MulticlassClassificationEvaluator(labelCol='label',predictionCol='prediction',metricName='precisionByLabel')
evaluator_recall=MulticlassClassificationEvaluator(labelCol='label',predictionCol='prediction',metricName='recallByLabel')

In [63]:
accuracy_lr=evaluator_accuracy.evaluate(predictions_lr)
precision_lr=evaluator_precision.evaluate(predictions_lr)
recall_lr=evaluator_recall.evaluate(predictions_lr)

In [64]:
random=RandomForestClassifier(featuresCol='vectorisedFeatures',labelCol='label',numTrees=10)
rdf=Pipeline(stages=[tokenizer,stopwords_remover,vectorizer,idf,random])
rdf_model=rdf.fit(logs)
predictions_rdf=rdf_model.transform(test_logs)



In [65]:
LSVM=LinearSVC(featuresCol='vectorisedFeatures',labelCol='label',predictionCol='prediction',maxIter=50,regParam=0.0)
lsvc=Pipeline(stages=[tokenizer,stopwords_remover,vectorizer,idf,LSVM])
svm_model= lsvc.fit(logs)
predictions_svm=(svm_model.transform(test_logs))


In [66]:
evaluator_accuracy=MulticlassClassificationEvaluator(labelCol='label',predictionCol='prediction',metricName='accuracy')
accuracy_lr=evaluator_accuracy.evaluate(predictions_lr)
accuracy_rdf=evaluator_accuracy.evaluate(predictions_rdf)
accuracy_svm=evaluator_accuracy.evaluate(predictions_svm)


evaluator_precision=MulticlassClassificationEvaluator(labelCol='label',predictionCol='prediction',metricName='precisionByLabel')
precision_lr=evaluator_precision.evaluate(predictions_lr)
precision_rdf=evaluator_precision.evaluate(predictions_rdf)
precision_svm=evaluator_precision.evaluate(predictions_svm)


evaluator_recall=MulticlassClassificationEvaluator(labelCol='label',predictionCol='prediction',metricName='recallByLabel')
recall_lr=evaluator_recall.evaluate(predictions_lr)
recall_rdf=evaluator_recall.evaluate(predictions_rdf)
recall_svm=evaluator_recall.evaluate(predictions_svm)


evaluator_f1=MulticlassClassificationEvaluator(labelCol='label',predictionCol='prediction',metricName='f1')
f1_lr=evaluator_f1.evaluate(predictions_lr)
f1_rdf=evaluator_f1.evaluate(predictions_rdf)
f1_svm=evaluator_f1.evaluate(predictions_svm)


In [67]:
print('Logistic regression model')
print('Accuracy:',accuracy_lr)
print('Precsion:',precision_lr)
print('Recall:',recall_lr)
print('f1:',f1_lr)


Logistic regression model
Accuracy: 0.9231001157484009
Precsion: 0.9272392782456294
Recall: 0.9877044464252898
f1: 0.9150388295087861


In [68]:
print('Random forest model')
print('Accuracy:',accuracy_rdf)
print('Precsion:',precision_rdf)
print('Recall:',recall_rdf)
print('f1:',f1_rdf)


Random forest model
Accuracy: 0.8563366734174311
Precsion: 0.8563366734174311
Recall: 1.0
f1: 0.7900641179378708


In [69]:
print('SVM model')
print('Accuracy:',accuracy_svm)
print('Precsion:',precision_svm)
print('Recall:',recall_svm)
print('f1:',f1_svm)

SVM model
Accuracy: 0.9205204118701676
Precsion: 0.9213159788082935
Recall: 0.9918985715586104
f1: 0.9102280489788097


In [70]:
predictions_lr_val=lr_model.transform(val_logs)
f1_lr_val=evaluator_f1.evaluate(predictions_lr_val)
# accuracy_lr_val=evaluator_accuracy.evaluate(predictions_lr_val)
# recall_lr_val=evaluator_recall.evaluate(predictions_lr_val)
# precision_lr_val=evaluator_precision.evaluate(predictions_lr_val)

In [71]:
print('Lr model_val')
print('f1:',f1_lr_val)

Lr model_val
f1: 0.9149483879114753
