## Data Loading

In [1]:
from pyspark.sql.types import *
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
import time

In [2]:
twitter_path = './data/twitter.csv'
reddit_path = './data/reddit.csv'
mental_path = './data/mental_health.csv'

# schema = StructType([StructField('text', StringType(), True),
#                     StructField('label', IntegerType(), True)])

df_twitter = spark.read.csv(twitter_path, header=True, inferSchema=True)
df_reddit = spark.read.csv(reddit_path, header=True, inferSchema=True)
df_mental = spark.read.csv(mental_path, header=True, inferSchema=True)

                                                                                

In [3]:
df_twitter.printSchema()
df_reddit.printSchema()
df_mental.printSchema()

root
 |-- text: string (nullable = true)
 |-- label: integer (nullable = true)

root
 |-- text: string (nullable = true)
 |-- label: integer (nullable = true)

root
 |-- text: string (nullable = true)
 |-- label: integer (nullable = true)



In [4]:
df_twitter = df_twitter.withColumnRenamed("message to examine", "text")
df_twitter = df_twitter.withColumnRenamed("label (depression result)", "label")
df_twitter = df_twitter.withColumn("label", df_twitter["label"].cast(IntegerType()))
df_twitter = df_twitter.drop('Index')

df_reddit = df_reddit.withColumnRenamed("clean_text", "text")
df_reddit = df_reddit.withColumnRenamed("is_depression", "label")

In [5]:
df_twitter.show()
df_reddit.show()
df_mental.show()

+--------------------+-----+
|                text|label|
+--------------------+-----+
|just had a real g...|    0|
|is reading manga ...|    0|
|@comeagainjen htt...|    0|
|@lapcat Need to s...|    0|
|ADD ME ON MYSPACE...|    0|
|so sleepy. good t...|    0|
|@SilkCharm re: #n...|    0|
|23 or 24ï¿½C poss...|    0|
|nite twitterville...|    0|
|@daNanner Night, ...|    0|
|Good morning ever...|    0|
|Finally! I just c...|    0|
|kisha they cnt ge...|    0|
|@nicolerichie Yes...|    0|
|I really love ref...|    0|
|@blueaero ooo it'...|    0|
|@rokchic28 no pro...|    0|
|@shipovalov &quot...|    0|
|Once again stayed...|    0|
|@Kal_Penn I just ...|    0|
+--------------------+-----+
only showing top 20 rows

+--------------------+-----+
|                text|label|
+--------------------+-----+
|we understand tha...|    1|
|welcome to r depr...|    1|
|anyone else inste...|    1|
|i ve kind of stuf...|    1|
|sleep is my great...|    1|
|i m year old turn...|    1|
|i live alone and

## Data Preprocessing

In [6]:
print(df_twitter.toPandas()['text'].isnull().sum())
print(df_reddit.toPandas()['text'].isnull().sum())
print(df_mental.toPandas()['text'].isnull().sum())

print(df_twitter.toPandas()['label'].isnull().sum())
print(df_reddit.toPandas()['label'].isnull().sum())
print(df_mental.toPandas()['label'].isnull().sum())

0
0
0
0
0
0


In [7]:
df_twitter = df_twitter.dropna()

In [8]:
print(df_twitter.toPandas()['text'].isnull().sum())
print(df_reddit.toPandas()['text'].isnull().sum())
print(df_mental.toPandas()['text'].isnull().sum())

print(df_twitter.toPandas()['label'].isnull().sum())
print(df_reddit.toPandas()['label'].isnull().sum())
print(df_mental.toPandas()['label'].isnull().sum())

0
0
0
0
0
0


In [9]:
df = df_twitter.union(df_reddit).union(df_mental)

df.groupby('label').count().show()
print(df.count())

+-----+-----+
|label|count|
+-----+-----+
|    1|19972|
|    0|26039|
+-----+-----+

46011


In [10]:
trainDF, testDF = df.randomSplit([.8, .2], seed=42)
print(f"""There are {trainDF.count()} rows in the training set, and {testDF.count()} in the test set""") 

There are 36939 rows in the training set, and 9072 in the test set


In [11]:
tokenizer = Tokenizer(inputCol='text', outputCol='tokens')
stopwords_remover = StopWordsRemover(inputCol='tokens', outputCol='filtered_tokens',locale='en_US.utf8')
vectorizer = CountVectorizer(inputCol='filtered_tokens', outputCol='vectors')
idf = IDF(inputCol='vectors', outputCol='features')

## Feature Extraction

## Logistic Regression

In [12]:
lr = LogisticRegression(featuresCol='features', labelCol='label')

pipeline = Pipeline(stages=[tokenizer, stopwords_remover, vectorizer, idf, lr])

pipelineModel = pipeline.fit(trainDF)

IllegalArgumentException: StopWordsRemover_22242e963ed7 parameter locale given invalid value en_US.utf8.

In [None]:
predDF = pipelineModel.transform(testDF)

predDF.show(5)

In [None]:
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='accuracy')

accuracy = evaluator.evaluate(predDF)
print(f'Accuracy for logistic regression: {accuracy}')

In [None]:
# Save model
model_path = "../backend/models/lr_model"
pipelineModel.save(model_path)

In [None]:
# Load model
from pyspark.ml import PipelineModel

loaded_pipeline_model  = PipelineModel.load("../backend/models/lr_model")
df = pipelineModel.transform(df)

## Random Forest

In [None]:
rf = RandomForestClassifier(labelCol='label', maxBins=40, seed=42)

In [None]:
paramGrid = (ParamGridBuilder()
            .addGrid(rf.maxDepth, [2, 4, 6])
            .addGrid(rf.numTrees, [10, 100])
            .build())

In [None]:
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='accuracy')

In [None]:
cv = CrossValidator(estimator=rf,
                    evaluator=evaluator,
                    estimatorParamMaps=paramGrid,
                    numFolds=3,
                    parallelism=4,
                    seed=42)

pipeline = Pipeline(stages=[tokenizer, stopwords_remover, vectorizer, idf, rf])

In [None]:
start_time = time.time()

pipelineModel = pipeline.fit(trainDF)
print('Time spent:', time.time() - start_time)

In [None]:
predDF = pipelineModel.transform(testDF)
accuracy = evaluator.evaluate(predDF)
print(f'Accuracy for random forest: {accuracy}')