## Toxic Comments Classification Using Hashing TF-IDF and Word2Vec (PySpark)

### Setting Up

In [50]:
!pip install pyspark -q

[0m

In [51]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, when, count
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Word2Vec
from pyspark.ml.classification import LogisticRegression
from tqdm import tqdm
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from sklearn.metrics import roc_auc_score, accuracy_score

import pyspark.sql.functions as f
import numpy as np
import pandas as pd

In [52]:
# Create (or reuse) the Spark session for this notebook: local master with one
# worker thread, plus explicit memory settings.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("Toxic Comments Classification - Big Data HomeWork Task")
    .config("spark.executor.memory", "70g")
    .config("spark.driver.memory", "50g")
    .config("spark.memory.offHeap.enabled", True)
    .config("spark.memory.offHeap.size", "16g")
    .getOrCreate()
)

# Keep Spark's console logging down to errors only.
spark.sparkContext.setLogLevel('ERROR')
spark.version

'3.3.1'

In [53]:
# Single seed for the notebook: applied to NumPy here and passed explicitly to
# the Spark steps below that take a `seed` argument.
seed = 42
np.random.seed(seed)

### Loading and Preprocessing data

In [54]:
# Location of the training CSV (Kaggle input mount).
path_to_data = '/kaggle/input/toxic-comments-data/train.csv'

In [55]:
# Read the CSV with pandas and preview the first rows.
data_pandas = pd.read_csv(path_to_data)
data_pandas.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [56]:
# Total number of missing cells across all columns of the frame.
print(f'Null values in train dataset : {data_pandas.isna().values.sum()}')

Null values in train dataset : 0


Конвертируем pandas DataFrame в Spark DataFrame

In [57]:
# Convert the pandas frame into a Spark DataFrame.
df = spark.createDataFrame(data_pandas) 

In [58]:
# Drop the pandas copy; the cells below work with the Spark DataFrame only.
del data_pandas

In [59]:
# Preview the first five rows of the Spark DataFrame.
df.show(5)

[Stage 1751:>                                                       (0 + 1) / 1]

+----------------+--------------------+-----+------------+-------+------+------+-------------+
|              id|        comment_text|toxic|severe_toxic|obscene|threat|insult|identity_hate|
+----------------+--------------------+-----+------------+-------+------+------+-------------+
|0000997932d777bf|Explanation\nWhy ...|    0|           0|      0|     0|     0|            0|
|000103f0d9cfb60f|D'aww! He matches...|    0|           0|      0|     0|     0|            0|
|000113f07ec002fd|Hey man, I'm real...|    0|           0|      0|     0|     0|            0|
|0001b41b1c6bb37e|"\nMore\nI can't ...|    0|           0|      0|     0|     0|            0|
|0001d958c54c6e35|You, sir, are my ...|    0|           0|      0|     0|     0|            0|
+----------------+--------------------+-----+------------+-------+------+------+-------------+
only showing top 5 rows



                                                                                

Разделим задачу многометочной (multi-label) классификации на отдельные задачи бинарной классификации — по одной на каждую метку

In [60]:
# Target columns; each one is treated below as a separate binary label.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

Разделим выборку на тренировочную и тестовую для оценки получившейся модели 

In [61]:
def train_test_split(df, test_size):
    """Randomly split a Spark DataFrame into a train part and a test part.

    Args:
        df: DataFrame-like object exposing ``randomSplit(weights, seed=...)``.
        test_size: Fraction of rows assigned to the test part. Must satisfy
            ``0 <= test_size < 1``.

    Returns:
        Whatever ``df.randomSplit`` returns for the weights
        ``[1 - test_size, test_size]``: the train part first, then the test part.

    Raises:
        ValueError: If ``test_size`` is outside ``[0, 1)``. An invalid value
            previously produced ``None`` (or was passed through when negative),
            which only surfaced later as an unrelated-looking error.
    """
    if not 0 <= test_size < 1:
        raise ValueError(
            f"test_size must satisfy 0 <= test_size < 1, got {test_size!r}"
        )
    # `seed` is the notebook-level constant, so the split is reproducible.
    return df.randomSplit([1 - test_size, test_size], seed=seed)

In [62]:
# Hold out 25% of the rows for evaluation.
train_df, test_df = train_test_split(df, 0.25)

Приведём тексты к нижнему регистру, а затем токенизируем корпус для дальнейшей работы с токенами

In [63]:
# Lower-case the comment text in both splits before tokenizing.
train_df, test_df = (
    split.withColumn("comment_text", f.lower(f.col("comment_text")))
    for split in (train_df, test_df)
)

In [64]:
# Split each comment into a list of tokens stored in the `words` column.
# The same tokenizer instance is reused on the test split further down.
# NOTE: judging by the previews below (e.g. "d'aww!,"), punctuation stays
# attached to the tokens.
tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")
words_data = tokenizer.transform(train_df)

### Hashing TF-IDF 

Посчитаем признаки TF-IDF для получившихся токенов

In [65]:
# Hash the token lists into term-frequency vectors (`rawFeatures`).
# No feature count is passed, so the library default applies; the preview
# below shows vectors of size 262144.
hashing_TF = HashingTF(inputCol="words", outputCol="rawFeatures")
feature_data = hashing_TF.transform(words_data)

In [66]:
# Despite the name, this is the *unfitted* IDF estimator; the fitted model is
# stored in `idf` by the next cell.
idf_model = IDF(inputCol="rawFeatures", outputCol="features")

In [67]:
# Fit the IDF weights on the training term frequencies only, then apply them
# to produce the `features` column used by the TF-IDF classifiers.
idf = idf_model.fit(feature_data) 
tf_idf = idf.transform(feature_data)

                                                                                

In [68]:
# Preview the training rows with their raw and IDF-weighted feature vectors.
tf_idf.show(5)

+----------------+--------------------+-----+------------+-------+------+------+-------------+--------------------+--------------------+--------------------+
|              id|        comment_text|toxic|severe_toxic|obscene|threat|insult|identity_hate|               words|         rawFeatures|            features|
+----------------+--------------------+-----+------------+-------+------+------+-------------+--------------------+--------------------+--------------------+
|0000997932d777bf|explanation\nwhy ...|    0|           0|      0|     0|     0|            0|[explanation, why...|(262144,[6240,722...|(262144,[6240,722...|
|000103f0d9cfb60f|d'aww! he matches...|    0|           0|      0|     0|     0|            0|[d'aww!, he, matc...|(262144,[2195,471...|(262144,[2195,471...|
|00013fa6fb6ef643|wehwalt, ftr, i'm...|    0|           0|      0|     0|     0|            0|[wehwalt,, ftr,, ...|(262144,[18700,27...|(262144,[18700,27...|
|0001b41b1c6bb37e|"\nmore\ni can't ...|    0|       

                                                                                

### Word2Vec

In [69]:
# Unfitted Word2Vec estimator: 30-dimensional vectors, minCount=0, seeded for
# reproducibility. Output goes to the `Word2VecRawFeatures` column.
word2vec_model = Word2Vec(vectorSize=30, minCount=0, inputCol="words", outputCol="Word2VecRawFeatures", seed=seed)

In [70]:
# Train the embeddings on the training tokens only, then vectorize the same
# training rows; `word2vec_fitted` is reused for the test split further down.
word2vec_fitted = word2vec_model.fit(words_data)
word2vec = word2vec_fitted.transform(words_data)

                                                                                

In [71]:
# Preview the training rows with their Word2Vec feature vectors.
word2vec.show(5)

[Stage 1759:>                                                       (0 + 1) / 1]

+----------------+--------------------+-----+------------+-------+------+------+-------------+--------------------+--------------------+
|              id|        comment_text|toxic|severe_toxic|obscene|threat|insult|identity_hate|               words| Word2VecRawFeatures|
+----------------+--------------------+-----+------------+-------+------+------+-------------+--------------------+--------------------+
|0000997932d777bf|explanation\nwhy ...|    0|           0|      0|     0|     0|            0|[explanation, why...|[-0.1475098898829...|
|000103f0d9cfb60f|d'aww! he matches...|    0|           0|      0|     0|     0|            0|[d'aww!, he, matc...|[0.00161752963645...|
|00013fa6fb6ef643|wehwalt, ftr, i'm...|    0|           0|      0|     0|     0|            0|[wehwalt,, ftr,, ...|[-0.1532214559418...|
|0001b41b1c6bb37e|"\nmore\ni can't ...|    0|           0|      0|     0|     0|            0|[", more, i, can'...|[-0.1871844218278...|
|0001d958c54c6e35|you, sir, are my ...|  

                                                                                

### Fit & Predict 

Для каждого лейбла обучим линейный классификатор — логистическую регрессию с регуляризацией. Таким образом, для каждого из двух наборов признаков (`TF-IDF` и `Word2Vec`) получится по 6 моделей. После обучения получим предсказания каждой модели на тестовой выборке

In [72]:
def get_predict_log_reg(features_col, label, df_to_fit, df_to_test, C=0.01):
    """Fit a logistic regression on one DataFrame and score another.

    Args:
        features_col: Name of the feature-vector column.
        label: Name of the label column.
        df_to_fit: DataFrame the model is fitted on.
        df_to_test: DataFrame the fitted model is applied to.
        C: Forwarded unchanged as the estimator's ``regParam``.

    Returns:
        The output of the fitted model's ``transform`` on ``df_to_test``.
    """
    estimator = LogisticRegression(
        featuresCol=features_col,
        labelCol=label,
        regParam=C,
    )
    return estimator.fit(df_to_fit).transform(df_to_test)

In [73]:
# Push the test split through the same tokenizer, hashing transformer and the
# IDF model fitted on the training data (nothing is re-fitted here).
test_tokenized = tokenizer.transform(test_df)
test_tf = hashing_TF.transform(test_tokenized)
test_tf_idf = idf.transform(test_tf)

In [74]:
# Vectorize the test tokens with the Word2Vec model fitted on the training data.
test_word2vec = word2vec_fitted.transform(test_tokenized)

In [75]:
# Test-set predictions per label, one dict per feature representation.
log_reg_predictions_tf_idf = {}
log_reg_predictions_word2vec = {}

# (destination dict, feature column, training frame, test frame)
_feature_runs = (
    (log_reg_predictions_tf_idf, 'features', tf_idf, test_tf_idf),
    (log_reg_predictions_word2vec, 'Word2VecRawFeatures', word2vec, test_word2vec),
)

# For every label: fit on TF-IDF features first, then on Word2Vec features.
for label in labels:
    for _store, _features_col, _fit_df, _eval_df in _feature_runs:
        _store[label] = get_predict_log_reg(_features_col, label, _fit_df, _eval_df)

                                                                                

### Evaluation

Посчитаем метрики `accuracy` и `ROC AUC` для моделей логистической регрессии на тестовой выборке при разных подходах - `TF-IDF` и `Word2Vec`

In [79]:
def calc_roc_auc(y_true, y_pred, label):
    """Compute ROC AUC from two separately collected DataFrames.

    Args:
        y_true: DataFrame whose rows expose the true label under ``label``.
        y_pred: DataFrame whose rows expose a ``probability`` vector; the
            element at index 1 is used as the score.
        label: Name of the label column in ``y_true``.

    Returns:
        The value returned by ``roc_auc_score`` for the collected labels and
        scores. Rows are paired by position, so both inputs must yield their
        rows in the same order.
    """
    true_labels = []
    for true_row in y_true.collect():
        true_labels.append(true_row[label])

    positive_scores = []
    for pred_row in y_pred.collect():
        positive_scores.append(pred_row.probability[1])

    return roc_auc_score(true_labels, positive_scores)

In [80]:
def calc_accuracy(y_true, y_pred, label):
    """Compute accuracy from two separately collected DataFrames.

    Args:
        y_true: DataFrame whose rows expose the true label under ``label``.
        y_pred: DataFrame whose rows expose a ``prediction`` attribute.
        label: Name of the label column in ``y_true``.

    Returns:
        The value returned by ``accuracy_score`` for the collected labels and
        predictions. Rows are paired by position, so both inputs must yield
        their rows in the same order.
    """
    true_labels = []
    for true_row in y_true.collect():
        true_labels.append(true_row[label])

    predicted_labels = []
    for pred_row in y_pred.collect():
        predicted_labels.append(pred_row.prediction)

    return accuracy_score(true_labels, predicted_labels)

In [81]:
def _score_predictions(predictions, label):
    """Return ``(accuracy, roc_auc)`` for one label's prediction DataFrame.

    The true label, the hard prediction and the probability vector are read
    from the *same* rows in a single ``collect``. The model's ``transform``
    output keeps the input columns, so the label column is available on the
    prediction frame itself. This avoids pairing two independently collected
    DataFrames by position, whose row order Spark does not guarantee to match.
    """
    rows = predictions.select(label, "prediction", "probability").collect()
    true_labels = [row[label] for row in rows]
    predicted_labels = [row.prediction for row in rows]
    # probability[1] is the score of the positive class.
    positive_scores = [row.probability[1] for row in rows]
    return (
        accuracy_score(true_labels, predicted_labels),
        roc_auc_score(true_labels, positive_scores),
    )


# Column-oriented accumulator; turned into a pandas DataFrame in the next cell.
result_metrics = {
    'Label' : [],
    'Accuracy': [],
    'ROC_AUC' : [],
    'Method' : []
}

# One pass per feature representation instead of two copy-pasted loops.
for _method, _predictions_by_label in (
    ('TF-IDF', log_reg_predictions_tf_idf),
    ('Word2Vec', log_reg_predictions_word2vec),
):
    for label, pred in _predictions_by_label.items():
        _accuracy, _roc_auc = _score_predictions(pred, label)
        result_metrics['Label'].append(label)
        result_metrics['Accuracy'].append(_accuracy)
        result_metrics['ROC_AUC'].append(_roc_auc)
        result_metrics['Method'].append(_method)

                                                                                

In [82]:
# Rebinds `result_metrics` from the dict of lists to a pandas DataFrame; the
# summary cell below relies on the DataFrame form.
result_metrics = pd.DataFrame(result_metrics)
result_metrics

Unnamed: 0,Label,Accuracy,ROC_AUC,Method
0,toxic,0.92706,0.883526,TF-IDF
1,severe_toxic,0.989573,0.885047,TF-IDF
2,obscene,0.956741,0.889679,TF-IDF
3,threat,0.996599,0.906876,TF-IDF
4,insult,0.956591,0.887838,TF-IDF
5,identity_hate,0.990798,0.847486,TF-IDF
6,toxic,0.936762,0.926098,Word2Vec
7,severe_toxic,0.990548,0.972442,Word2Vec
8,obscene,0.963918,0.939849,Word2Vec
9,threat,0.997199,0.926156,Word2Vec


In [83]:
# Mean of each metric over the labels, reported separately per feature method.
for _metric_column, _metric_title in (("Accuracy", "Accuracy"), ("ROC_AUC", "ROC AUC")):
    for _method in ("TF-IDF", "Word2Vec"):
        _method_rows = result_metrics[result_metrics.Method == _method]
        _mean_value = _method_rows[_metric_column].values.mean()
        print(f'Average {_method} {_metric_title} : {_mean_value}')

Average TF-IDF Accuracy : 0.9695605787824232
Average Word2Vec Accuracy : 0.9734155164366206
Average TF-IDF ROC AUC : 0.8834086644815599
Average Word2Vec ROC AUC : 0.9401407143597837


Как видим, по обеим метрикам подход `Word2Vec` превосходит `TF-IDF`: средний `ROC AUC` — 0.940 против 0.883, средняя `accuracy` — 0.973 против 0.970. Основной выигрыш виден по `ROC AUC`; по `accuracy` разница невелика