In [5]:
!gdown 1OS2Aurb2oKmooCeYwYw0cnp542t7dyHo
!gdown 1Oibb4sFzJNLt0iBWOD4LAIKaWT7cCNT1

Downloading...
From: https://drive.google.com/uc?id=1OS2Aurb2oKmooCeYwYw0cnp542t7dyHo
To: /content/test_data.csv
100% 565k/565k [00:00<00:00, 63.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Oibb4sFzJNLt0iBWOD4LAIKaWT7cCNT1
To: /content/train_data.csv
100% 2.24M/2.24M [00:00<00:00, 154MB/s]


In [2]:
import numpy as np
import pandas as pd
import gensim
import gensim.downloader as api
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

In [3]:
train_df = pd.read_csv("train_data.csv")
X_train = train_df.drop(columns=['target'], axis=1)
y_train = train_df['target'].to_numpy()

In [6]:
test_df = pd.read_csv("test_data.csv")
X_test = test_df.drop(columns=['target'], axis=1)
y_test = test_df['target'].to_numpy()

# Fasttext

In [7]:
path = "/content/drive/MyDrive/NLP/Stress Detection/fasttext-wiki-news-subwords-300.gz"

In [64]:
model_fasttext = gensim.models.KeyedVectors.load_word2vec_format(path)

In [65]:
voc = set(model_fasttext.index_to_key)
def document_vector(doc):
    doc = doc.split()
    doc = [word for word in doc if word in voc]
    return np.mean(model_fasttext[doc], axis=0)

In [66]:
X_train_fasttext = []
for review in X_train['clean_text']:
    X_train_fasttext.append(document_vector(review))

In [67]:
X_test_fasttext = []
for review in X_test['clean_text']:
    X_test_fasttext.append(document_vector(review))

In [68]:
np.array(X_train_fasttext).shape

(2842, 300)

In [69]:
X_train_fasttext = np.concatenate((X_train.drop(['clean_text'], axis=1).to_numpy(), np.array(X_train_fasttext)), axis=1)

In [70]:
X_test_fasttext = np.concatenate((X_test.drop(['clean_text'], axis=1).to_numpy(), np.array(X_test_fasttext)), axis=1)

In [71]:
X_test_fasttext.shape

(711, 409)

# Training and Testing

In [72]:
def training_and_prediction(training_model, X_train, X_test, y_train, y_test):
  model = training_model
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1score = f1_score(y_test, y_pred)
  # print("Accuracy =", accuracy)
  # print("Confusion Matrix =\n", confusion_matrix(y_test, y_pred))
  # print(classification_report(y_test, y_pred))
  return {'Accuracy' : accuracy, 'Precision': precision, 'Recall' : recall, 'F1-Score' : f1score}

In [73]:
def get_report(y_test, y_pred):
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1score = f1_score(y_test, y_pred)
  return {'Accuracy' : accuracy, 'Precision': precision, 'Recall' : recall, 'F1-Score' : f1score}

## Naive Bayes

In [74]:
scaler = MinMaxScaler()
training_and_prediction(MultinomialNB(), scaler.fit_transform(X_train_fasttext), scaler.fit_transform(X_test_fasttext), y_train, y_test)

{'Accuracy': 0.7257383966244726,
 'Precision': 0.7049180327868853,
 'Recall': 0.7478260869565218,
 'F1-Score': 0.7257383966244725}

In [75]:
training_and_prediction(GaussianNB(), X_train_fasttext, X_test_fasttext, y_train, y_test)

{'Accuracy': 0.7116736990154712,
 'Precision': 0.6682692307692307,
 'Recall': 0.8057971014492754,
 'F1-Score': 0.7306176084099869}

In [76]:
training_and_prediction(BernoulliNB(), X_train_fasttext, X_test_fasttext, y_train, y_test)

{'Accuracy': 0.7130801687763713,
 'Precision': 0.6910569105691057,
 'Recall': 0.7391304347826086,
 'F1-Score': 0.7142857142857142}

## SVM

In [77]:
training_and_prediction(SVC(C=15), X_train_fasttext, X_test_fasttext, y_train, y_test)

{'Accuracy': 0.7580872011251758,
 'Precision': 0.7319034852546917,
 'Recall': 0.7913043478260869,
 'F1-Score': 0.7604456824512534}

In [78]:
training_and_prediction(SVC(kernel='poly', C=2), X_train_fasttext, X_test_fasttext, y_train, y_test)

{'Accuracy': 0.7623066104078763,
 'Precision': 0.7328042328042328,
 'Recall': 0.8028985507246377,
 'F1-Score': 0.7662517289073306}

## Random Forest

In [79]:
training_and_prediction(RandomForestClassifier(min_samples_split=10), X_train_fasttext, X_test_fasttext, y_train, y_test)

{'Accuracy': 0.7426160337552743,
 'Precision': 0.725,
 'Recall': 0.7565217391304347,
 'F1-Score': 0.7404255319148936}

## Logistic Regression

In [80]:
training_and_prediction(LogisticRegression(max_iter=5000, solver='saga'), X_train_fasttext, X_test_fasttext, y_train, y_test)

{'Accuracy': 0.7651195499296765,
 'Precision': 0.7486033519553073,
 'Recall': 0.7768115942028986,
 'F1-Score': 0.7624466571834992}

## Gradient Boosting

In [81]:
training_and_prediction(XGBClassifier(), X_train_fasttext, X_test_fasttext, y_train, y_test)

{'Accuracy': 0.7510548523206751,
 'Precision': 0.7320441988950276,
 'Recall': 0.7681159420289855,
 'F1-Score': 0.7496463932107497}

## Deep Learning Techniques

In [82]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GlobalAveragePooling1D, SpatialDropout1D, Bidirectional, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

### ANN

In [103]:
def ann_models(num_neurons_layer1, num_neurons_layer2, dropout_rate=0):
    model_dense = Sequential()
    model_dense.add(Dense(num_neurons_layer1, activation='relu', input_shape=(X_train_fasttext.shape[1],)))
    model_dense.add(Dropout(dropout_rate))
    model_dense.add(Dense(num_neurons_layer2, activation='relu'))
    model_dense.add(Dense(1, activation='sigmoid'))
    model_dense.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model_dense.fit(X_train_fasttext, y_train, epochs=100, batch_size=32, verbose=0, callbacks=[early_stop], validation_data=(X_test_fasttext, y_test))
    y_pred_dense = model_dense.predict(X_test_fasttext)
    y_pred_dense = (y_pred_dense > 0.5).astype(int)
    return [{'num_neurons_layer1': num_neurons_layer1, 'num_neurons_layer2': num_neurons_layer2, 'dropout_rate': dropout_rate}, get_report(y_test, y_pred_dense)]


In [104]:
for num_neurons_layer1 in [256, 128, 64]:
  for num_neurons_layer2 in [128, 64, 32]:
    for dropout_rate in [0.1, 0.2, 0.3, 0.4, 0.5]:
      print(ann_models(num_neurons_layer1, num_neurons_layer2, dropout_rate))

[{'num_neurons_layer1': 256, 'num_neurons_layer2': 128, 'dropout_rate': 0.1}, {'Accuracy': 0.7623066104078763, 'Precision': 0.7365591397849462, 'Recall': 0.7942028985507247, 'F1-Score': 0.7642956764295677}]
[{'num_neurons_layer1': 256, 'num_neurons_layer2': 128, 'dropout_rate': 0.2}, {'Accuracy': 0.7552742616033755, 'Precision': 0.7567567567567568, 'Recall': 0.7304347826086957, 'F1-Score': 0.7433628318584071}]
[{'num_neurons_layer1': 256, 'num_neurons_layer2': 128, 'dropout_rate': 0.3}, {'Accuracy': 0.759493670886076, 'Precision': 0.7558823529411764, 'Recall': 0.744927536231884, 'F1-Score': 0.7503649635036497}]
[{'num_neurons_layer1': 256, 'num_neurons_layer2': 128, 'dropout_rate': 0.4}, {'Accuracy': 0.7637130801687764, 'Precision': 0.7286821705426356, 'Recall': 0.8173913043478261, 'F1-Score': 0.7704918032786885}]
[{'num_neurons_layer1': 256, 'num_neurons_layer2': 128, 'dropout_rate': 0.5}, {'Accuracy': 0.7693389592123769, 'Precision': 0.7535014005602241, 'Recall': 0.7797101449275362, 

#Based on 80% confidence

In [86]:
reduced_train_df = train_df[train_df['confidence'] >= 0.8]

In [87]:
reduced_test_df = test_df[test_df['confidence'] >= 0.8]

In [88]:
reduced_train_df = reduced_train_df.reset_index(drop=True)
reduced_test_df = reduced_test_df.reset_index(drop=True)

In [89]:
print(reduced_train_df.shape, reduced_test_df.shape)

(1832, 111) (465, 111)


In [90]:
reduced_X_train = reduced_train_df.drop(columns=['target'], axis=1)
reduced_y_train = reduced_train_df['target'].to_numpy()
reduced_X_test = reduced_test_df.drop(columns=['target'], axis=1)
reduced_y_test = reduced_test_df['target'].to_numpy()

In [91]:
reduced_X_train_fasttext = []
for review in reduced_X_train['clean_text']:
    reduced_X_train_fasttext.append(document_vector(review))

In [92]:
reduced_X_test_fasttext = []
for review in reduced_X_test['clean_text']:
    reduced_X_test_fasttext.append(document_vector(review))

In [93]:
np.array(reduced_X_train_fasttext).shape

(1832, 300)

In [94]:
reduced_X_train_fasttext = np.concatenate((reduced_X_train.drop(['clean_text'], axis=1).to_numpy(), np.array(reduced_X_train_fasttext)), axis=1)

In [95]:
reduced_X_test_fasttext = np.concatenate((reduced_X_test.drop(['clean_text'], axis=1).to_numpy(), np.array(reduced_X_test_fasttext)), axis=1)

In [96]:
np.array(reduced_X_train_fasttext).shape

(1832, 409)

In [97]:
np.array(reduced_X_test_fasttext).shape

(465, 409)

##SVM

In [98]:
training_and_prediction(SVC(C=15), reduced_X_train_fasttext, reduced_X_test_fasttext, reduced_y_train, reduced_y_test)

{'Accuracy': 0.8064516129032258,
 'Precision': 0.7952755905511811,
 'Recall': 0.8416666666666667,
 'F1-Score': 0.8178137651821863}

##Logistic Regression

In [99]:
training_and_prediction(LogisticRegression(max_iter=10000, solver='saga'), reduced_X_train_fasttext, reduced_X_test_fasttext, reduced_y_train, reduced_y_test)

{'Accuracy': 0.8043010752688172,
 'Precision': 0.796812749003984,
 'Recall': 0.8333333333333334,
 'F1-Score': 0.814663951120163}

### ANN

In [105]:
def ann_models_80(num_neurons_layer1, num_neurons_layer2, dropout_rate=0):
    model_dense = Sequential()
    model_dense.add(Dense(num_neurons_layer1, activation='relu', input_shape=(reduced_X_train_fasttext.shape[1],)))
    model_dense.add(Dropout(dropout_rate))
    model_dense.add(Dense(num_neurons_layer2, activation='relu'))
    model_dense.add(Dense(1, activation='sigmoid'))
    model_dense.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model_dense.fit(reduced_X_train_fasttext, reduced_y_train, epochs=100, batch_size=32, verbose=0, callbacks=[early_stop], validation_data=(reduced_X_test_fasttext, reduced_y_test))
    y_pred_dense = model_dense.predict(reduced_X_test_fasttext)
    y_pred_dense = (y_pred_dense > 0.5).astype(int)
    return [{'num_neurons_layer1': num_neurons_layer1, 'num_neurons_layer2': num_neurons_layer2, 'dropout_rate': dropout_rate}, get_report(reduced_y_test, y_pred_dense)]


In [106]:
for num_neurons_layer1 in [256, 128, 64]:
  for num_neurons_layer2 in [128, 64, 32]:
    for dropout_rate in [0.0, 0.1, 0.2, 0.3, 0.4]:
      print(ann_models_80(num_neurons_layer1, num_neurons_layer2, dropout_rate))

[{'num_neurons_layer1': 256, 'num_neurons_layer2': 128, 'dropout_rate': 0.0}, {'Accuracy': 0.8021505376344086, 'Precision': 0.8008130081300813, 'Recall': 0.8208333333333333, 'F1-Score': 0.8106995884773662}]
[{'num_neurons_layer1': 256, 'num_neurons_layer2': 128, 'dropout_rate': 0.1}, {'Accuracy': 0.7913978494623656, 'Precision': 0.7848605577689243, 'Recall': 0.8208333333333333, 'F1-Score': 0.8024439918533605}]
[{'num_neurons_layer1': 256, 'num_neurons_layer2': 128, 'dropout_rate': 0.2}, {'Accuracy': 0.7978494623655914, 'Precision': 0.7991803278688525, 'Recall': 0.8125, 'F1-Score': 0.8057851239669422}]
[{'num_neurons_layer1': 256, 'num_neurons_layer2': 128, 'dropout_rate': 0.3}, {'Accuracy': 0.8086021505376344, 'Precision': 0.8007968127490039, 'Recall': 0.8375, 'F1-Score': 0.8187372708757636}]
[{'num_neurons_layer1': 256, 'num_neurons_layer2': 128, 'dropout_rate': 0.4}, {'Accuracy': 0.7849462365591398, 'Precision': 0.782258064516129, 'Recall': 0.8083333333333333, 'F1-Score': 0.795081967