In [1]:
!gdown 1OS2Aurb2oKmooCeYwYw0cnp542t7dyHo
!gdown 1Oibb4sFzJNLt0iBWOD4LAIKaWT7cCNT1

Downloading...
From: https://drive.google.com/uc?id=1OS2Aurb2oKmooCeYwYw0cnp542t7dyHo
To: /content/test_data.csv
100% 565k/565k [00:00<00:00, 85.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Oibb4sFzJNLt0iBWOD4LAIKaWT7cCNT1
To: /content/train_data.csv
100% 2.24M/2.24M [00:00<00:00, 103MB/s]


In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler

In [3]:
train_df = pd.read_csv("train_data.csv")
X_train = train_df.drop(['target'], axis=1)
y_train = train_df['target'].to_numpy()

In [4]:
test_df = pd.read_csv("test_data.csv")
X_test = test_df.drop(['target'], axis=1)
y_test = test_df['target'].to_numpy()

In [5]:
X_train.head()

Unnamed: 0,confidence,social_karma,syntax_ari,lex_liwc_WC,lex_liwc_Analytic,lex_liwc_Clout,lex_liwc_Authentic,lex_liwc_Tone,lex_liwc_WPS,lex_liwc_Sixltr,...,lex_dal_min_activation,lex_dal_min_imagery,lex_dal_avg_activation,lex_dal_avg_imagery,lex_dal_avg_pleasantness,social_upvote_ratio,social_num_comments,syntax_fk_grade,sentiment,clean_text
0,1.0,3,5.311,89,63.33,10.84,98.92,3.83,17.8,12.36,...,1.2222,1.0,1.74515,1.42143,1.83704,0.58,27,6.376667,0.052071,reluctant start one really ashamed ask help on...
1,1.0,3,4.220972,108,19.76,11.45,95.96,1.0,15.43,12.96,...,1.1429,1.0,1.71501,1.57333,1.85046,1.0,0,5.59369,-0.022917,severe addict overdose young addictionalcoholi...
2,0.5,15,5.548481,79,12.35,44.98,69.44,1.0,15.8,16.46,...,1.1429,1.0,1.80983,1.48267,1.86705,0.91,5,6.703646,-0.46,feel sad feel sad pray get help desperately ne...
3,0.833333,18,4.352143,81,93.97,40.24,99.0,25.77,16.2,12.35,...,1.2,1.0,1.70892,1.84444,1.81327,0.85,11,5.150095,0.048636,sleep car month month back foot stay parking s...
4,0.833333,4,7.366712,135,40.97,72.32,75.74,1.0,27.0,12.59,...,1.1429,1.0,1.75929,1.58319,1.84228,1.0,5,7.778319,0.03125,speed try catch continue yell face girl turn a...


# Bag Of Words

In [None]:
X_train_bow = X_train.drop(['clean_text'], axis=1).to_numpy()
X_test_bow = X_test.drop(['clean_text'], axis=1).to_numpy()

In [None]:
cv = CountVectorizer(max_features = 5000)

In [None]:
cv.fit(pd.concat([X_train['clean_text'], X_test['clean_text']],ignore_index=True))

In [None]:
train_bow = cv.transform(X_train['clean_text']).toarray()

In [None]:
X_train_bow = np.concatenate((X_train_bow, train_bow), axis=1)

In [None]:
test_bow = cv.transform(X_test['clean_text']).toarray()

In [None]:
X_test_bow = np.concatenate((X_test_bow, test_bow), axis=1)

In [None]:
print(X_train_bow.shape)
print(X_test_bow.shape)

(2842, 5109)
(711, 5109)


# Training-Testing Phase

In [None]:
def training_and_prediction(training_model, X_train, X_test, y_train, y_test):
  model = training_model
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1score = f1_score(y_test, y_pred)
  # print("Accuracy =", accuracy)
  # print("Confusion Matrix =\n", confusion_matrix(y_test, y_pred))
  # print(classification_report(y_test, y_pred))
  return {'Accuracy' : accuracy, 'Precision': precision, 'Recall' : recall, 'F1-Score' : f1score}

In [None]:
def get_report(y_test, y_pred):
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1score = f1_score(y_test, y_pred)
  return {'Accuracy' : accuracy, 'Precision': precision, 'Recall' : recall, 'F1-Score' : f1score}

## Naive Bayes

In [None]:
scaler = MinMaxScaler()
training_and_prediction(MultinomialNB(alpha=0.7), scaler.fit_transform(X_train_bow), scaler.fit_transform(X_test_bow), y_train, y_test)

{'Accuracy': 0.7327707454289732,
 'Precision': 0.6885644768856448,
 'Recall': 0.8202898550724638,
 'F1-Score': 0.7486772486772488}

In [None]:
training_and_prediction(GaussianNB(), X_train_bow, X_test_bow, y_train, y_test)

{'Accuracy': 0.6315049226441631,
 'Precision': 0.6,
 'Recall': 0.7217391304347827,
 'F1-Score': 0.6552631578947369}

In [None]:
training_and_prediction(BernoulliNB(alpha=0.3), X_train_bow, X_test_bow, y_train, y_test)

{'Accuracy': 0.7524613220815752,
 'Precision': 0.7340720221606648,
 'Recall': 0.7681159420289855,
 'F1-Score': 0.7507082152974504}

## SVM

In [None]:
# param_grid = {
#     'C': [0.1, 1, 10, 100],
#     'gamma': [0.001, 0.01, 0.1, 1],
#     'kernel': ['sigmoid', 'rbf', 'poly']
# }

In [None]:
# grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
# grid_search.fit(X_train_bow, y_train)


In [None]:
# grid_search.best_estimator_

In [None]:
training_and_prediction(SVC(kernel='poly'), X_train_bow, X_test_bow, y_train, y_test)

{'Accuracy': 0.759493670886076,
 'Precision': 0.7253886010362695,
 'Recall': 0.8115942028985508,
 'F1-Score': 0.7660738714090288}

## Random Forest

In [None]:
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5, 10],
# }

# grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
# grid_search.fit(X_train_bow, y_train)
# print("Best Parameters:", grid_search.best_params_)
# print("Best Score:", grid_search.best_score_)

# Fitting 5 folds for each of 27 candidates, totalling 135 fits
# Best Parameters: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 200}
# Best Score: 0.7631896829129433

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Parameters: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 200}
Best Score: 0.7631896829129433


In [None]:
training_and_prediction(RandomForestClassifier(min_samples_split=10, n_estimators=200), X_train_bow, X_test_bow, y_train, y_test)

{'Accuracy': 0.7524613220815752,
 'Precision': 0.7302452316076294,
 'Recall': 0.7768115942028986,
 'F1-Score': 0.752808988764045}

## Logistic Regression

In [None]:
# # Define the parameter grid
# param_grid = {
#     'penalty': ['l1', 'l2'],
#     'C': [0.001, 0.01, 0.1, 1, 10, 100],
#     'solver': ['liblinear', 'saga']
# }

# # Create a Logistic Regression classifier
# logistic_reg = LogisticRegression()

# # Initialize GridSearchCV
# grid_search = GridSearchCV(estimator=logistic_reg, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

# # Perform grid search
# grid_search.fit(X_train_bow, y_train)

# # Print the best parameters and best score
# print("Best Parameters:", grid_search.best_params_)
# print("Best Score:", grid_search.best_score_)


In [None]:
training_and_prediction(LogisticRegression(max_iter=5000, solver='saga'), X_train_bow, X_test_bow, y_train, y_test)



{'Accuracy': 0.7552742616033755,
 'Precision': 0.7435897435897436,
 'Recall': 0.7565217391304347,
 'F1-Score': 0.7499999999999999}

## Gredient Boosting

In [None]:
# param_grid = {
#     'learning_rate': [0.01, 0.1, 0.2],
#     'n_estimators': [100, 200, 300],
#     'max_depth': [3, 4, 5],
#     'subsample': [0.5, 0.7, 1.0],
#     'colsample_bytree': [0.5, 0.7, 1.0]
# }
# xgb_classifier = XGBClassifier()
# grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=2, scoring='accuracy', verbose=2, n_jobs=-1)
# grid_search.fit(X_train_bow, y_train)
# print("Best Parameters:", grid_search.best_params_)
# print("Best Score:", grid_search.best_score_)

# Best Parameters: {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}
# Best Score: 0.7638986629134412

Fitting 2 folds for each of 243 candidates, totalling 486 fits




Best Parameters: {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}
Best Score: 0.7638986629134412


In [None]:
training_and_prediction(XGBClassifier(colsample_bytree=0.5, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=1.0), X_train_bow, X_test_bow, y_train, y_test)

{'Accuracy': 0.7623066104078763,
 'Precision': 0.7404371584699454,
 'Recall': 0.7855072463768116,
 'F1-Score': 0.7623066104078762}

## Deep Learning Techniques with Word2vec

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GlobalAveragePooling1D, SpatialDropout1D, Bidirectional, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

### ANN

In [None]:
def ann_models(num_neurons_layer1, num_neurons_layer2, dropout_rate=0):
    model_dense = Sequential()
    model_dense.add(Dense(num_neurons_layer1, activation='relu', input_shape=(X_train_bow.shape[1],)))
    model_dense.add(Dropout(dropout_rate))
    model_dense.add(Dense(num_neurons_layer2, activation='relu'))
    model_dense.add(Dense(1, activation='sigmoid'))
    model_dense.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model_dense.fit(X_train_bow, y_train, epochs=100, batch_size=32, verbose=0, callbacks=[early_stop], validation_data=(X_test_bow, y_test))
    y_pred_dense = model_dense.predict(X_test_bow)
    y_pred_dense = (y_pred_dense > 0.5).astype(int)
    return [{'num_neurons_layer1': num_neurons_layer1, 'num_neurons_layer2': num_neurons_layer2, 'dropout_rate': dropout_rate}, get_report(y_test, y_pred_dense)]

In [None]:
for num_neurons_layer1 in [256, 128, 64]:
  for num_neurons_layer2 in [64, 32]:
    for dropout_rate in [0.0, 0.1, 0.2, 0.3, 0.4]:
      print(ann_models(num_neurons_layer1, num_neurons_layer2, dropout_rate))

[{'num_neurons_layer1': 256, 'num_neurons_layer2': 64, 'dropout_rate': 0.0}, {'Accuracy': 0.7412095639943741, 'Precision': 0.7017543859649122, 'Recall': 0.8115942028985508, 'F1-Score': 0.7526881720430108}]
[{'num_neurons_layer1': 256, 'num_neurons_layer2': 64, 'dropout_rate': 0.1}, {'Accuracy': 0.7609001406469761, 'Precision': 0.7410468319559229, 'Recall': 0.7797101449275362, 'F1-Score': 0.7598870056497176}]
[{'num_neurons_layer1': 256, 'num_neurons_layer2': 64, 'dropout_rate': 0.2}, {'Accuracy': 0.770745428973277, 'Precision': 0.76, 'Recall': 0.7710144927536232, 'F1-Score': 0.7654676258992807}]
[{'num_neurons_layer1': 256, 'num_neurons_layer2': 64, 'dropout_rate': 0.3}, {'Accuracy': 0.7791842475386779, 'Precision': 0.7919254658385093, 'Recall': 0.7391304347826086, 'F1-Score': 0.7646176911544228}]
[{'num_neurons_layer1': 256, 'num_neurons_layer2': 64, 'dropout_rate': 0.4}, {'Accuracy': 0.7665260196905767, 'Precision': 0.7687687687687688, 'Recall': 0.7420289855072464, 'F1-Score': 0.7551