In [1]:
!gdown 1OS2Aurb2oKmooCeYwYw0cnp542t7dyHo
!gdown 1Oibb4sFzJNLt0iBWOD4LAIKaWT7cCNT1

Downloading...
From: https://drive.google.com/uc?id=1OS2Aurb2oKmooCeYwYw0cnp542t7dyHo
To: /content/test_data.csv
100% 565k/565k [00:00<00:00, 50.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Oibb4sFzJNLt0iBWOD4LAIKaWT7cCNT1
To: /content/train_data.csv
100% 2.24M/2.24M [00:00<00:00, 84.5MB/s]


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [4]:
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

In [5]:
train_df = train_df[train_df['confidence'] >= 0.8].reset_index(drop=True)

In [6]:
train_df.shape

(1832, 111)

In [7]:
test_df = test_df[test_df['confidence'] >= 0.8].reset_index(drop=True)
test_df.shape

(465, 111)

In [8]:
X_train = train_df['clean_text'].to_numpy()
y_train = train_df['target'].to_numpy()

In [9]:
X_test = test_df['clean_text'].to_numpy()
y_test = test_df['target'].to_numpy()

# TF-IDF

In [10]:
tv = TfidfVectorizer()
X_train_tfidf = tv.fit_transform(X_train).toarray()
X_test_tfidf = tv.transform(X_test).toarray()

In [11]:
X_train_tfidf.shape

(1832, 7192)

# Training-Testing Phase

In [12]:
def training_and_prediction(training_model, X_train, X_test, y_train, y_test):
  model = training_model
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1score = f1_score(y_test, y_pred)
  # print("Accuracy =", accuracy)
  # print("Confusion Matrix =\n", confusion_matrix(y_test, y_pred))
  # print(classification_report(y_test, y_pred))
  return {'Accuracy' : accuracy, 'Precision': precision, 'Recall' : recall, 'F1-Score' : f1score}

In [13]:
def get_report(y_test, y_pred):
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1score = f1_score(y_test, y_pred)
  return {'Accuracy' : accuracy, 'Precision': precision, 'Recall' : recall, 'F1-Score' : f1score}

## Naive Bayes

In [15]:
training_and_prediction(MultinomialNB(alpha=0.1), X_train_tfidf, X_test_tfidf, y_train, y_test)

{'Accuracy': 0.7548387096774194,
 'Precision': 0.71875,
 'Recall': 0.8625,
 'F1-Score': 0.7840909090909092}

In [16]:
training_and_prediction(GaussianNB(), X_train_tfidf, X_test_tfidf, y_train, y_test)

{'Accuracy': 0.6365591397849463,
 'Precision': 0.6329588014981273,
 'Recall': 0.7041666666666667,
 'F1-Score': 0.6666666666666667}

In [17]:
training_and_prediction(BernoulliNB(alpha=0.2), X_train_tfidf, X_test_tfidf, y_train, y_test)

{'Accuracy': 0.7655913978494624,
 'Precision': 0.7434944237918215,
 'Recall': 0.8333333333333334,
 'F1-Score': 0.7858546168958743}

## SVM

In [21]:
training_and_prediction(SVC(C=10), X_train_tfidf, X_test_tfidf, y_train, y_test)

{'Accuracy': 0.7763440860215054,
 'Precision': 0.772,
 'Recall': 0.8041666666666667,
 'F1-Score': 0.7877551020408163}

## Random Forest

In [22]:
training_and_prediction(RandomForestClassifier(), X_train_tfidf, X_test_tfidf, y_train, y_test)

{'Accuracy': 0.7010752688172043,
 'Precision': 0.6644951140065146,
 'Recall': 0.85,
 'F1-Score': 0.7458866544789763}

## Logistic Regression

In [23]:
training_and_prediction(LogisticRegression(solver='saga'), X_train_tfidf, X_test_tfidf, y_train, y_test)

{'Accuracy': 0.7548387096774194,
 'Precision': 0.7423076923076923,
 'Recall': 0.8041666666666667,
 'F1-Score': 0.7720000000000001}

## Gradient Boosting

In [24]:
training_and_prediction(XGBClassifier(), X_train_tfidf, X_test_tfidf, y_train, y_test)

{'Accuracy': 0.7462365591397849,
 'Precision': 0.75,
 'Recall': 0.7625,
 'F1-Score': 0.7561983471074379}

## ANN

In [25]:
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

In [26]:
def ann_models(num_neurons_layer1, num_neurons_layer2, dropout_rate):
    model_dense = Sequential()
    model_dense.add(Dense(num_neurons_layer1, input_dim=X_train_tfidf.shape[1], activation='relu'))
    model_dense.add(Dropout(dropout_rate))
    model_dense.add(Dense(num_neurons_layer2, activation='relu'))
    model_dense.add(Dense(1, activation='sigmoid'))
    model_dense.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model_dense.fit(X_train_tfidf, y_train, epochs=20, batch_size=32, verbose=0, callbacks=[early_stop], validation_data=(X_test_tfidf, y_test))
    y_pred_dense = model_dense.predict(X_test_tfidf)
    y_pred_dense = (y_pred_dense > 0.5).astype(int)
    return [{'num_neurons_layer1': num_neurons_layer1, 'num_neurons_layer2': num_neurons_layer2, 'dropout_rate': dropout_rate}, get_report(y_test, y_pred_dense)]


In [27]:
for num_neurons_layer1 in [128, 64]:
  for num_neurons_layer2 in [64, 32]:
    for dropout_rate in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]:
      print(ann_models(num_neurons_layer1, num_neurons_layer2, dropout_rate))

[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.0}, {'Accuracy': 0.7569892473118279, 'Precision': 0.757085020242915, 'Recall': 0.7791666666666667, 'F1-Score': 0.7679671457905545}]
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.1}, {'Accuracy': 0.7548387096774194, 'Precision': 0.7128378378378378, 'Recall': 0.8791666666666667, 'F1-Score': 0.7873134328358208}]
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.2}, {'Accuracy': 0.7720430107526882, 'Precision': 0.8221153846153846, 'Recall': 0.7125, 'F1-Score': 0.7633928571428571}]
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.3}, {'Accuracy': 0.7784946236559139, 'Precision': 0.7818930041152263, 'Recall': 0.7916666666666666, 'F1-Score': 0.7867494824016562}]
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.4}, {'Accuracy': 0.7677419354838709, 'Precision': 0.7538461538461538, 'Recall': 0.8166666666666667, 'F1-Score': 0.78