In [None]:
!gdown 1XGLAnvZt9oEYCscQcqHjbva0k5S605BZ
!gdown 1EEBfIUZD0q0wRXo8pPFnIGzlzaexI6si

Downloading...
From: https://drive.google.com/uc?id=1XGLAnvZt9oEYCscQcqHjbva0k5S605BZ
To: /content/test_data.csv
100% 176k/176k [00:00<00:00, 75.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1EEBfIUZD0q0wRXo8pPFnIGzlzaexI6si
To: /content/train_data.csv
100% 692k/692k [00:00<00:00, 113MB/s]


In [38]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
train_df = pd.read_csv("train_data.csv")
X_train = train_df['clean_text'].to_numpy()
y_train = train_df['target'].to_numpy()

In [None]:
test_df = pd.read_csv("test_data.csv")
X_test = test_df['clean_text'].to_numpy()
y_test = test_df['target'].to_numpy()

# TF-IDF

In [None]:
tv = TfidfVectorizer()
X_train_tfidf = tv.fit_transform(X_train).toarray()
X_test_tfidf = tv.transform(X_test).toarray()

In [None]:
X_train_tfidf.shape

(2842, 9050)

# Training-Testing Phase

In [None]:
def training_and_prediction(training_model, X_train, X_test, y_train, y_test):
  model = training_model
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1score = f1_score(y_test, y_pred)
  # print("Accuracy =", accuracy)
  # print("Confusion Matrix =\n", confusion_matrix(y_test, y_pred))
  # print(classification_report(y_test, y_pred))
  return {'Accuracy' : accuracy, 'Precision': precision, 'Recall' : recall, 'F1-Score' : f1score}

In [None]:
def get_report(y_test, y_pred):
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1score = f1_score(y_test, y_pred)
  return {'Accuracy' : accuracy, 'Precision': precision, 'Recall' : recall, 'F1-Score' : f1score}

## Naive Bayes

In [None]:
training_and_prediction(MultinomialNB(alpha=0.1), X_train_tfidf, X_test_tfidf, y_train, y_test)

{'Accuracy': 0.70042194092827,
 'Precision': 0.6549295774647887,
 'Recall': 0.808695652173913,
 'F1-Score': 0.7237354085603114}

In [None]:
training_and_prediction(GaussianNB(), X_train_tfidf, X_test_tfidf, y_train, y_test)

{'Accuracy': 0.569620253164557,
 'Precision': 0.5467625899280576,
 'Recall': 0.6608695652173913,
 'F1-Score': 0.5984251968503937}

In [None]:
training_and_prediction(BernoulliNB(alpha=0.2), X_train_tfidf, X_test_tfidf, y_train, y_test)

{'Accuracy': 0.7116736990154712,
 'Precision': 0.6758793969849246,
 'Recall': 0.7797101449275362,
 'F1-Score': 0.7240915208613729}

## SVM

In [None]:
training_and_prediction(SVC(C=10), X_train_tfidf, X_test_tfidf, y_train, y_test)

{'Accuracy': 0.7018284106891702,
 'Precision': 0.6873239436619718,
 'Recall': 0.7072463768115942,
 'F1-Score': 0.6971428571428572}

## Random Forest

In [None]:
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5, 10],
# }

# grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=2, scoring='accuracy', verbose=2, n_jobs=-1)
# grid_search.fit(X_train_tfidf, y_train)
# print("Best Parameters:", grid_search.best_params_)
# print("Best Score:", grid_search.best_score_)

In [None]:
training_and_prediction(RandomForestClassifier(), X_train_tfidf, X_test_tfidf, y_train, y_test)

{'Accuracy': 0.6933895921237694,
 'Precision': 0.6420581655480985,
 'Recall': 0.8318840579710145,
 'F1-Score': 0.7247474747474747}

## Logistic Regression

In [None]:
# Define the parameter grid
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
}

# Create a Logistic Regression classifier
logistic_reg = LogisticRegression()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=logistic_reg, param_grid=param_grid, cv=2, scoring='f1', verbose=2, n_jobs=-1)

# Perform grid search
grid_search.fit(X_train_tfidf, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Fitting 2 folds for each of 24 candidates, totalling 48 fits
Best Parameters: {'C': 1, 'penalty': 'l2', 'solver': 'saga'}
Best Score: 0.764020177000592


In [None]:
training_and_prediction(LogisticRegression(C=1, penalty='l2', solver='saga'), X_train_tfidf, X_test_tfidf, y_train, y_test)

{'Accuracy': 0.6976090014064698,
 'Precision': 0.6756756756756757,
 'Recall': 0.7246376811594203,
 'F1-Score': 0.6993006993006994}

## Gradient Boosting

In [None]:
# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'subsample': [0.5, 0.7, 1.0],
}

# Create an XGBoost classifier
xgb_classifier = XGBClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=2, scoring='f1', verbose=2, n_jobs=-1)

# Perform grid search
grid_search.fit(X_train_tfidf, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Fitting 2 folds for each of 81 candidates, totalling 162 fits




Best Parameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200, 'subsample': 1.0}
Best Score: 0.7424713358198647


In [None]:
training_and_prediction(XGBClassifier(learning_rate=0.1, max_depth=4, n_estimators=200, subsample=1.0), X_train_tfidf, X_test_tfidf, y_train, y_test)

{'Accuracy': 0.6962025316455697,
 'Precision': 0.6827195467422096,
 'Recall': 0.6985507246376812,
 'F1-Score': 0.6905444126074498}

## ANN

In [None]:
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

In [35]:
def ann_models(num_neurons_layer1, num_neurons_layer2, dropout_rate):
    model_dense = Sequential()
    model_dense.add(Dense(num_neurons_layer1, input_dim=X_train_tfidf.shape[1], activation='relu'))
    model_dense.add(Dropout(dropout_rate))
    model_dense.add(Dense(num_neurons_layer2, activation='relu'))
    model_dense.add(Dense(1, activation='sigmoid'))
    model_dense.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model_dense.fit(X_train_tfidf, y_train, epochs=20, batch_size=32, verbose=0, callbacks=[early_stop], validation_data=(X_test_tfidf, y_test))
    y_pred_dense = model_dense.predict(X_test_tfidf)
    y_pred_dense = (y_pred_dense > 0.5).astype(int)
    return [{'num_neurons_layer1': num_neurons_layer1, 'num_neurons_layer2': num_neurons_layer2, 'dropout_rate': dropout_rate}, get_report(y_test, y_pred_dense)]


In [41]:
for num_neurons_layer1 in [128, 64]:
  for num_neurons_layer2 in [64, 32]:
    for dropout_rate in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]:
      print(ann_models(num_neurons_layer1, num_neurons_layer2, dropout_rate))

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.0}, {'Accuracy': 0.6976090014064698, 'Precision': 0.6846590909090909, 'Recall': 0.6985507246376812, 'F1-Score': 0.6915351506456241}]
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.1}, {'Accuracy': 0.6947960618846695, 'Precision': 0.6739130434782609, 'Recall': 0.7188405797101449, 'F1-Score': 0.6956521739130435}]
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.2}, {'Accuracy': 0.6962025316455697, 'Precision': 0.6469248291571754, 'Recall': 0.8231884057971014, 'F1-Score': 0.7244897959183674}]
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.3}, {'Accuracy': 