In [1]:
!gdown 1XGLAnvZt9oEYCscQcqHjbva0k5S605BZ
!gdown 1EEBfIUZD0q0wRXo8pPFnIGzlzaexI6si

Downloading...
From: https://drive.google.com/uc?id=1XGLAnvZt9oEYCscQcqHjbva0k5S605BZ
To: /content/test_data.csv
100% 176k/176k [00:00<00:00, 81.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1EEBfIUZD0q0wRXo8pPFnIGzlzaexI6si
To: /content/train_data.csv
100% 692k/692k [00:00<00:00, 52.3MB/s]


In [2]:
import numpy as np
import pandas as pd
import gensim
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

In [3]:
train_df = pd.read_csv("train_data.csv")
X_train = train_df['clean_text'].to_numpy()
y_train = train_df['target'].to_numpy()

In [4]:
test_df = pd.read_csv("test_data.csv")
X_test = test_df['clean_text'].to_numpy()
y_test = test_df['target'].to_numpy()

# Word2Vec

In [5]:
model_w2v = gensim.models.Word2Vec(window=15, min_count=2, vector_size=300)

In [6]:
texts = []
for text in X_train:
  texts.append(text.split())
for text in X_test:
  texts.append(text.split())

In [7]:
model_w2v.build_vocab(texts)

In [8]:
model_w2v.train(texts, total_examples=model_w2v.corpus_count, epochs=20)

(2306102, 2698160)

In [9]:
model_w2v.wv.most_similar("good")

[('friend', 0.7422623634338379),
 ('coincidentally', 0.727066695690155),
 ('enjoy', 0.7040348052978516),
 ('girlfriend', 0.7037444710731506),
 ('great', 0.6885247230529785),
 ('chore', 0.6868208646774292),
 ('kind', 0.6847022175788879),
 ('c', 0.6739140748977661),
 ('stance', 0.6736131906509399),
 ('tldr', 0.6710459589958191)]

In [10]:
model_w2v.wv.doesnt_match(['good', 'great', 'nice', 'bad'])

'bad'

In [11]:
model_w2v.save('word2vec.model')

In [12]:
def document_vector(doc):
    voc = set(model_w2v.wv.index_to_key)
    doc = doc.split()
    doc = [word for word in doc if word in voc]
    return np.mean(model_w2v.wv[doc], axis=0)

In [13]:
X_train_w2v = []
for review in X_train:
    X_train_w2v.append(document_vector(review))

In [14]:
X_test_w2v = []
for review in X_test:
    X_test_w2v.append(document_vector(review))

In [15]:
np.array(X_train_w2v).shape

(2842, 300)

# Training and Testing

In [16]:
def training_and_prediction(training_model, X_train, X_test, y_train, y_test):
  model = training_model
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1score = f1_score(y_test, y_pred)
  # print("Accuracy =", accuracy)
  # print("Confusion Matrix =\n", confusion_matrix(y_test, y_pred))
  # print(classification_report(y_test, y_pred))
  return {'Accuracy' : accuracy, 'Precision': precision, 'Recall' : recall, 'F1-Score' : f1score}

In [17]:
def get_report(y_test, y_pred):
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1score = f1_score(y_test, y_pred)
  return {'Accuracy' : accuracy, 'Precision': precision, 'Recall' : recall, 'F1-Score' : f1score}

## Naive Bayes

In [None]:
scaler = MinMaxScaler()
training_and_prediction(MultinomialNB(alpha=10), scaler.fit_transform(X_train_w2v), scaler.fit_transform(X_test_w2v), y_train, y_test)

{'Accuracy': 0.6582278481012658,
 'Precision': 0.6214285714285714,
 'Recall': 0.7565217391304347,
 'F1-Score': 0.6823529411764705}

In [None]:
training_and_prediction(GaussianNB(), X_train_w2v, X_test_w2v, y_train, y_test)

{'Accuracy': 0.6779184247538678,
 'Precision': 0.6518324607329843,
 'Recall': 0.7217391304347827,
 'F1-Score': 0.6850068775790923}

In [None]:
training_and_prediction(BernoulliNB(alpha=10), X_train_w2v, X_test_w2v, y_train, y_test)

{'Accuracy': 0.6469760900140648,
 'Precision': 0.6270270270270271,
 'Recall': 0.672463768115942,
 'F1-Score': 0.6489510489510489}

## SVM

In [None]:
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1],
    'kernel': ['sigmoid', 'rbf', 'poly', 'linear']
}
grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5, scoring='f1', verbose=2, n_jobs=-1)
grid_search.fit(X_train_w2v, y_train)
grid_search.best_estimator_

Fitting 5 folds for each of 64 candidates, totalling 320 fits


In [None]:
grid_search.best_score_

0.7550232445048259

In [None]:
training_and_prediction(SVC(C=100, gamma=0.001, kernel='linear'), X_train_w2v, X_test_w2v, y_train, y_test)

{'Accuracy': 0.6891701828410689,
 'Precision': 0.6791907514450867,
 'Recall': 0.6811594202898551,
 'F1-Score': 0.6801736613603473}

## Random Forest

In [None]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=2, scoring='f1', verbose=2, n_jobs=-1)
grid_search.fit(X_train_w2v, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Fitting 2 folds for each of 27 candidates, totalling 54 fits
Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}
Best Score: 0.7419985811290158


In [None]:
training_and_prediction(RandomForestClassifier(n_estimators=200, min_samples_split=5), X_train_w2v, X_test_w2v, y_train, y_test)

{'Accuracy': 0.6821378340365682,
 'Precision': 0.6639118457300276,
 'Recall': 0.6985507246376812,
 'F1-Score': 0.6807909604519774}

## Logistic Regression

In [None]:
# Define the parameter grid
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'max_iter': [500]
}

# Create a Logistic Regression classifier
logistic_reg = LogisticRegression()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=logistic_reg, param_grid=param_grid, cv=3, scoring='f1', verbose=2, n_jobs=-1)

# Perform grid search
grid_search.fit(X_train_w2v, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best Parameters: {'C': 1, 'max_iter': 500, 'penalty': 'l2', 'solver': 'saga'}
Best Score: 0.7555097235063098


In [None]:
training_and_prediction(LogisticRegression(max_iter=500, penalty='l2', solver='saga'), X_train_w2v, X_test_w2v, y_train, y_test)

{'Accuracy': 0.6863572433192686,
 'Precision': 0.6675824175824175,
 'Recall': 0.7043478260869566,
 'F1-Score': 0.685472496473907}

## Gradient Boosting

In [None]:
# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'subsample': [0.5, 0.7, 1.0],
}

# Create an XGBoost classifier
xgb_classifier = XGBClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=3, scoring='f1', verbose=2, n_jobs=-1)

# Perform grid search
grid_search.fit(X_train_w2v, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.7}
Best Score: 0.7441546734860139


In [None]:
training_and_prediction(XGBClassifier(learning_rate=0.1, max_depth=5, n_estimators=300, subsample=0.7), X_train_w2v, X_test_w2v, y_train, y_test)

{'Accuracy': 0.6722925457102672,
 'Precision': 0.6581920903954802,
 'Recall': 0.6753623188405797,
 'F1-Score': 0.6666666666666666}

## Deep Learning Techniques with Word2vec

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GlobalAveragePooling1D, SpatialDropout1D, Bidirectional, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

In [19]:
# Tokenize text data
tokenizer = Tokenizer(num_words=7000)
tokenizer.fit_on_texts(np.concatenate((X_train, X_test), axis=0))
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

In [20]:
high = 0
for temp in X_train_sequences:
  high = max(high, len(temp))
high

125

In [21]:
# Pad sequences to same length
max_len = 150
X_train_padded_sequences = pad_sequences(X_train_sequences, maxlen=max_len)
X_test_padded_sequences = pad_sequences(X_test_sequences, maxlen=max_len)

In [22]:
# Create embedding layer
embedding_dim = 300
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in model_w2v.wv:
        embedding_matrix[i] = model_w2v.wv[word]
embedding_layer = Embedding(len(tokenizer.word_index) + 1, embedding_dim, weights=[embedding_matrix], trainable=False)

### ANN

In [None]:
def ann_models(num_neurons_layer1, num_neurons_layer2=0, dropout_rate=0):
    model_dense = Sequential()
    model_dense.add(embedding_layer)
    model_dense.add(GlobalAveragePooling1D())
    model_dense.add(Dense(num_neurons_layer1, activation='relu'))
    model_dense.add(Dropout(dropout_rate))
    if num_neurons_layer2 > 0:
      model_dense.add(Dense(num_neurons_layer2, activation='relu'))
    model_dense.add(Dense(1, activation='sigmoid'))
    model_dense.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model_dense.fit(X_train_padded_sequences, y_train, epochs=20, batch_size=32, verbose=0, callbacks=[early_stop], validation_data=(X_test_padded_sequences, y_test))
    y_pred_dense = model_dense.predict(X_test_padded_sequences)
    y_pred_dense = (y_pred_dense > 0.5).astype(int)
    return [{'num_neurons_layer1': num_neurons_layer1, 'num_neurons_layer2': num_neurons_layer2, 'dropout_rate': dropout_rate}, get_report(y_test, y_pred_dense)]


In [None]:
for num_neurons_layer1 in [128, 64]:
  for num_neurons_layer2 in [64, 32, 0]:
    for dropout_rate in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]:
      print(ann_models(num_neurons_layer1, num_neurons_layer2, dropout_rate))

[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.0}, {'Accuracy': 0.679324894514768, 'Precision': 0.6511627906976745, 'Recall': 0.7304347826086957, 'F1-Score': 0.6885245901639344}]
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.1}, {'Accuracy': 0.70042194092827, 'Precision': 0.6727748691099477, 'Recall': 0.744927536231884, 'F1-Score': 0.7070151306740027}]
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.2}, {'Accuracy': 0.70042194092827, 'Precision': 0.6746031746031746, 'Recall': 0.7391304347826086, 'F1-Score': 0.7053941908713693}]
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.3}, {'Accuracy': 0.6962025316455697, 'Precision': 0.6767123287671233, 'Recall': 0.7159420289855073, 'F1-Score': 0.695774647887324}]
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.4}, {'Accuracy': 0.70042194092827, 'Precision': 0.6907514450867052, 'Recall': 0.6927536231884058, 'F1-Score': 

## LSTM

In [None]:
def lstm_models(num_neurons_layer1, dropout_rate=0):
    model_lstm = Sequential()
    model_lstm.add(embedding_layer)
    model_lstm.add(LSTM(128, dropout=dropout_rate))
    model_lstm.add(Dense(1, activation='sigmoid'))
    model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    model_lstm.fit(X_train_padded_sequences, y_train, epochs=20, batch_size=32, verbose=0, callbacks=[early_stop], validation_data=(X_test_padded_sequences, y_test))
    y_pred_lstm = model_lstm.predict(X_test_padded_sequences)
    y_pred_lstm = (y_pred_lstm > 0.5).astype(int)
    return [{'num_neurons_layer1': num_neurons_layer1, 'dropout_rate': dropout_rate}, get_report(y_test, y_pred_lstm)]

In [None]:
for num_neurons_layer1 in [128, 64]:
  for dropout_rate in [0.0, 0.1, 0.2, 0.3, 0.4]:
    print(lstm_models(num_neurons_layer1, dropout_rate))

[{'num_neurons_layer1': 128, 'dropout_rate': 0.0}, {'Accuracy': 0.6877637130801688, 'Precision': 0.654911838790932, 'Recall': 0.7536231884057971, 'F1-Score': 0.7008086253369272}]
[{'num_neurons_layer1': 128, 'dropout_rate': 0.1}, {'Accuracy': 0.6849507735583685, 'Precision': 0.6621983914209115, 'Recall': 0.7159420289855073, 'F1-Score': 0.6880222841225627}]
[{'num_neurons_layer1': 128, 'dropout_rate': 0.2}, {'Accuracy': 0.6652601969057665, 'Precision': 0.6851211072664359, 'Recall': 0.5739130434782609, 'F1-Score': 0.6246056782334385}]
[{'num_neurons_layer1': 128, 'dropout_rate': 0.3}, {'Accuracy': 0.680731364275668, 'Precision': 0.6620879120879121, 'Recall': 0.6985507246376812, 'F1-Score': 0.6798307475317348}]
[{'num_neurons_layer1': 128, 'dropout_rate': 0.4}, {'Accuracy': 0.7046413502109705, 'Precision': 0.6934097421203438, 'Recall': 0.7014492753623188, 'F1-Score': 0.6974063400576368}]
[{'num_neurons_layer1': 64, 'dropout_rate': 0.0}, {'Accuracy': 0.6863572433192686, 'Precision': 0.6826

### Bidirectional LSTM

In [23]:
model_bilstm = Sequential()
model_bilstm.add(embedding_layer)
model_bilstm.add(SpatialDropout1D(0.2))
model_bilstm.add(Bidirectional(LSTM(128, return_sequences=False)))
model_bilstm.add(Dropout(0.2))
model_bilstm.add(Dense(1, activation='sigmoid'))
model_bilstm.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 300)         3080400   
                                                                 
 spatial_dropout1d (Spatial  (None, None, 300)         0         
 Dropout1D)                                                      
                                                                 
 bidirectional (Bidirection  (None, 256)               439296    
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense (Dense)               (None, 1)                 257       
                                                                 
Total params: 3519953 (13.43 MB)
Trainable params: 43955

In [24]:
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_bilstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_bilstm.fit(X_train_padded_sequences, y_train, epochs=10, batch_size=32, validation_data=(X_test_padded_sequences, y_test), callbacks=[early_stop])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


<keras.src.callbacks.History at 0x7f20f6f6b220>

In [25]:
y_pred_bilstm = model_bilstm.predict(X_test_padded_sequences)
y_pred_bilstm = (y_pred_bilstm > 0.5).astype(int)
get_report(y_test, y_pred_bilstm)



{'Accuracy': 0.6905766526019691,
 'Precision': 0.7090301003344481,
 'Recall': 0.6144927536231884,
 'F1-Score': 0.6583850931677018}

### GRU

In [26]:
model_gru = Sequential()
model_gru.add(embedding_layer)
model_gru.add(SpatialDropout1D(0.2))
model_gru.add(GRU(128, return_sequences = False))
model_gru.add(Dropout(0.2))
model_gru.add(Dense(1, activation = 'sigmoid'))
model_gru.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 300)         3080400   
                                                                 
 spatial_dropout1d_1 (Spati  (None, None, 300)         0         
 alDropout1D)                                                    
                                                                 
 gru (GRU)                   (None, 128)               165120    
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 3245649 (12.38 MB)
Trainable params: 165249 (645.50 KB)
Non-trainable params: 3080400 (11.75 MB)
_________

In [27]:
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_gru.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_gru.fit(X_train_padded_sequences, y_train, epochs=10, batch_size=32, validation_data=(X_test_padded_sequences, y_test), callbacks=[early_stop])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


<keras.src.callbacks.History at 0x7f20f78e6650>

In [29]:
y_pred_gru = model_gru.predict(X_test_padded_sequences)
y_pred_gru = (y_pred_gru > 0.5).astype(int)
get_report(y_test, y_pred_gru)



{'Accuracy': 0.6919831223628692,
 'Precision': 0.6730769230769231,
 'Recall': 0.7101449275362319,
 'F1-Score': 0.691114245416079}