In [None]:
!gdown 1OS2Aurb2oKmooCeYwYw0cnp542t7dyHo
!gdown 1Oibb4sFzJNLt0iBWOD4LAIKaWT7cCNT1

Downloading...
From: https://drive.google.com/uc?id=1OS2Aurb2oKmooCeYwYw0cnp542t7dyHo
To: /content/test_data.csv
100% 565k/565k [00:00<00:00, 76.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Oibb4sFzJNLt0iBWOD4LAIKaWT7cCNT1
To: /content/train_data.csv
100% 2.24M/2.24M [00:00<00:00, 135MB/s]


In [None]:
import numpy as np
import pandas as pd
import gensim
import gensim.downloader as api
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

In [None]:
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

In [None]:
train_df = train_df[train_df['confidence'] >= 0.8].reset_index(drop=True)

In [None]:
train_df.shape

(1832, 111)

In [None]:
test_df = test_df[test_df['confidence'] >= 0.8].reset_index(drop=True)
test_df.shape

(465, 111)

In [None]:
X_train = train_df['clean_text'].to_numpy()
y_train = train_df['target'].to_numpy()

In [None]:
X_test = test_df['clean_text'].to_numpy()
y_test = test_df['target'].to_numpy()

# Pretrained Fasttext

In [None]:
path = api.load("fasttext-wiki-news-subwords-300", return_path=True)
print(path)

/root/gensim-data/fasttext-wiki-news-subwords-300/fasttext-wiki-news-subwords-300.gz


In [None]:
model_fasttext = gensim.models.KeyedVectors.load_word2vec_format(path)

In [None]:
voc = set(model_fasttext.index_to_key)
def document_vector(doc):
    doc = doc.split()
    doc = [word for word in doc if word in voc]
    return np.mean(model_fasttext[doc], axis=0)

In [None]:
X_train_fasttext = []
for review in X_train:
    X_train_fasttext.append(document_vector(review))

In [None]:
X_test_fasttext = []
for review in X_test:
    X_test_fasttext.append(document_vector(review))

In [None]:
np.array(X_train_fasttext).shape

(1832, 300)

# Training and Testing

In [None]:
def training_and_prediction(training_model, X_train, X_test, y_train, y_test):
  model = training_model
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1score = f1_score(y_test, y_pred)
  # print("Accuracy =", accuracy)
  # print("Confusion Matrix =\n", confusion_matrix(y_test, y_pred))
  # print(classification_report(y_test, y_pred))
  return {'Accuracy' : accuracy, 'Precision': precision, 'Recall' : recall, 'F1-Score' : f1score}

In [None]:
def get_report(y_test, y_pred):
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1score = f1_score(y_test, y_pred)
  return {'Accuracy' : accuracy, 'Precision': precision, 'Recall' : recall, 'F1-Score' : f1score}

## Naive Bayes

In [None]:
scaler = MinMaxScaler()
training_and_prediction(MultinomialNB(), scaler.fit_transform(X_train_fasttext), scaler.fit_transform(X_test_fasttext), y_train, y_test)

{'Accuracy': 0.7268817204301076,
 'Precision': 0.7215686274509804,
 'Recall': 0.7666666666666667,
 'F1-Score': 0.7434343434343434}

In [None]:
training_and_prediction(GaussianNB(), X_train_fasttext, X_test_fasttext, y_train, y_test)

{'Accuracy': 0.7290322580645161,
 'Precision': 0.7111111111111111,
 'Recall': 0.8,
 'F1-Score': 0.7529411764705882}

In [None]:
training_and_prediction(BernoulliNB(), X_train_fasttext, X_test_fasttext, y_train, y_test)

{'Accuracy': 0.7397849462365591,
 'Precision': 0.7428571428571429,
 'Recall': 0.7583333333333333,
 'F1-Score': 0.7505154639175258}

## SVM

In [None]:
training_and_prediction(SVC(C=10), X_train_fasttext, X_test_fasttext, y_train, y_test)

{'Accuracy': 0.7956989247311828,
 'Precision': 0.7865612648221344,
 'Recall': 0.8291666666666667,
 'F1-Score': 0.8073022312373225}

In [None]:
training_and_prediction(SVC(kernel='poly', C=5), X_train_fasttext, X_test_fasttext, y_train, y_test)

{'Accuracy': 0.7956989247311828,
 'Precision': 0.7865612648221344,
 'Recall': 0.8291666666666667,
 'F1-Score': 0.8073022312373225}

## Random Forest

In [None]:
training_and_prediction(RandomForestClassifier(), X_train_fasttext, X_test_fasttext, y_train, y_test)

{'Accuracy': 0.7634408602150538,
 'Precision': 0.7462121212121212,
 'Recall': 0.8208333333333333,
 'F1-Score': 0.7817460317460317}

## Logistic Regression

In [None]:
training_and_prediction(LogisticRegression(penalty='l2', C=10, solver='saga'), X_train_fasttext, X_test_fasttext, y_train, y_test)

{'Accuracy': 0.7978494623655914,
 'Precision': 0.7829457364341085,
 'Recall': 0.8416666666666667,
 'F1-Score': 0.8112449799196787}

## Gradient Boosting

In [None]:
training_and_prediction(XGBClassifier(), X_train_fasttext, X_test_fasttext, y_train, y_test)

{'Accuracy': 0.7849462365591398,
 'Precision': 0.7713178294573644,
 'Recall': 0.8291666666666667,
 'F1-Score': 0.7991967871485945}

## Deep Learning Techniques with Word2vec

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GlobalAveragePooling1D, SpatialDropout1D, Bidirectional, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Tokenize text data
tokenizer = Tokenizer(num_words=7000)
tokenizer.fit_on_texts(np.concatenate((X_train, X_test), axis=0))
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

In [None]:
high = 0
for temp in X_train_sequences:
  high = max(high, len(temp))
high

141

In [None]:
# Pad sequences to same length
max_len = 150
X_train_padded_sequences = pad_sequences(X_train_sequences, maxlen=max_len)
X_test_padded_sequences = pad_sequences(X_test_sequences, maxlen=max_len)

In [None]:
# Create embedding layer
embedding_dim = 300
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in model_fasttext:
        embedding_matrix[i] = model_fasttext[word]
embedding_layer = Embedding(len(tokenizer.word_index) + 1, embedding_dim, weights=[embedding_matrix], trainable=False)

### ANN

In [None]:
def ann_models(num_neurons_layer1, num_neurons_layer2=0, dropout_rate=0):
    model_dense = Sequential()
    model_dense.add(embedding_layer)
    model_dense.add(GlobalAveragePooling1D())
    model_dense.add(Dense(num_neurons_layer1, activation='relu'))
    model_dense.add(Dropout(dropout_rate))
    if num_neurons_layer2 > 0:
      model_dense.add(Dense(num_neurons_layer2, activation='relu'))
    model_dense.add(Dense(1, activation='sigmoid'))
    model_dense.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model_dense.fit(X_train_padded_sequences, y_train, epochs=100, batch_size=32, verbose=0, callbacks=[early_stop], validation_data=(X_test_padded_sequences, y_test))
    y_pred_dense = model_dense.predict(X_test_padded_sequences)
    y_pred_dense = (y_pred_dense > 0.5).astype(int)
    return [{'num_neurons_layer1': num_neurons_layer1, 'num_neurons_layer2': num_neurons_layer2, 'dropout_rate': dropout_rate}, get_report(y_test, y_pred_dense)]


In [None]:
for num_neurons_layer1 in [128, 64]:
  for num_neurons_layer2 in [64, 32]:
    for dropout_rate in [0.0, 0.1, 0.2, 0.3, 0.4]:
      print(ann_models(num_neurons_layer1, num_neurons_layer2, dropout_rate))

[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.0}, {'Accuracy': 0.8064516129032258, 'Precision': 0.8, 'Recall': 0.8333333333333334, 'F1-Score': 0.816326530612245}]
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.1}, {'Accuracy': 0.8064516129032258, 'Precision': 0.8073770491803278, 'Recall': 0.8208333333333333, 'F1-Score': 0.8140495867768595}]
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.2}, {'Accuracy': 0.8150537634408602, 'Precision': 0.826271186440678, 'Recall': 0.8125, 'F1-Score': 0.819327731092437}]
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.3}, {'Accuracy': 0.8064516129032258, 'Precision': 0.7884615384615384, 'Recall': 0.8541666666666666, 'F1-Score': 0.8200000000000001}]
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.4}, {'Accuracy': 0.8064516129032258, 'Precision': 0.8073770491803278, 'Recall': 0.8208333333333333, 'F1-Score': 0.8140495867768595}]


### LSTM

In [None]:
def lstm_models(num_neurons_layer1, dropout_rate=0):
    model_lstm = Sequential()
    model_lstm.add(embedding_layer)
    model_lstm.add(LSTM(128, dropout=dropout_rate))
    model_lstm.add(Dense(1, activation='sigmoid'))
    model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    model_lstm.fit(X_train_padded_sequences, y_train, epochs=20, batch_size=32, verbose=0, callbacks=[early_stop], validation_data=(X_test_padded_sequences, y_test))
    y_pred_lstm = model_lstm.predict(X_test_padded_sequences)
    y_pred_lstm = (y_pred_lstm > 0.5).astype(int)
    return [{'num_neurons_layer1': num_neurons_layer1, 'dropout_rate': dropout_rate}, get_report(y_test, y_pred_lstm)]

In [None]:
for num_neurons_layer1 in [128, 64]:
  for dropout_rate in [0.0, 0.1, 0.2, 0.3, 0.4]:
    print(lstm_models(num_neurons_layer1, dropout_rate))

[{'num_neurons_layer1': 128, 'dropout_rate': 0.0}, {'Accuracy': 0.8021505376344086, 'Precision': 0.8083333333333333, 'Recall': 0.8083333333333333, 'F1-Score': 0.8083333333333333}]
[{'num_neurons_layer1': 128, 'dropout_rate': 0.1}, {'Accuracy': 0.8064516129032258, 'Precision': 0.8151260504201681, 'Recall': 0.8083333333333333, 'F1-Score': 0.8117154811715481}]
[{'num_neurons_layer1': 128, 'dropout_rate': 0.2}, {'Accuracy': 0.8043010752688172, 'Precision': 0.792156862745098, 'Recall': 0.8416666666666667, 'F1-Score': 0.8161616161616161}]
[{'num_neurons_layer1': 128, 'dropout_rate': 0.3}, {'Accuracy': 0.789247311827957, 'Precision': 0.7817460317460317, 'Recall': 0.8208333333333333, 'F1-Score': 0.8008130081300813}]
[{'num_neurons_layer1': 128, 'dropout_rate': 0.4}, {'Accuracy': 0.7913978494623656, 'Precision': 0.7677902621722846, 'Recall': 0.8541666666666666, 'F1-Score': 0.8086785009861932}]
[{'num_neurons_layer1': 64, 'dropout_rate': 0.0}, {'Accuracy': 0.8064516129032258, 'Precision': 0.7884

### Bidirectional LSTM

In [None]:
def bilstm_models(num_neurons_layer1, spatial_dropout_rate=0, dropout_rate=0):
    model_bilstm = Sequential()
    model_bilstm.add(embedding_layer)
    model_bilstm.add(SpatialDropout1D(spatial_dropout_rate))
    model_bilstm.add(Bidirectional(LSTM(num_neurons_layer1, return_sequences=False)))
    model_bilstm.add(Dropout(dropout_rate))
    model_bilstm.add(Dense(1, activation='sigmoid'))
    model_bilstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    model_bilstm.fit(X_train_padded_sequences, y_train, epochs=10, batch_size=32, verbose=0, callbacks=[early_stop], validation_data=(X_test_padded_sequences, y_test))
    y_pred_bilstm = model_bilstm.predict(X_test_padded_sequences)
    y_pred_bilstm = (y_pred_bilstm > 0.5).astype(int)
    return [{'num_neurons_layer1': num_neurons_layer1, 'spatial_dropout_rate': spatial_dropout_rate, 'dropout_rate': dropout_rate}, get_report(y_test, y_pred_bilstm)]

In [None]:
for num_neurons_layer1 in [128, 64]:
  for spatial_dropout_rate in [0, 0.2]:
    for dropout_rate in [0, 0.1, 0.2, 0.3, 0.4]:
      print(bilstm_models(num_neurons_layer1, spatial_dropout_rate, dropout_rate))

[{'num_neurons_layer1': 128, 'spatial_dropout_rate': 0, 'dropout_rate': 0}, {'Accuracy': 0.7677419354838709, 'Precision': 0.7214765100671141, 'Recall': 0.8958333333333334, 'F1-Score': 0.7992565055762082}]
[{'num_neurons_layer1': 128, 'spatial_dropout_rate': 0, 'dropout_rate': 0.1}, {'Accuracy': 0.7827956989247312, 'Precision': 0.740484429065744, 'Recall': 0.8916666666666667, 'F1-Score': 0.8090737240075614}]
[{'num_neurons_layer1': 128, 'spatial_dropout_rate': 0, 'dropout_rate': 0.2}, {'Accuracy': 0.7784946236559139, 'Precision': 0.7306397306397306, 'Recall': 0.9041666666666667, 'F1-Score': 0.8081936685288641}]
[{'num_neurons_layer1': 128, 'spatial_dropout_rate': 0, 'dropout_rate': 0.3}, {'Accuracy': 0.7870967741935484, 'Precision': 0.8105726872246696, 'Recall': 0.7666666666666667, 'F1-Score': 0.7880085653104925}]
[{'num_neurons_layer1': 128, 'spatial_dropout_rate': 0, 'dropout_rate': 0.4}, {'Accuracy': 0.789247311827957, 'Precision': 0.7448275862068966, 'Recall': 0.9, 'F1-Score': 0.815

### GRU

In [None]:
model_gru = Sequential()
model_gru.add(embedding_layer)
# model_gru.add(SpatialDropout1D(0.2))
model_gru.add(GRU(64, return_sequences = False))
model_gru.add(Dropout(0.2))
model_gru.add(Dense(1, activation = 'sigmoid'))
model_gru.summary()

Model: "sequential_51"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 300)         2455800   
                                                                 
 gru (GRU)                   (None, 64)                70272     
                                                                 
 dropout_41 (Dropout)        (None, 64)                0         
                                                                 
 dense_93 (Dense)            (None, 1)                 65        
                                                                 
Total params: 2526137 (9.64 MB)
Trainable params: 70337 (274.75 KB)
Non-trainable params: 2455800 (9.37 MB)
_________________________________________________________________


In [None]:
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model_gru.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_gru.fit(X_train_padded_sequences, y_train, epochs=50, batch_size=32, validation_data=(X_test_padded_sequences, y_test), callbacks=[early_stop])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50


<keras.src.callbacks.History at 0x7ab37c151000>

In [None]:
y_pred_gru = model_gru.predict(X_test_padded_sequences)
y_pred_gru = (y_pred_gru > 0.5).astype(int)
get_report(y_test, y_pred_gru)



{'Accuracy': 0.7849462365591398,
 'Precision': 0.75,
 'Recall': 0.875,
 'F1-Score': 0.8076923076923077}