In [1]:
!gdown 1OS2Aurb2oKmooCeYwYw0cnp542t7dyHo
!gdown 1Oibb4sFzJNLt0iBWOD4LAIKaWT7cCNT1

Downloading...
From: https://drive.google.com/uc?id=1OS2Aurb2oKmooCeYwYw0cnp542t7dyHo
To: /content/test_data.csv
100% 565k/565k [00:00<00:00, 47.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Oibb4sFzJNLt0iBWOD4LAIKaWT7cCNT1
To: /content/train_data.csv
100% 2.24M/2.24M [00:00<00:00, 52.6MB/s]


In [2]:
import numpy as np
import pandas as pd
import gensim
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [3]:
train_df = pd.read_csv("train_data.csv")
X_train = train_df.drop(['target'], axis=1)
y_train = train_df['target'].to_numpy()

In [4]:
test_df = pd.read_csv("test_data.csv")
X_test = test_df.drop(['target'], axis=1)
y_test = test_df['target'].to_numpy()

# Word2Vec

In [5]:
model_w2v = gensim.models.Word2Vec(window=10, min_count=2, vector_size=300)

In [6]:
texts = []
for text in X_train['clean_text']:
  texts.append(text.split())
for text in X_test:
  texts.append(text.split())

In [7]:
model_w2v.build_vocab(texts)

In [8]:
model_w2v.train(texts, total_examples=model_w2v.corpus_count, epochs=20)

(1826342, 2152600)

In [9]:
model_w2v.wv.most_similar("good")

[('friend', 0.8160386085510254),
 ('obnoxious', 0.8057385087013245),
 ('c', 0.8008529543876648),
 ('enjoy', 0.7915821075439453),
 ('tldr', 0.7796458005905151),
 ('well', 0.7654371857643127),
 ('uninterested', 0.754152238368988),
 ('lily', 0.7516642808914185),
 ('sweetest', 0.7499420046806335),
 ('girlfriend', 0.7395845651626587)]

In [10]:
model_w2v.wv.doesnt_match(['good', 'great', 'nice', 'bad'])

'bad'

In [11]:
model_w2v.save('word2vec.model')

In [12]:
def document_vector(doc):
    voc = set(model_w2v.wv.index_to_key)
    doc = doc.split()
    doc = [word for word in doc if word in voc]
    return np.mean(model_w2v.wv[doc], axis=0)

In [13]:
train_w2v = []
for review in X_train['clean_text']:
  train_w2v.append(document_vector(review))

In [14]:
train_w2v = np.array(train_w2v)
train_w2v.shape

(2842, 300)

In [15]:
test_w2v = []
for review in X_test['clean_text']:
  test_w2v.append(document_vector(review))

In [16]:
test_w2v = np.array(test_w2v)

In [17]:
X_train_w2v= X_train.drop(['clean_text'], axis=1).to_numpy()
X_test_w2v = X_test.drop(['clean_text'], axis=1).to_numpy()

In [18]:
X_train_w2v = np.concatenate((X_train_w2v, train_w2v), axis=1)
X_test_w2v = np.concatenate((X_test_w2v, test_w2v), axis=1)

In [19]:
print(X_train_w2v.shape)
print(X_test_w2v.shape)

(2842, 409)
(711, 409)


# Training and Testing

In [20]:
def training_and_prediction(training_model, X_train, X_test, y_train, y_test):
  model = training_model
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1score = f1_score(y_test, y_pred)
  # print("Accuracy =", accuracy)
  # print("Confusion Matrix =\n", confusion_matrix(y_test, y_pred))
  # print(classification_report(y_test, y_pred))
  return {'Accuracy' : accuracy, 'Precision': precision, 'Recall' : recall, 'F1-Score' : f1score}

In [21]:
def get_report(y_test, y_pred):
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1score = f1_score(y_test, y_pred)
  return {'Accuracy' : accuracy, 'Precision': precision, 'Recall' : recall, 'F1-Score' : f1score}

## Naive Bayes

In [22]:
scaler = MinMaxScaler()
training_and_prediction(MultinomialNB(), scaler.fit_transform(X_train_w2v), scaler.fit_transform(X_test_w2v), y_train, y_test)

{'Accuracy': 0.710267229254571,
 'Precision': 0.6724565756823822,
 'Recall': 0.7855072463768116,
 'F1-Score': 0.7245989304812834}

In [24]:
training_and_prediction(GaussianNB(), X_train_w2v, X_test_w2v, y_train, y_test)

{'Accuracy': 0.6779184247538678,
 'Precision': 0.6464646464646465,
 'Recall': 0.7420289855072464,
 'F1-Score': 0.6909581646423751}

In [23]:
training_and_prediction(BernoulliNB(), X_train_w2v, X_test_w2v, y_train, y_test)

{'Accuracy': 0.6779184247538678,
 'Precision': 0.6611111111111111,
 'Recall': 0.6898550724637681,
 'F1-Score': 0.675177304964539}

## SVM

In [26]:
training_and_prediction(SVC(kernel='poly'), X_train_w2v, X_test_w2v, y_train, y_test)

{'Accuracy': 0.759493670886076,
 'Precision': 0.7277486910994765,
 'Recall': 0.8057971014492754,
 'F1-Score': 0.7647867950481431}

## Random Forest

In [27]:
training_and_prediction(RandomForestClassifier(), X_train_w2v, X_test_w2v, y_train, y_test)

{'Accuracy': 0.7130801687763713,
 'Precision': 0.6974789915966386,
 'Recall': 0.7217391304347827,
 'F1-Score': 0.7094017094017094}

## Logistic Regression

In [29]:
training_and_prediction(LogisticRegression(max_iter=5000, solver='saga'), X_train_w2v, X_test_w2v, y_train, y_test)

{'Accuracy': 0.759493670886076,
 'Precision': 0.7377049180327869,
 'Recall': 0.782608695652174,
 'F1-Score': 0.7594936708860759}

## Gradient Boosting

In [30]:
training_and_prediction(XGBClassifier(colsample_bytree=0.5, learning_rate=0.1, max_depth=5, n_estimators=300, subsample= 0.7), X_train_w2v, X_test_w2v, y_train, y_test)

{'Accuracy': 0.7454289732770746,
 'Precision': 0.731638418079096,
 'Recall': 0.7507246376811594,
 'F1-Score': 0.7410586552217453}

## Deep Learning Techniques with Word2vec

In [31]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GlobalAveragePooling1D, SpatialDropout1D, Bidirectional, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

In [32]:
X_train_w2v.shape[1]

409

### ANN

In [34]:
def ann_models(num_neurons_layer1, num_neurons_layer2, dropout_rate=0):
    model_dense = Sequential()
    model_dense.add(Dense(num_neurons_layer1, activation='relu', input_shape=(X_train_w2v.shape[1],)))
    model_dense.add(Dropout(dropout_rate))
    model_dense.add(Dense(num_neurons_layer2, activation='relu'))
    model_dense.add(Dense(1, activation='sigmoid'))
    model_dense.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model_dense.fit(X_train_w2v, y_train, epochs=100, batch_size=32, verbose=0, callbacks=[early_stop], validation_data=(X_test_w2v, y_test))
    y_pred_dense = model_dense.predict(X_test_w2v)
    y_pred_dense = (y_pred_dense > 0.5).astype(int)
    return [{'num_neurons_layer1': num_neurons_layer1, 'num_neurons_layer2': num_neurons_layer2, 'dropout_rate': dropout_rate}, get_report(y_test, y_pred_dense)]

In [35]:
for num_neurons_layer1 in [128, 64]:
  for num_neurons_layer2 in [64, 32]:
    for dropout_rate in [0, 0.1, 0.2, 0.3, 0.4]:
      print(ann_models(num_neurons_layer1, num_neurons_layer2, dropout_rate))

[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0}, {'Accuracy': 0.739803094233474, 'Precision': 0.723463687150838, 'Recall': 0.7507246376811594, 'F1-Score': 0.736842105263158}]
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.1}, {'Accuracy': 0.7566807313642757, 'Precision': 0.7388888888888889, 'Recall': 0.7710144927536232, 'F1-Score': 0.754609929078014}]
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.2}, {'Accuracy': 0.7552742616033755, 'Precision': 0.7567567567567568, 'Recall': 0.7304347826086957, 'F1-Score': 0.7433628318584071}]
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.3}, {'Accuracy': 0.770745428973277, 'Precision': 0.7757575757575758, 'Recall': 0.7420289855072464, 'F1-Score': 0.7585185185185185}]
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate': 0.4}, {'Accuracy': 0.749648382559775, 'Precision': 0.7434402332361516, 'Recall': 0.7391304347826086, 'F1-Score':