In [None]:
!gdown 1XGLAnvZt9oEYCscQcqHjbva0k5S605BZ
!gdown 1EEBfIUZD0q0wRXo8pPFnIGzlzaexI6si

Downloading...
From: https://drive.google.com/uc?id=1XGLAnvZt9oEYCscQcqHjbva0k5S605BZ
To: /content/test_data.csv
100% 176k/176k [00:00<00:00, 72.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=1EEBfIUZD0q0wRXo8pPFnIGzlzaexI6si
To: /content/train_data.csv
100% 692k/692k [00:00<00:00, 36.6MB/s]


In [None]:
pip install --upgrade tensorflow-hub

In [15]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.layers import Input, Lambda, Dense, Dropout, Layer, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
import numpy as np
import pandas as pd

In [None]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

In [None]:
train_df = pd.read_csv("train_data.csv")
X_train = train_df['clean_text'].tolist()
y_train = train_df['target'].tolist()

In [None]:
test_df = pd.read_csv("test_data.csv")
X_test = test_df['clean_text'].tolist()
y_test = test_df['target'].tolist()

In [None]:
# elmo = hub.load("https://tfhub.dev/google/elmo/3").signatures["default"]

In [None]:
# elmo(tf.constant(["go hell man", "hell wrost place world"]))


In [17]:
elmo = hub.KerasLayer("https://tfhub.dev/google/elmo/3", signature='default', signature_outputs_as_dict=True)

In [None]:
elmo(tf.squeeze(tf.cast(["go hell man", "hell wrost place world"], tf.string)))['default']

<tf.Tensor: shape=(2, 1024), dtype=float32, numpy=
array([[ 0.45859107, -0.5309581 ,  0.09181502, ..., -0.05945195,
         0.6796574 , -0.18722355],
       [ 0.3864336 , -0.36333314,  0.25561047, ..., -0.06183494,
         0.93176365,  0.07429498]], dtype=float32)>

In [None]:
def ELMoEmbedding(x):
    return elmo(tf.squeeze(tf.cast(x, tf.string)))['default']

In [None]:
def embedding_generator(data):
  def batch_generator(data, batch_size):
      for i in range(0, len(data), batch_size):
          yield data[i:i + batch_size]

  batch_size = 100

  batched_embeddings = []

  for batch_strings in batch_generator(data, batch_size):
      batch_embeddings = ELMoEmbedding(batch_strings)
      batched_embeddings.append(batch_embeddings)

  return np.concatenate(batched_embeddings, axis=0)

In [None]:
X_train_embeddings = embedding_generator(X_train)

In [None]:
X_test_embeddings = embedding_generator(X_test)

In [None]:
print(X_train_embeddings.shape, X_test_embeddings.shape)

(2842, 1024) (711, 1024)


# Training and Testing

In [None]:
def training_and_prediction(training_model, X_train, X_test, y_train, y_test):
  model = training_model
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1score = f1_score(y_test, y_pred)
  # print("Accuracy =", accuracy)
  # print("Confusion Matrix =\n", confusion_matrix(y_test, y_pred))
  # print(classification_report(y_test, y_pred))
  return {'Accuracy' : accuracy, 'Precision': precision, 'Recall' : recall, 'F1-Score' : f1score}

In [None]:
def get_report(y_test, y_pred):
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1score = f1_score(y_test, y_pred)
  return {'Accuracy' : accuracy, 'Precision': precision, 'Recall' : recall, 'F1-Score' : f1score}

## Naive Bayes

In [None]:
scaler = MinMaxScaler()
training_and_prediction(MultinomialNB(alpha=10), scaler.fit_transform(X_train_embeddings), scaler.fit_transform(X_test_embeddings), y_train, y_test)

{'Accuracy': 0.6680731364275668,
 'Precision': 0.6326034063260341,
 'Recall': 0.7536231884057971,
 'F1-Score': 0.6878306878306879}

In [None]:
training_and_prediction(GaussianNB(), X_train_embeddings, X_test_embeddings, y_train, y_test)

{'Accuracy': 0.6638537271448663,
 'Precision': 0.6292682926829268,
 'Recall': 0.7478260869565218,
 'F1-Score': 0.6834437086092715}

In [None]:
training_and_prediction(BernoulliNB(alpha=10), X_train_embeddings, X_test_embeddings, y_train, y_test)

{'Accuracy': 0.6751054852320675,
 'Precision': 0.6439393939393939,
 'Recall': 0.7391304347826086,
 'F1-Score': 0.688259109311741}

## SVM

In [None]:
training_and_prediction(SVC(C=3), X_train_embeddings, X_test_embeddings, y_train, y_test)

{'Accuracy': 0.7482419127988749,
 'Precision': 0.7255434782608695,
 'Recall': 0.7739130434782608,
 'F1-Score': 0.7489481065918653}

## Random Forest

In [None]:
training_and_prediction(RandomForestClassifier(), X_train_embeddings, X_test_embeddings, y_train, y_test)

{'Accuracy': 0.6863572433192686,
 'Precision': 0.6540404040404041,
 'Recall': 0.7507246376811594,
 'F1-Score': 0.699055330634278}

## Logistic Regression

In [None]:
training_and_prediction(LogisticRegression(max_iter=500, solver='saga'), X_train_embeddings, X_test_embeddings, y_train, y_test)

{'Accuracy': 0.7144866385372715,
 'Precision': 0.7028571428571428,
 'Recall': 0.7130434782608696,
 'F1-Score': 0.7079136690647483}

## Gradient Boosting

In [None]:
training_and_prediction(XGBClassifier(), X_train_embeddings, X_test_embeddings, y_train, y_test)

{'Accuracy': 0.7158931082981715,
 'Precision': 0.6906666666666667,
 'Recall': 0.7507246376811594,
 'F1-Score': 0.7194444444444446}

## ANN

In [None]:
X_train_embeddings

array([[ 0.15054145, -0.23045449,  0.07539876, ..., -0.02576121,
         0.05825442, -0.13898185],
       [ 0.3116423 , -0.13860574,  0.11634748, ..., -0.12212291,
         0.39752665,  0.25772002],
       [ 0.15665506, -0.21737473,  0.00180799, ...,  0.13307352,
         0.23066385, -0.03902674],
       ...,
       [ 0.2207749 , -0.01463366,  0.14317514, ..., -0.04515651,
         0.20261212,  0.11536255],
       [ 0.28964156, -0.20121856,  0.08562777, ..., -0.14125249,
         0.11452472,  0.24354057],
       [ 0.13207194, -0.14568658,  0.26370496, ...,  0.00411364,
         0.24496566, -0.09277743]], dtype=float32)

In [None]:
def ann_models(num_neurons_layer1, num_neurons_layer2, dropout_rate1, dropout_rate2):
    model_dense = Sequential()
    if dropout_rate1 > 0:
      model_dense.add(Dropout(dropout_rate1))
    model_dense.add(Dense(num_neurons_layer1, activation='relu'))
    if dropout_rate2 > 0:
      model_dense.add(Dropout(dropout_rate2))
    model_dense.add(Dense(num_neurons_layer2, activation='relu'))
    model_dense.add(Dense(1, activation='sigmoid'))
    model_dense.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model_dense.fit(X_train_embeddings, np.array(y_train), epochs=20, batch_size=32, verbose=0, callbacks=[early_stop], validation_data=(X_test_embeddings, np.array(y_test)))
    y_pred_dense = model_dense.predict(X_test_embeddings)
    y_pred_dense = (y_pred_dense > 0.5).astype(int)
    return [{'num_neurons_layer1': num_neurons_layer1, 'num_neurons_layer2': num_neurons_layer2, 'dropout_rate1': dropout_rate1, 'dropout_rate2':dropout_rate2}, get_report(y_test, y_pred_dense)]


In [None]:
for num_neurons_layer1 in [128, 64]:
  for num_neurons_layer2 in [64, 32]:
    for dropout_rate1 in [0.0, 0.1, 0.2, 0.3, 0.4]:
      for dropout_rate2 in [0.0, 0.1, 0.2, 0.3, 0.4]:
        print(ann_models(num_neurons_layer1, num_neurons_layer2, dropout_rate1, dropout_rate2))

[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate1': 0.0, 'dropout_rate2': 0.0}, {'Accuracy': 0.7383966244725738, 'Precision': 0.7097625329815304, 'Recall': 0.7797101449275362, 'F1-Score': 0.7430939226519337}]
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate1': 0.0, 'dropout_rate2': 0.1}, {'Accuracy': 0.7383966244725738, 'Precision': 0.7086614173228346, 'Recall': 0.782608695652174, 'F1-Score': 0.743801652892562}]
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate1': 0.0, 'dropout_rate2': 0.2}, {'Accuracy': 0.7369901547116737, 'Precision': 0.717032967032967, 'Recall': 0.7565217391304347, 'F1-Score': 0.7362482369534555}]
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate1': 0.0, 'dropout_rate2': 0.3}, {'Accuracy': 0.7271448663853727, 'Precision': 0.711484593837535, 'Recall': 0.736231884057971, 'F1-Score': 0.7236467236467237}]
[{'num_neurons_layer1': 128, 'num_neurons_layer2': 64, 'dropout_rate1': 0.0, 'dropout_rat