In [1]:
# import packages
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA,TruncatedSVD

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Input, LSTM, Embedding, SpatialDropout1D, Lambda, Reshape, Conv1D, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow_hub as hub

from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, log_loss

import time

import gc

from numba import cuda

In [2]:
from cuml.naive_bayes import MultinomialNB
from cuml.ensemble import RandomForestClassifier as cuRFC

In [3]:
import cudf, cuml
from cuml.neighbors import KNeighborsClassifier as cuKNeighbors
from cuml.ensemble import RandomForestClassifier as cumlRandomForestClassifier

In [4]:
import cupy as cp
from cuml.svm import SVC

In [5]:
num_sample = 5000
#num_sample = 50000
#num_sample = 100000

In [6]:
results = []

In [7]:
df = pd.read_csv('/kaggle/input/686-project/df_new.csv').sample(num_sample,random_state = 1234)
decode_map = {"negative":0, "positive":1}
df.target = df.target.apply(lambda x: decode_map[x])
# split train and test data
X_train, X_test, y_train, y_test = train_test_split(df['text'].astype(str), df['target'],test_size=0.20,
                                                           random_state=1234,
                                                           stratify = df['target'])

In [8]:
%%time
# consider both unigrams and bigrams in vectorizer
#tokenizer = TfidfVectorizer(min_df = 0.0001, ngram_range = (1,2))
tokenizer = TfidfVectorizer(min_df = 0.001, ngram_range = (1,2))
tokenizer.fit(X_train)
X_train = tokenizer.transform(X_train)
X_test = tokenizer.transform(X_test)
X_train.shape

CPU times: user 349 ms, sys: 0 ns, total: 349 ms
Wall time: 355 ms


(4000, 1310)

# KNN

In [9]:
%%time
model = cuKNeighbors()
start = time.time()
model.fit(X_train, y_train)
end = time.time()
start_pred = time.time()
predictions = model.predict(X_test)
end_pred = time.time()
results.append(["KNN",f1_score(y_test,predictions), accuracy_score(y_test,predictions), log_loss(y_test,predictions), end-start, end_pred - start_pred])
print("F_1 Score of Model:",f1_score(y_test,predictions))
print("Accuracy of Model:",accuracy_score(y_test,predictions))
print("Cross-entropy loss of Model:",log_loss(y_test,predictions))

F_1 Score of Model: 0.6725925925925926
Accuracy of Model: 0.558
Cross-entropy loss of Model: 15.26645820592517
CPU times: user 3.66 s, sys: 1 s, total: 4.66 s
Wall time: 8.34 s


# Naive Bayes

In [10]:
%%time
model = MultinomialNB()
start = time.time()
model.fit(X_train, y_train)
end = time.time()
start_pred = time.time()
predictions = model.predict(X_test)
end_pred = time.time()
results.append(["Naive Bayes",f1_score(y_test,predictions), accuracy_score(y_test,predictions), log_loss(y_test,predictions), end - start, end_pred - start_pred])
print("F_1 Score of Model:",f1_score(y_test,predictions))
print("Accuracy of Model:",accuracy_score(y_test,predictions))
print("Cross-entropy loss of Model:",log_loss(y_test,predictions))

F_1 Score of Model: 0.677720207253886
Accuracy of Model: 0.689
Cross-entropy loss of Model: 10.74167220205488
CPU times: user 5.9 s, sys: 153 ms, total: 6.05 s
Wall time: 6.49 s


# Random Forest

In [11]:
%%time
model = cuRFC()
start = time.time()
model.fit(X_train.toarray(), y_train)
end = time.time()
start_pred = time.time()
predictions = model.predict(X_test.toarray())
end_pred = time.time()
results.append(["Random Forest",f1_score(y_test,predictions), accuracy_score(y_test,predictions), log_loss(y_test,predictions),end - start, end_pred - start_pred])
print("F_1 Score of Model:",f1_score(y_test,predictions))
print("Accuracy of Model:",accuracy_score(y_test,predictions))
print("Cross-entropy loss of Model:",log_loss(y_test,predictions))

F_1 Score of Model: 0.6989351403678606
Accuracy of Model: 0.689
Cross-entropy loss of Model: 10.741699388367506
CPU times: user 1.16 s, sys: 359 ms, total: 1.52 s
Wall time: 1.3 s


  ret_val = func(*args, **kwargs)
Defaulting to CPU-based Prediction. 
To predict on float-64 data, set parameter predict_model = 'CPU'
  ret_val = func(*args, **kwargs)


# SVM

In [12]:
%%time
model = SVC()
start = time.time()
model.fit(X_train.toarray(), y_train)
end = time.time()
start_pred = time.time()
predictions = model.predict(X_test.toarray())
end_pred = time.time()
results.append(["SVM",f1_score(y_test,predictions), accuracy_score(y_test,predictions), log_loss(y_test,predictions), end - start, end_pred - start_pred])
print("F_1 Score of Model:",f1_score(y_test,predictions))
print("Accuracy of Model:",accuracy_score(y_test,predictions))
print("Cross-entropy loss of Model:",log_loss(y_test,predictions))

F_1 Score of Model: 0.6922300706357214
Accuracy of Model: 0.695
Cross-entropy loss of Model: 10.534447539659718
CPU times: user 1.62 s, sys: 219 ms, total: 1.84 s
Wall time: 2.48 s


# Xgboost

In [13]:
%%time
xgb = XGBClassifier(tree_method='gpu_hist')
start = time.time()
xgb.fit(X_train,y_train)
end = time.time()
start_pred = time.time()
predictions = xgb.predict(X_test)
end_pred = time.time()
results.append(["Xgboost",f1_score(y_test,predictions), accuracy_score(y_test,predictions), log_loss(y_test,predictions), end - start, end_pred - start_pred])
print("F_1 Score of Model:",f1_score(y_test,predictions))
print("Accuracy of Model:",accuracy_score(y_test,predictions))
print("Cross-entropy loss of Model:",log_loss(y_test,predictions))

F_1 Score of Model: 0.6883720930232559
Accuracy of Model: 0.665
Cross-entropy loss of Model: 11.57065640856056
CPU times: user 829 ms, sys: 140 ms, total: 968 ms
Wall time: 1.24 s


In [14]:
results = np.array(results)
result = pd.DataFrame(np.round(results[:,1:].astype(float),3), index = results[:,0], columns = ['F-1','Accuracy','Log-loss','training time','prediction time'])
print(result)

                 F-1  Accuracy  Log-loss  training time  prediction time
KNN            0.673     0.558    15.266          1.816            0.644
Naive Bayes    0.678     0.689    10.742          3.887            2.597
Random Forest  0.699     0.689    10.742          1.204            0.094
SVM            0.692     0.695    10.534          1.937            0.539
Xgboost        0.688     0.665    11.571          1.228            0.008


In [15]:
break

SyntaxError: 'break' outside loop (668683560.py, line 4)

In [None]:
model = keras.Sequential()
model.add(Input(shape = (X_train.shape[1],)))
#model.add(hub_layer)
model.add(tf.keras.layers.Reshape((X_train.shape[1],1)))
model.add(Bidirectional(LSTM(units=32)))
model.add(Dense(16))
model.add(Flatten())
model.add(Dense(1,activation='sigmoid'))
model.summary()

In [None]:
model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [None]:
%%time
start = time.time()
batch_size = 128
model.fit(X_train.toarray(), y_train, epochs = 12, batch_size=batch_size, verbose = 2, shuffle=False)
end = time.time()

In [None]:
#predictions = (np.ones((X_test_sen.shape[0],2))* (model.predict(X_test_sen)>0.5))[:,1]
#y_test_sen = y_test_sen[:,1]
#results.append(["LSTM",f1_score(y_test,predictions), accuracy_score(y_test,predictions), log_loss(y_test,predictions), end - start])
print("F_1 Score of Model:",f1_score(y_test_sen,predictions))
print("Accuracy of Model:",accuracy_score(y_test_sen,predictions))
print("Cross-entropy loss of Model:",log_loss(y_test_sen,predictions))

In [None]:
# results = np.array(results)
# result = pd.DataFrame(np.round(results[:,1:].astype(float),3), index = results[:,0], columns = ['F-1','Accuracy','Log-loss','training time'])
# print(result)

In [None]:
break

In [None]:
df_sentence = pd.read_csv('/kaggle/input/686-project/df_new.csv')
decode_map = {"negative":0, "positive":1}
df_sentence.target = df_sentence.target.apply(lambda x: decode_map[x])
df_sentence = df_sentence.sample(num_sample,random_state = 1234)
label = np.array(pd.get_dummies(df_sentence.target), dtype=int)[:] 
# split train and test data
X_train_sen, X_test_sen, y_train_sen, y_test_sen = train_test_split(df_sentence['text'].astype(str), label,test_size=0.20,
                                                           random_state=1234,
                                                           stratify = label)

In [None]:
print("Training on GPU...") if tf.test.is_gpu_available() else print("Training on CPU...")
embed = "https://tfhub.dev/google/universal-sentence-encoder/4"
#hub_layer = hub.KerasLayer(embed, input_shape=[], dtype=tf.string, trainable=False)
model = keras.Sequential()
model.add(Input(shape = (X_train.shape[1],))
#model.add(hub_layer)
model.add(tf.keras.layers.Reshape((512,1)))
model.add(LSTM(256))
model.add(Dense(128))
model.add(Flatten())
model.add(Dense(2,activation='softmax'))
model.summary()

In [None]:
model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [None]:
%%time
start = time.time()
batch_size = 128
model.fit(X_train_sen, y_train_sen, epochs = 12, batch_size=batch_size, verbose = 2, shuffle=False)
end = time.time()

In [None]:
predictions = (np.ones((X_test_sen.shape[0],2))* (model.predict(X_test_sen)>0.5))[:,1]
y_test_sen = y_test_sen[:,1]
results.append(["LSTM",f1_score(y_test_sen,predictions), accuracy_score(y_test_sen,predictions), log_loss(y_test_sen,predictions),end-start])
print("F_1 Score of Model:",f1_score(y_test_sen,predictions))
print("Accuracy of Model:",accuracy_score(y_test_sen,predictions))
print("Cross-entropy loss of Model:",log_loss(y_test_sen,predictions))

In [None]:
results = np.array(results)
result = pd.DataFrame(np.round(results[:,1:].astype(float),3), index = results[:,0], columns = ['F-1','Accuracy','Log-loss','training time'])
print(result)