In [4]:
# import packages
import os
import time

import numpy as np
import pandas as pd

from sklearn.decomposition import PCA,TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Input, LSTM, Embedding, SpatialDropout1D, Lambda, Reshape
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

import tensorflow_hub as hub

from xgboost import XGBClassifier


In [5]:
# Load the labelled text data and encode the string labels as 0/1.
df = pd.read_csv('/kaggle/input/686-project/df_new.csv')
decode_map = {"negative":0, "positive":1}
# Direct dict lookup: an unexpected label raises KeyError instead of
# silently becoming NaN.
df['target'] = [decode_map[label] for label in df['target']]

# split train and test data (80/20, stratified on the label, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    df['text'].astype(str),
    df['target'],
    test_size=0.20,
    random_state=1234,
    stratify=df['target'],
)

In [6]:
%%time
# consider both unigrams and bigrams in vectorizer
#tokenizer = TfidfVectorizer(min_df = 0.0001, ngram_range = (1,2))
tokenizer = TfidfVectorizer(min_df = 0.001, ngram_range = (1,2))
tokenizer.fit(X_train)

CPU times: user 28.2 s, sys: 1.18 s, total: 29.4 s
Wall time: 29.5 s


TfidfVectorizer(min_df=0.001, ngram_range=(1, 2))

In [7]:
%%time
X_train = tokenizer.transform(X_train)
X_test = tokenizer.transform(X_test)
X_train.shape

CPU times: user 24.4 s, sys: 120 ms, total: 24.5 s
Wall time: 24.6 s


(1280000, 1059)

%%time
svd = TruncatedSVD(n_components=100, random_state = 1234)
svd.fit(X_train)
print(svd.explained_variance_ratio_[:10])
print(sum(svd.explained_variance_ratio_))
X_train = svd.transform(X_train)
X_test = svd.transform(X_test)
X_train.shape

# KNN

In [None]:
%%time
model = cuKNeighbors(n_neighbors=7)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print("Accuracy of Model:",accuracy_score(y_test,predictions))

# Random Forest

In [None]:
%%time
model = RandomForestClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print("Accuracy of Model:",accuracy_score(y_test,predictions))

# XGBoost

In [8]:
%%time

xgb = XGBClassifier(tree_method='gpu_hist')
xgb.fit(X_train,y_train)
test_pred = xgb.predict_proba(X_test)
print(roc_auc_score(y_test, test_pred[:, 1]))

0.810113188515625
CPU times: user 4.92 s, sys: 453 ms, total: 5.37 s
Wall time: 6.23 s


# RNN + LSTM

In [10]:
# Reload the data for the neural model: it consumes raw strings and
# one-hot labels, whereas the earlier cells overwrote their splits with
# TF-IDF features.
df_sentence = pd.read_csv('/kaggle/input/686-project/df_new.csv')
decode_map = {"negative":0, "positive":1}
# Direct dict lookup: an unexpected label raises KeyError.
df_sentence['target'] = [decode_map[name] for name in df_sentence['target']]
#df_sentence = df_sentence.sample(num_sample)

# One-hot encode the 0/1 target into an integer array with one column per class.
label = pd.get_dummies(df_sentence['target']).to_numpy(dtype=int)

# split train and test data (same 80/20 ratio and seed as the TF-IDF split)
X_train_sen, X_test_sen, y_train_sen, y_test_sen = train_test_split(
    df_sentence['text'].astype(str),
    label,
    test_size=0.20,
    random_state=1234,
    stratify=label,
)

In [11]:
# Report which device TensorFlow will train on.
# tf.test.is_gpu_available() is deprecated; listing the physical GPU devices
# is the supported replacement. An empty list means no GPU is visible.
if tf.config.list_physical_devices('GPU'):
    print("Training on GPU...")
else:
    print("Training on CPU...")

Training on GPU...


2022-11-30 05:03:38.959097: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-30 05:03:38.967244: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-30 05:03:38.971602: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-30 05:03:38.972422: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA 

In [12]:
# Universal Sentence Encoder (v4) from TF Hub, wrapped as a Keras layer.
# It takes a batch of raw strings (scalar string input, hence input_shape=[])
# and is frozen (trainable=False), so only the layers stacked on top of it
# are trained.
embed = "https://tfhub.dev/google/universal-sentence-encoder/4"
hub_layer = hub.KerasLayer(
    embed,
    input_shape=[],
    dtype=tf.string,
    trainable=False,
)

2022-11-30 05:03:56.020603: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-30 05:03:56.021504: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-30 05:03:56.022152: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-30 05:03:56.023013: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-30 05:03:56.023673: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

In [13]:
# Sentence-embedding classifier: frozen hub encoder -> LSTM -> dense head.
model = keras.Sequential([
    hub_layer,
    # Treat the 512-dimensional sentence embedding as a sequence of 512
    # single-feature time steps so it can be fed to the LSTM.
    Reshape((512, 1)),
    LSTM(32),
    # Linear projection (no activation is specified).
    Dense(10),
    # The input here is already 2-D, so Flatten leaves the shape unchanged.
    Flatten(),
    # Two-way softmax, matching the one-hot encoded labels.
    Dense(2, activation='softmax'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 512)               256797824 
_________________________________________________________________
reshape (Reshape)            (None, 512, 1)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 32)                4352      
_________________________________________________________________
dense (Dense)                (None, 10)                330       
_________________________________________________________________
flatten (Flatten)            (None, 10)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 22        
Total params: 256,802,528
Trainable params: 4,704
Non-trainable params: 256,797,824
______________________________________

In [14]:
# The network ends in a 2-unit softmax and is trained on one-hot labels, so
# the matching loss is categorical cross-entropy. 'binary_crossentropy'
# treats each output unit as an independent sigmoid-style target, which is
# the wrong pairing for a softmax head (and makes the 'accuracy' metric
# resolve to element-wise binary accuracy).
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
%%time
batch_size = 128
model.fit(X_train_sen, y_train_sen, epochs = 10, batch_size=batch_size, verbose = 2, shuffle=False)

Epoch 1/10


2022-11-30 05:04:16.980374: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


In [None]:
# Convert the softmax probabilities into one-hot class predictions.
# The previous `model.predict(...) < 0.5` mask set a 1 on the class with the
# LOWER probability, i.e. it predicted the opposite class for every sample
# and reported an inverted accuracy. Taking the argmax selects the most
# probable class and also handles exact 0.5/0.5 ties.
probabilities = model.predict(X_test_sen)
predictions = np.eye(probabilities.shape[1])[probabilities.argmax(axis=1)]
# Both arrays are one-hot with one column per class, so accuracy_score
# counts a sample as correct only when the whole row matches.
print("Accuracy of Model:",accuracy_score(y_test_sen,predictions))

In [None]:
# Drop the notebook-level reference to the trained model.
# NOTE(review): presumably intended to free memory - `del` only removes the
# name, so confirm no other reference keeps the model alive.
del model