In [65]:
# import packages
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Input, LSTM, Embedding, SpatialDropout1D, Lambda, Reshape
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow_hub as hub

from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

import time

In [66]:
# Ask tensorflow_hub to read models in the "UNCOMPRESSED" load format.
os.environ.update(TFHUB_MODEL_LOAD_FORMAT="UNCOMPRESSED")

In [67]:
# Connect to a TPU when the runtime has one; otherwise fall back to TensorFlow's
# default distribution strategy so the notebook still runs top to bottom.
# `tpu_strategy` is always defined after this cell.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
except ValueError:
    # The resolver raises ValueError when no TPU is attached to the runtime.
    tpu = None

if tpu is not None:
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # NOTE(review): newer TensorFlow releases expose this as tf.distribute.TPUStrategy;
    # the experimental alias is kept because the installed version is not pinned here.
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    print('Not connected to a TPU runtime; using the default distribution strategy.')
    tpu_strategy = tf.distribute.get_strategy()

Running on TPU  ['10.0.0.2:8470']


2022-11-29 22:59:42.491616: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job worker -> {0 -> 10.0.0.2:8470}
2022-11-29 22:59:42.491673: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job localhost -> {0 -> localhost:30020}
2022-11-29 22:59:42.494355: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job worker -> {0 -> 10.0.0.2:8470}
2022-11-29 22:59:42.494405: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job localhost -> {0 -> localhost:30020}


In [68]:
# settings
# num_word   -> passed to Tokenizer(num_words=...) for the bag-of-words features
# num_sample -> number of rows drawn from the CSV with DataFrame.sample
# Other values tried earlier: num_word in {10, 500, 1000}; num_sample = 1000.
num_word, num_sample = 100, 100000

In [69]:
# Load the labelled text, subsample it and build the one-hot-labelled train/test
# split consumed by the sentence-embedding models (the *_sen variables).
df_sentence = pd.read_csv('/kaggle/input/686-project/df_new.csv')
decode_map = {"negative":0, "positive":1}
df_sentence.target = df_sentence.target.apply(lambda x: decode_map[x])
# Seed the subsample: train_test_split below is seeded, but without a seed here
# every run would draw different rows and the results would not be reproducible.
df_sentence = df_sentence.sample(num_sample, random_state=1234)
# One column per class (0 = negative, 1 = positive), as integers.
label = np.array(pd.get_dummies(df_sentence.target), dtype=int)
# split train and test data
X_train_sen, X_test_sen, y_train_sen, y_test_sen = train_test_split(df_sentence['text'].astype(str), label,test_size=0.20,
                                                           random_state=1234,
                                                           stratify = label)

In [70]:
# Read the labelled text data and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/686-project/df_new.csv')
df.head(5)

Unnamed: 0,target,text
0,negative,switchfoot twitpicyzl awww bummer shoulda got ...
1,negative,upset updat hi facebook text might cri result ...
2,negative,kenichan dive mani time ball manag save rest g...
3,negative,whole bodi feel itchi like fire
4,negative,nationwideclass behav mad whi becaus see


In [71]:
# Subsample to num_sample rows. The draw is seeded so that re-running the notebook
# selects the same rows; an unseeded sample makes every downstream score vary
# between runs even though train_test_split itself is seeded.
df = df.sample(num_sample, random_state=1234)
df.shape

(100000, 2)

In [72]:
# Encode the string labels as integers (negative -> 0, positive -> 1).
# Any label missing from decode_map raises KeyError, so running this cell a second
# time on the already-encoded column fails.
decode_map = dict(negative=0, positive=1)
df['target'] = df['target'].apply(decode_map.__getitem__)

In [73]:
# Stratified 80/20 train/test split of the text column against the integer target.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'].astype(str),
    df['target'],
    test_size=0.20,
    random_state=1234,
    stratify=df['target'],
)

In [74]:
# Build the vocabulary from the training texts only, capped by num_word.
tokenizer = Tokenizer(num_words=num_word)
tokenizer.fit_on_texts(list(X_train))

In [75]:
# Replace the raw texts with per-document word-count matrices.
# The names are rebound in place, so this cell cannot be re-run on its own output.
X_train, X_test = [
    tokenizer.texts_to_matrix(texts, mode='count') for texts in (X_train, X_test)
]

2022-11-29 22:59:56.593351: W ./tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h:57] Ignoring an error encountered when deleting remote tensors handles: Invalid argument: Unable to find the relevant tensor remote_handle: Op ID: 750, Output num: 180
Additional GRPC error information from remote target /job:worker/replica:0/task:0:
:{"created":"@1669762796.593187975","description":"Error received from peer ipv4:10.0.0.2:8470","file":"external/com_github_grpc_grpc/src/core/lib/surface/call.cc","file_line":1056,"grpc_message":"Unable to find the relevant tensor remote_handle: Op ID: 750, Output num: 180","grpc_status":3}


In [76]:
# Peek at the first five rows of the count matrix.
X_train[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# KNN (baseline)

In [None]:
%%time
neigh = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
predictions = neigh.predict(X_test)
print("Accuracy of Model:",accuracy_score(y_test,predictions))

# MLP

In [None]:
%%time
clf = MLPClassifier(random_state=1234, max_iter=300).fit(X_train, y_train)
predictions = clf.predict(X_test)
print("Accuracy of Model:",accuracy_score(y_test,predictions))

# XGBoost

In [None]:
%%time
xgb = XGBClassifier()
xgb.fit(X_train,y_train)
test_pred = xgb.predict_proba(X_test)
print(roc_auc_score(y_test, test_pred[:, 1]))

In [None]:
# Recompute the test-set class probabilities from the fitted model and print ROC AUC
# for class 1 (same computation as the tail of the fitting cell, without refitting).
test_pred = xgb.predict_proba(X_test)
print(roc_auc_score(y_true=y_test, y_score=test_pred[:, 1]))

In [None]:
# Hard class predictions from the fitted XGBoost model, scored by accuracy.
predictions = xgb.predict(X_test)
print("Accuracy of Model:", accuracy_score(y_true=y_test, y_pred=predictions))

# RNN + LSTM

In [None]:
# TF-Hub URL of the module that is wrapped in a hub.KerasLayer below
# (Universal Sentence Encoder, version 4, per the URL).
embed = "https://tfhub.dev/google/universal-sentence-encoder/4"

In [None]:
# Wrap the TF-Hub module as a Keras layer that takes scalar strings (input_shape=[],
# dtype=tf.string); trainable=False keeps its weights frozen during training.
hub_layer = hub.KerasLayer(embed, input_shape=[], dtype=tf.string, trainable=False)

In [None]:
# max_features and embed_dim are not read by the model below; they are kept
# because they are notebook-level names.
max_features = 5000
embed_dim = 1

# Frozen sentence embedding -> reshaped to a 512-step sequence with one feature per
# step -> LSTM -> small dense head -> 2-way softmax.
# NOTE(review): Dense(10) has no activation, and Flatten looks like a no-op here
# (LSTM(32) without return_sequences should already emit a 2-D tensor) — confirm.
model = keras.Sequential([
    hub_layer,
    Reshape((512, 1)),
    LSTM(32),
    Dense(10),
    Flatten(),
    Dense(2, activation='softmax'),
])
model.summary()

In [None]:
# Adam optimiser, binary cross-entropy over the two softmax outputs, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Sanity check: shape of the training texts passed to model.fit.
X_train_sen.shape

In [None]:
# Sanity check: shape of the training labels passed to model.fit.
y_train_sen.shape

In [None]:
%%time
batch_size = 32
model.fit(X_train_sen, y_train_sen, epochs = 10, batch_size=batch_size, verbose = 2, shuffle=False)

In [None]:
# Score the model against the one-hot test labels.
# The previous `predict(...) < 0.5` mask set a 1 in the column of the class the model
# did NOT choose, so the printed accuracy was the complement of the real one.
# Taking the argmax of the softmax outputs and one-hot encoding it puts the 1 in the
# chosen class; `predictions` keeps its (n_samples, 2) shape.
class_probabilities = model.predict(X_test_sen)
predictions = np.eye(class_probabilities.shape[1])[class_probabilities.argmax(axis=1)]
print("Accuracy of Model:",accuracy_score(y_test_sen,predictions))

In [None]:
# Unbind `model` from the notebook namespace; the object itself is only reclaimed
# once nothing else references it.
del model

# Transfer learning

In [None]:
# TF-Hub URL of the text-embedding module used in this section; the commented line
# is the 128-dimension alternative.
#embedding = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1" # Token based text embedding trained on English Google News 200B corpus
embedding = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1" # Token based text embedding trained on English Google News 7B corpus

In [None]:
# Wrap the TF-Hub module as a Keras layer that takes scalar strings (input_shape=[],
# dtype=tf.string); trainable=False keeps its weights frozen during training.
# This rebinds the `hub_layer` name to the new module.
hub_layer = hub.KerasLayer(embedding, input_shape=[], dtype=tf.string, trainable=False)

In [None]:
# Transfer-learning classifier: frozen hub embedding -> 16-unit ReLU layer ->
# light dropout (rate 0.1) -> 2-way softmax.
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(2, activation='softmax'),
])
model.summary()

In [None]:
# Adam optimiser, binary cross-entropy over the two softmax outputs, accuracy metric.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Sanity check: shape of the training texts passed to model.fit.
X_train_sen.shape

In [None]:
%%timeit
#tf.config.run_functions_eagerly(True)
history = model.fit(
    X_train_sen, y_train_sen,
    batch_size = 128,
    epochs=10,                  
    verbose=1,
    validation_data=(X_test_sen,y_test_sen),                         
)

In [None]:
# Score the model against the one-hot test labels.
# The previous `predict(...) < 0.5` mask set a 1 in the column of the class the model
# did NOT choose, so the printed accuracy was the complement of the real one.
# Taking the argmax of the softmax outputs and one-hot encoding it puts the 1 in the
# chosen class; `predictions` keeps its (n_samples, 2) shape.
class_probabilities = model.predict(X_test_sen)
predictions = np.eye(class_probabilities.shape[1])[class_probabilities.argmax(axis=1)]
print("Accuracy of Model:",accuracy_score(y_test_sen,predictions))

In [None]:
# Unbind `model` from the notebook namespace; the object itself is only reclaimed
# once nothing else references it.
del model