In [1]:
import pandas as pd
import random
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay

import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Embedding
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras import layers 

import matplotlib.pyplot as plt



In [2]:
# Load the tweet sentiment dataset into a DataFrame.
import pandas as pd

# NOTE: absolute, machine-specific location of the CSV; adjust before running
# on another machine.
file_path = '/Users/wangfan/Downloads/tweets.csv'

# Read the CSV and, in the same chain, discard the first row and the first
# column (positional slicing via iloc).
# NOTE(review): presumably the first column is a saved index and the first row
# is unwanted; the reason is not visible here. Confirm against the raw file.
df = pd.read_csv(file_path).iloc[1:, 1:]

# Display the resulting frame as the cell output.
df

Unnamed: 0,text,sentiment
1,rising cases of covid does not alarm me rising...,1
2,please vote for chicagoindiaresolution marking...,0
3,wishing all of you eidaladha hazrat ibrahim as...,1
4,daily coronavirus cases in india top for first...,1
5,sitting here india style watching the raindrop...,0
...,...,...
134343,happydiwali to india from dgisi and dgispr you...,1
134344,india has been involved in many countries with...,1
134345,does anybody know website which ships pre orde...,0
134346,what hinted few weeks back in an interview wit...,0


In [3]:
# Discard every row that contains a missing value, then preview five rows
# chosen at random (no seed is set, so the preview differs between runs).
df = df.dropna(axis='index')
df.sample(n=5)

Unnamed: 0,text,sentiment
122511,my one third grader just walked up to our assi...,0
57247,it is absolutely wild to me that people who kn...,1
32985,dont blame ariana deleting this app because it...,1
109829,dame fuckin lillard,0
34913,vacation wooooo that was lot of damage in debt...,1


In [4]:
# Features are the tweet text; labels are the sentiment column.
X, y = df['text'], df['sentiment']

# Class balance check: number of rows per sentiment value.
y.value_counts()

sentiment
1    68676
0    65653
Name: count, dtype: int64

In [5]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer sentiment labels (values 0 and 1 in this dataset)
# so they match the 2-unit softmax output and categorical_crossentropy loss
# used by the model defined later in the notebook.
y_Onehot =  to_categorical(y)

In [6]:
# Hold out 25% of the samples as a test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X.to_numpy(),
    y_Onehot,
    test_size=0.25,
    random_state=42,
)

# Sanity check: sizes of the four resulting arrays, printed on one line.
print(*map(len, (X_train, X_test, y_train, y_test)))

100746 33583 100746 33583


In [7]:
# Inspect the raw test tweets (still plain strings at this point, before
# vectorization).
print(X_test)

['there is great feeling when someone who never understood bitcoin spends their time reading about it and realizing how amazing it is there is great feeling when someone who never understood ethereum spends their time reading about it and realizing how amazing it is'
 'didn enjoy some money troubles credit cards perhaps online gambling'
 'vh playlist love by chungha' ... 'justice needed ssr'
 'just need you all to know that my mother has become real human again since was deplatformed the idea that companies will forget the damage he did and let him rejoin their platforms is so horrific'
 'will not stop worrying no matter how many email leaks lab leaks until lockdowns become the focus of conversation universily maligned and those in positions of power responsible for knowingly instituting them propagandizing them or falsifying data for them']


In [8]:
# Vectorizer settings chosen for this dataset.
max_vocab_length = 50000  # upper bound on the vocabulary size
max_length = 25           # tokens kept per tweet (output sequence length)

# Collect the layer options in one place, then build the layer from them.
vectorizer_config = {
    'max_tokens': max_vocab_length,
    'output_mode': 'int',                           # emit integer token ids
    'output_sequence_length': max_length,           # fixed-width output
    'standardize': 'lower_and_strip_punctuation',   # text normalisation
}
text_vectorizer = TextVectorization(**vectorizer_config)

In [9]:
# Fit the text vectorizer to the training data only, so the vocabulary is
# built without looking at the test set.
text_vectorizer.adapt(X_train)

In [10]:
# Replace the raw text of both splits with their integer token sequences.
# NOTE: this overwrites X_train / X_test with their own transformed values,
# so the cell is not idempotent. Re-run the train/test split cell before
# executing it a second time.
X_train, X_test = (text_vectorizer(texts) for texts in (X_train, X_test))

In [11]:
# Inspect the vectorized test set: one row of integer token ids per tweet.
print(X_test)

tf.Tensor(
[[  64    6  200 ...   64    6  200]
 [ 418  747   81 ...    0    0    0]
 [2393 1387   71 ...    0    0    0]
 ...
 [ 381  776  837 ...    0    0    0]
 [  27  106    9 ...  600    2  125]
 [  37   12  183 ...  173    7 3847]], shape=(33583, 25), dtype=int64)


In [12]:
# Persist the vectorized test inputs and the one-hot test labels as .npy
# files so they can be reloaded outside this notebook.
# NOTE: the destinations are absolute, machine-specific paths.
for npy_path, split_array in (
    ('/Users/wangfan/Documents/wukm/Uncertainty Projects/tweets/X_test_tweets.npy', X_test),
    ('/Users/wangfan/Documents/wukm/Uncertainty Projects/tweets/y_test_tweets.npy', y_test),
):
    np.save(npy_path, split_array)

In [16]:
import keras
from keras.models import Sequential
from keras.layers import Dense,Dropout,Flatten,LSTM,BatchNormalization

# LSTM sentiment classifier, declared as a single layer list:
# embedding -> LSTM -> two (Dropout, BatchNorm, Dense) stages
# -> Dropout, BatchNorm -> 2-way softmax.
model = Sequential([
    # Token ids -> 128-dimensional vectors; vocabulary size matches the vectorizer.
    Embedding(max_vocab_length, 128),

    # Recurrent encoder with input/recurrent dropout and an L1 penalty on the
    # input kernel.
    LSTM(
        units=64,
        dropout=0.2,
        recurrent_dropout=0.2,
        kernel_regularizer=keras.regularizers.l1(0.001),
    ),
    Dropout(0.5),
    BatchNormalization(),

    Dense(64, activation='relu'),
    Dropout(0.5),
    BatchNormalization(),

    Dense(32, activation='relu'),
    Dropout(0.5),
    BatchNormalization(),

    # One output unit per sentiment class, matching the one-hot labels.
    Dense(2, activation='softmax'),
])

model.summary()
print("")




In [17]:
from keras.optimizers import SGD

# Early stopping watches the validation loss, which is available because fit()
# below holds out 20% of the training data via validation_split. Monitoring
# the training loss (as before) cannot detect overfitting, since training loss
# normally keeps falling while validation loss rises.
# NOTE: with epochs=3 and patience=3 the callback cannot actually fire; it
# only becomes effective once `epochs` is raised.
callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

# Define the training setup.
# NOTE: this SGD optimizer is constructed but is NOT passed to compile() below;
# the model is trained with Adam (default settings). The object is kept only so
# the notebook's global names stay unchanged. Pass `optimizer=sgd` to use it.
sgd = SGD(learning_rate=0.001, momentum=0.95)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model; the returned History object holds per-epoch metrics.
train_history = model.fit(
    x=X_train,
    y=y_train,
    validation_split=0.2,
    callbacks=[callback],
    epochs=3,
    batch_size=128,
    verbose=1,
)

Epoch 1/3
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 38ms/step - accuracy: 0.5231 - loss: 1.5006 - val_accuracy: 0.8352 - val_loss: 0.4414
Epoch 2/3
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 34ms/step - accuracy: 0.8497 - loss: 0.4165 - val_accuracy: 0.8589 - val_loss: 0.3561
Epoch 3/3
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 36ms/step - accuracy: 0.9020 - loss: 0.2988 - val_accuracy: 0.8624 - val_loss: 0.3491


In [18]:
# Evaluate on the held-out test set; `score` holds the loss followed by the
# compiled metric (accuracy).
score = model.evaluate(X_test, y_test, verbose=0)
for label, value in zip(("Test loss:", "Test accuracy:"), score):
    print(label, value)

Test loss: 0.35166338086128235
Test accuracy: 0.8578745126724243


In [19]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Class probabilities predicted for every test sample.
y_pred = model.predict(X_test)

# Collapse both the predicted probabilities and the one-hot ground truth to
# class indices by taking the position of the largest value in each row.
y_pred_classes = y_pred.argmax(axis=1)
y_test_classes = y_test.argmax(axis=1)

# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test_classes, y_pred_classes)

# Show the heading and the matrix on separate lines.
print("Confusion Matrix:", conf_matrix, sep="\n")

[1m1050/1050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step
Confusion Matrix:
[[14776  1601]
 [ 3172 14034]]


In [20]:
# Persist the trained model to disk in the .keras format.
# NOTE: the destination is an absolute, machine-specific path.
model.save('/Users/wangfan/Documents/wukm/Uncertainty Projects/tweets/tweets.keras')