In [31]:
# Data Preprocessing
import pandas as pd
import numpy as np
import chardet
import nltk
import keras
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [13]:
# Read the CSV file with the detected encoding
data = pd.read_csv('training_data.csv', header=None)
data.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [14]:
#Rename the columns using the rename method
data.rename(columns={
    0: 'target',
    1: 'ids',
    2: 'date', 
    3: 'flag', 
    4: 'user', 
    5: 'text'
}, inplace=True)

In [15]:
data.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [16]:
data['target'].value_counts()

0    800000
4    800000
Name: target, dtype: int64

In [17]:
data.columns

Index(['target', 'ids', 'date', 'flag', 'user', 'text'], dtype='object')

In [19]:
data = data.sample(frac=1, random_state=42)

In [20]:
#Preprocess for tokenization
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text.lower())
    # Remove stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
    return ' '.join(tokens)

In [21]:
data['processed_text'] = data['text'].apply(preprocess_text)

In [1]:
# Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['processed_text'])
sequences = tokenizer.texts_to_sequences(data['processed_text'])
X = pad_sequences(sequences)


# Binary classification 
# Convert 0 and 4 to 0 and 1
y = (data['target'] == 4).astype(int)

print(y.value_counts()) 

# Here is where we would have exported a csv to showcase pre-processing,
# but the dataset is too large and LFS for the account is maxed out.

NameError: name 'Tokenizer' is not defined

In [39]:
# Build model
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128),
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [40]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

In [41]:
#Adding patience parameter
from keras.callbacks import EarlyStopping
callback = EarlyStopping(monitor='loss', patience=3)

In [45]:
type(X_train)
len(X_train)

1280000

In [46]:
# Partial train for the model to verify function.
model.fit(X_train[:1000], y_train[:1000], callbacks=[callback], epochs=2, batch_size=32, validation_data=(X_test[:1000], y_test[:1000]))

Epoch 1/2
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 373ms/step - accuracy: 0.4869 - loss: 0.6949 - val_accuracy: 0.4850 - val_loss: 0.6948
Epoch 2/2
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 359ms/step - accuracy: 0.5424 - loss: 0.6907 - val_accuracy: 0.5540 - val_loss: 0.6837


<keras.src.callbacks.history.History at 0x2700aa36440>

In [None]:
# Train the complete model.
model.fit(X_train, y_train, callbacks=[callback], epochs=2, batch_size=32, validation_data=(X_test, y_test))

In [19]:
model.save('./FirstRun.keras')  # The file needs to end with the .keras extension

In [28]:
from pathlib import Path
import tensorflow as tf

# Set the model's file path
file_path = Path("FirstRun.keras")

# Load the model to a new object
nn_import = tf.keras.models.load_model(file_path)

In [34]:
nn_import.summary()

In [48]:
print(np.unique(y_pred))  # Print unique values in predictions

[0.40145975 0.41418007 0.4163262  ... 0.5435873  0.5444033  0.56531656]


In [29]:
# Evaluation
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_test, y_pred))

# Application (simple example)
def predict_polarity(text):
    processed = preprocess_text(text)
    sequence = tokenizer.texts_to_sequences([processed])
    padded = pad_sequences(sequence, maxlen=X.shape[1])
    prediction = model.predict(padded)[0][0]
    return "Most Likely Negative" if prediction > 0.5 else "Most Likely Positive"


[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 4ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00  159989.0
         1.0       0.00      0.00      0.00       0.0
         4.0       0.00      0.00      0.00  160011.0

    accuracy                           0.00  320000.0
   macro avg       0.00      0.00      0.00  320000.0
weighted avg       0.00      0.00      0.00  320000.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [50]:
print(classification_report(y_test, y_pred > 0.5))

              precision    recall  f1-score   support

           0       0.52      0.97      0.67    159989
           1       0.75      0.10      0.18    160011

    accuracy                           0.53    320000
   macro avg       0.63      0.53      0.43    320000
weighted avg       0.63      0.53      0.43    320000



In [52]:
# Test the application
test_text = "I've been feeling really down lately and I don't know what to do."
print(predict_polarity(test_text))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Most Likely Negative
