In [None]:
!pip install nltk

In [None]:
import os
import re
from functools import lru_cache

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


In [None]:
# Location of the Twitter sentiment CSV. The original machine-specific path is
# kept as the default so existing runs behave exactly as before; set the
# TWITTER_DATA_CSV environment variable to run the notebook on another machine.
DATA_PATH = os.environ.get(
    "TWITTER_DATA_CSV",
    r"C:\Users\sujee\OneDrive\Desktop\Dataset\Twitter_Data.csv",
)
twit = pd.read_csv(DATA_PATH)
twit.head()

In [None]:
# Change dependent variable to categorical: swap the numeric sentiment codes
# for readable class names
twit['category'] = twit['category'].map({
    -1.0: 'Negative',
    0.0: 'Neutral',
    1.0: 'Positive',
})

In [None]:
# Preview the first rows to confirm the numeric codes were replaced by labels
twit.head()

In [None]:
# Drop null/missing values: removes, in place, every row that has a missing
# value in any column
twit.dropna(inplace=True)

In [None]:
# Count the missing values left in each column (all counts should now be 0)
twit.isnull().sum()

In [None]:
# Text cleaning function
@lru_cache(maxsize=None)
def _english_stop_words():
    """Return the NLTK English stopwords as a frozenset, loaded only once.

    Loading is deferred to the first call so this cell can be executed before
    the stopwords corpus has been downloaded.
    """
    return frozenset(stopwords.words('english'))


def clean_text(text):
    """Normalise one piece of text for tokenisation.

    Steps: delete every character that is not an ASCII letter, digit or
    whitespace, lowercase the result, tokenise it with ``word_tokenize`` and
    drop English stopwords.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        nothing is left.
    """
    # Keeping only [a-zA-Z0-9] and whitespace already strips all punctuation
    # (and underscores), so no separate punctuation pass is required.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()
    # The stopword set is cached: rebuilding it from the corpus on every call
    # is needlessly slow when this function is applied to a whole column.
    stop_words = _english_stop_words()
    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)

In [None]:
# Download the NLTK stopword corpus that clean_text reads via stopwords.words('english')
nltk.download('stopwords')

In [None]:
# word_tokenize depends on NLTK's Punkt tokenizer data. Older NLTK releases
# load it from the 'punkt' package, while NLTK 3.9+ loads 'punkt_tab' and
# raises LookupError without it, so fetch both to support either version.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Apply text cleaning: run clean_text() on every value of the 'clean_text'
# column and store the result in a new 'cleaned_text' column.
# The column 'clean_text' and the function clean_text merely share a name.
twit['cleaned_text'] = twit['clean_text'].apply(clean_text)

In [None]:
# Remove the original 'clean_text' column now that 'cleaned_text' holds the
# processed text.
# NOTE(review): the drop is in place, so re-running the cleaning cell after
# this one fails with a KeyError because 'clean_text' no longer exists.
twit.drop('clean_text', axis=1, inplace=True)

In [None]:
# Word count of every cleaned tweet (number of whitespace-separated tokens)
twit['sentence_length'] = twit['cleaned_text'].str.split().str.len()

In [None]:
# Inspect the frame with the new 'cleaned_text' and 'sentence_length' columns
twit.head()

In [None]:
# Keep only the modelling columns, ordered as text, length, label
twit = twit.loc[:, ['cleaned_text', 'sentence_length', 'category']]
twit.head()

In [None]:
# Split data into the independent variable X (cleaned tweet text, the model
# input) and the dependent variable y (sentiment category, the target)
X = twit['cleaned_text']
y = twit['category']

In [None]:
# Integer-encode each sentence: fit a vocabulary on the texts, then replace
# every word by its vocabulary index (this yields index sequences, not
# one-hot vectors).
# NOTE(review): the vocabulary is fitted on all rows, before the train/test
# split further down — consider fitting on the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_encoded = tokenizer.texts_to_sequences(X)

In [None]:
# Add padding from the front side (padding='pre') so that every sequence has
# the length of the longest cleaned tweet
max_sequence_length = max(twit['sentence_length'])
X_padded = pad_sequences(X_encoded, maxlen=max_sequence_length, padding='pre')

In [None]:
# Build LSTM model: hyperparameters
# One slot per word in the tokenizer vocabulary, plus one because Keras word
# indices start at 1 and index 0 is the padding value
vocabulary_size = len(tokenizer.word_index) + 1
# Size of each learned word vector (output dimension of the Embedding layer)
embedding_dim = 128
# Number of units in the LSTM layer
lstm_units = 64

In [None]:
# Network: Embedding -> LSTM -> Dropout -> 3-way softmax classifier
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(
    input_dim=vocabulary_size,
    output_dim=embedding_dim,
    input_length=max_sequence_length,
))
model.add(tf.keras.layers.LSTM(units=lstm_units))
model.add(tf.keras.layers.Dropout(rate=0.2))
model.add(tf.keras.layers.Dense(units=3, activation='softmax'))

In [None]:
# Compile the model: categorical cross-entropy loss (the targets passed to
# fit() are one-hot encoded), Adam optimizer, accuracy as the reported metric
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Do dummy variable creation for the dependent variable:
# LabelEncoder turns the string labels into integer class ids, and
# to_categorical expands those ids into one-hot rows
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = tf.keras.utils.to_categorical(y_encoded)

In [None]:
# Split the data into train and test sets: 20% of the rows are held out for
# testing, with a fixed random_state so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_categorical, test_size=0.2, random_state=42)

In [None]:
# Train the model for 10 epochs with mini-batches of 64 samples.
# NOTE(review): the test split doubles as validation data here, so the
# validation metrics are computed on the same rows as the final report.
training = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

In [None]:
# Make predictions on the test set; each row holds the softmax output for
# the 3 classes
y_pred_prob = model.predict(X_test)

In [None]:
# Turn the class probabilities into hard one-hot predictions by selecting the
# most probable class of each row. Rounding the probabilities instead gives an
# all-zero row whenever no class exceeds 0.5, and an argmax over such a row
# silently reports class index 0 regardless of the actual probabilities.
y_pred = np.eye(y_pred_prob.shape[1], dtype=y_pred_prob.dtype)[np.argmax(y_pred_prob, axis=1)]

In [None]:
# Map the one-hot rows (predicted first, then true) back to the original
# string labels via the index of their largest entry
y_pred_category, y_true_category = (
    label_encoder.inverse_transform(np.argmax(one_hot, axis=1))
    for one_hot in (y_pred, y_test)
)

In [None]:
# Print classification report (per-class precision, recall, F1 and support)
# comparing the true and predicted labels of the test set
print(classification_report(y_true_category, y_pred_category))