In [60]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [5]:
data = pd.read_csv('twitter_training.csv')

In [6]:
data.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [31]:
x = data.iloc[:,-1].values
y = data.iloc[:,-2].values

In [37]:
print(x)
print(type(x))
print(x.shape)

['I am coming to the borders and I will kill you all,'
 'im getting on borderlands and i will kill you all,'
 'im coming on borderlands and i will murder you all,' ...
 'Just realized the windows partition of my Mac is now 6 years behind on Nvidia drivers and I have no idea how he didn’t notice'
 'Just realized between the windows partition of my Mac is like being 6 years behind on Nvidia drivers and cars I have no fucking idea how I ever didn ’ t notice'
 'Just like the windows partition of my Mac is like 6 years behind on its drivers So you have no idea how I didn’t notice']
<class 'numpy.ndarray'>
(74681,)


In [38]:
print(type(y))
print(y)
print(y.shape)

<class 'numpy.ndarray'>
['Positive' 'Positive' 'Positive' ... 'Positive' 'Positive' 'Positive']
(74681,)


In [None]:
# to see the type of sentiments.....
a = data.iloc[:, -2].value_counts()
print(a)

Positive
Negative      22542
Positive      20831
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64


In [44]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [69]:
for index, class_label in enumerate(label_encoder.classes_):
    print(f"Class: {class_label}, Number: {index}")

Class: Irrelevant, Number: 0
Class: Negative, Number: 1
Class: Neutral, Number: 2
Class: Positive, Number: 3


In [45]:
print(y)

[3 3 3 ... 3 3 3]


In [46]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [64]:
X_train = X_train.astype(str)
X_test = X_test.astype(str)
tokenizer = Tokenizer(num_words=10000)  # Keep top 10,000 words
tokenizer.fit_on_texts(X_train)
print(len(tokenizer.word_index))

32261


In [51]:
X_train_seq = tokenizer.texts_to_sequences(X_train)  # Convert to sequences
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [58]:

X_train_padded = pad_sequences(X_train_seq,120,  padding='post')
X_test_padded = pad_sequences(X_test_seq,120, padding='post')

In [59]:
print(X_test_padded.shape)
print(X_train_padded.shape)

(14937, 120)
(59744, 120)


In [65]:
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=120),
    LSTM(128, return_sequences=True),
    LSTM(64),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Sigmoid for binary classification
])



In [66]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(
    X_train_padded, y_train,
    validation_split=0.2,
    epochs=1,
    batch_size=32
)

[1m1494/1494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 124ms/step - accuracy: 0.2977 - loss: -561.0189 - val_accuracy: 0.2942 - val_loss: -4777.7930


In [70]:
test_loss, test_accuracy = model.evaluate(X_test_padded, y_test)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 34ms/step - accuracy: 0.2993 - loss: -4827.1841
Test Loss: -4756.92724609375, Test Accuracy: 0.29932382702827454


In [77]:
def predict_sentiment(text):
    sequence = tokenizer.texts_to_sequences(text)
    padd = pad_sequences(sequence,120,padding='post')
    predict = model.predict(padd)
    p_class = np.argmax(predict)
    label = ['irrelevant','negative','neutral','positive']
    print(label[p_class])



In [78]:
text = 'Pccoe is the best college in pune'
predict_sentiment(text)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
irrelevant
