<a href="https://colab.research.google.com/github/koushikjoshi/Political-Sentiment-Analysis/blob/main/twitter_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
import re
from keras.models import Sequential
from sklearn.utils import shuffle
import pickle

In [None]:
# Fetch the NLTK stop-word corpus (no-op when it is already present locally).
nltk.download('stopwords')
# Keep the stop words in a set: preprocess() tests membership once per token,
# and a set lookup is O(1) whereas scanning the original list is O(len(list)).
stop_words = set(stopwords.words("english"))
# English Snowball stemmer, used by preprocess() only when stem=True.
stemmer = SnowballStemmer("english")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Read the raw tweet dump. Column names are supplied explicitly, so pandas
# treats every row of the file as data rather than as a header.
data = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    names=['target', 'id', 'date', 'flag', 'user', 'tweet'],
    encoding="ISO-8859-1",
)

In [None]:
# Keep only the columns used downstream ('target' and 'tweet').
# errors="ignore" makes this cell safe to re-run: the original `del` statements
# raised KeyError the second time because the columns were already gone.
data = data.drop(columns=['id', 'date', 'flag', 'user'], errors="ignore")

In [None]:
# Preview the first five rows of the trimmed frame.
data.head(n=5)

Unnamed: 0,target,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Shuffle the rows with a fixed seed so the row order (and everything derived
# from it) is the same on every Restart & Run All, then renumber the index
# from 0 without keeping the old index as a column.
data = shuffle(data, random_state=42).reset_index(drop=True)

In [None]:
# Matches @mentions, http(s) URLs and any run of non-alphanumeric characters.
# Written as a raw string so that `\S` is not treated as an (invalid) string
# escape; the pattern text itself is unchanged.
# NOTE(review): the third alternative `http?:\S` looks like a typo; for normal
# "http:"/"https:" URLs the second alternative already matches first. It is
# kept as-is so the cleaned text stays identical to what the original produced.
_TEXT_CLEANING_RE = re.compile(r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+")


def preprocess(text, stem=False):
    """Normalise a tweet into a space-separated string of kept tokens.

    The input is converted with ``str()``, lower-cased, and every match of
    the cleaning pattern (mentions, URLs, non-alphanumeric runs) is replaced
    by a single space. The result is split on whitespace and tokens found in
    the module-level ``stop_words`` collection are dropped.

    Args:
        text: The raw tweet; any object is accepted because it is passed
            through ``str()`` first.
        stem: When true, each kept token is passed through the module-level
            ``stemmer.stem``; otherwise tokens are kept unchanged.

    Returns:
        The kept tokens joined by single spaces (an empty string when no
        token survives).
    """
    cleaned = _TEXT_CLEANING_RE.sub(' ', str(text).lower()).strip()
    return " ".join(
        stemmer.stem(token) if stem else token
        for token in cleaned.split()
        if token not in stop_words
    )

In [None]:
# Clean every tweet with preprocess(); stemming stays at its default (off).
X = [preprocess(tweet_text) for tweet_text in data['tweet']]
# Labels are the raw target values divided by 4.
# NOTE(review): presumably targets are 0/4 so this yields 0.0/1.0 - confirm.
y = np.array(data['target'] / 4)

In [None]:
# Hold out 33% of the samples for testing. The fixed random_state makes the
# train/test membership identical on every run, so results can be compared.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
# Sanity check: sizes of the four splits, one per line
# (train texts, test texts, train labels, test labels).
print(len(X_train), len(X_test), len(y_train), len(y_test), sep="\n")

1072000
528000
1072000
528000


In [None]:
# Word-index tokenizer capped at 10000 words; words outside the vocabulary
# are mapped to the '<OOV>' token. It is fitted on the training texts only.
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

In [None]:
# Persist the fitted tokenizer with the highest pickle protocol available.
with open('twitter_tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(
        tokenizer,
        tokenizer_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [None]:
# Convert both text splits to lists of integer word indices using the
# tokenizer fitted above (train first, then test).
train_seq, test_seq = [
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
]

In [None]:
# Bring every sequence to length 200. Sequences longer than that are cut at
# the end (truncating='post'); the padding side is left at the library default.
_pad_options = dict(maxlen=200, truncating='post')
train_pad = pad_sequences(train_seq, **_pad_options)
test_pad = pad_sequences(test_seq, **_pad_options)

In [None]:
# Two stacked bidirectional LSTMs on top of a 64-dimensional embedding,
# followed by a dense hidden layer and a single sigmoid output unit.
# input_dim / input_length match the tokenizer vocabulary cap (10000) and
# the padded sequence length (200) used earlier in the notebook.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=10000, output_dim=64, input_length=200),
    # return_sequences=True so the next recurrent layer receives the full sequence.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit;
# accuracy is tracked as the reporting metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for two epochs. Note that the held-out test split doubles as the
# validation set here, and the same split is passed to model.evaluate later.
model.fit(
    x=train_pad,
    y=y_train,
    epochs=2,
    validation_data=(test_pad, y_test),
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f038ae41fd0>

In [None]:
# Report [loss, accuracy] on the held-out split.
model.evaluate(x=test_pad, y=y_test)



[0.446145623922348, 0.7894905209541321]

In [None]:
# Write the trained model to disk; the .h5 extension selects the HDF5 format.
model.save(filepath='twitter_model.h5')