<a href="https://colab.research.google.com/github/kamijoseph/Twitter-Sentiment-Analysis/blob/main/DL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# deep learning variation of the twitter sentiment analysis

## dataset

In [6]:
#!pip install kaggle

In [7]:
#!pip install gensim

In [8]:
# configuring the path of kaggle.json file
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [9]:
# api to fetch the datasset from kaggle
!kaggle datasets download kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
  0% 0.00/80.9M [00:00<?, ?B/s]
100% 80.9M/80.9M [00:00<00:00, 1.43GB/s]


In [10]:
# extracting the data
from zipfile import ZipFile
dataset = "/content/sentiment140.zip"
with ZipFile(dataset, "r") as zip:
  zip.extractall()
  print("the dataset is extracted")

the dataset is extracted


In [11]:
# depencies
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.preprocessing.text import text_to_word_sequence

In [12]:
# downloading nltk resources
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [13]:
# loading the dataset
column_names = ["target", "id", "date", "flag", "user", "text"]
data = pd.read_csv("/content/training.1600000.processed.noemoticon.csv", names=column_names, encoding="ISO-8859-1")
data.shape

(1600000, 6)

In [14]:
data.head(10)

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


In [15]:
# missing values
data.isnull().sum()

Unnamed: 0,0
target,0
id,0
date,0
flag,0
user,0
text,0


In [16]:
data.duplicated().sum()

0

In [17]:
data["target"].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


In [18]:
# replacing positive target (4) to 1
data["target"] = data["target"].replace(4, 1)
data["target"].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
1,800000


In [19]:
data = data.sample(100_000, random_state=21)

## preprocessing

In [20]:
# feature text and label target only
data = data[["text", "target"]]
data.shape

(100000, 2)

In [21]:
data.head(10)

Unnamed: 0,text,target
1417652,@damnmikeyy Get 100 followers a day using www....,1
1427393,@lizzie_xoxo i knowwww! hahaha alyse has a bo...,1
1232589,@kristenstewart9 http://twitpic.com/6dty8 - Aw...,1
174162,http://twitpic.com/67iab - Rounding bases - sh...,0
190282,would like my jacket back tonight i miss it.,0
953339,Tonight was perfect,1
1330411,IÂ´m in Ibiza!! ItÂ´s bloody lovely too lots ...,1
1108509,getting ready to go to Hannah's Grad party,1
107251,eurovision party fail.,0
752944,"Wait, so Jon &amp; Kate are officially getting...",0


In [22]:
# english stop words
en_stopwords = stopwords.words("english")
print(en_stopwords)


['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [23]:
len(en_stopwords)

198

In [24]:
# lemmatization
lemmatizer = WordNetLemmatizer()
stop_words = set(en_stopwords)

In [25]:
# tweet preprocesser function
def preprocess_tweet(text):
  text = text.lower()
  text = re.sub(r"http\S+|www\S+|http\S+", "URL", text)
  text = re.sub(r"@\w+", "USER", text)
  text = re.sub(r"[^a-zA-Z\s]", "", text)
  tokens = text_to_word_sequence(text)
  tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words]
  return tokens

In [26]:
# applying preprocessing
data["tokens"] = data["text"].apply(preprocess_tweet)

## training the word2vec

In [27]:
data["tokens"].head()

Unnamed: 0,tokens
1417652,"[user, get, follower, day, using, url, add, ev..."
1427393,"[user, knourl, hahaha, alyse, boat, right]"
1232589,"[user, url, awesome, picture, congrats, award,..."
174162,"[url, rounding, base, fast, relay, fast, game]"
190282,"[would, like, jacket, back, tonight, miss]"


In [28]:
# cores
import multiprocessing

num_cores = multiprocessing.cpu_count()
print("available cores: ", num_cores)

available cores:  2


In [29]:
tokens_list = data["tokens"].tolist()
w2v_model = Word2Vec(
    sentences = tokens_list,
    vector_size = 100,
    window = 5,
    min_count = 5,
    workers = num_cores
)
w2v_model.save("tweets_word2vec.model")
vector_size = w2v_model.vector_size

## preparing sequences

In [30]:
max_len = 30

def tweet_to_sequence(tokens, model, max_len):
  seq = []
  for token in tokens:
    if token in model.wv:
      seq.append(model.wv[token])
    else:
      seq.append(np.zeros(vector_size))

  # pad sequences
  if len(seq) < max_len:
    padding = [np.zeros(vector_size)] * (max_len - len(seq))
    seq.extend(padding)
  else:
    seq = seq[:max_len]
  return np.array(seq)

In [31]:
X = np.array(
    [tweet_to_sequence(tokens, w2v_model, max_len) for tokens in data["tokens"]]
)
y = data["target"].values

In [32]:
X.shape

(100000, 30, 100)

## train test splitting

In [33]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.2,
    random_state = 21
)
print(X_train.shape, X_test.shape)

(80000, 30, 100) (20000, 30, 100)


## building the bilstm model

In [34]:
# model
model = tf.keras.Sequential([
    tf.keras.Input(shape=(max_len, vector_size)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

In [41]:
# compiling the model
model.compile(
    loss = "binary_crossentropy",
    optimizer = "adam",
    metrics = ["accuracy"]
)
model.summary()

In [42]:
# early stopping
es = tf.keras.callbacks.EarlyStopping(
    monitor = "val_loss",
    patience = 2,
    restore_best_weights = True
)

In [43]:
# training the model
history = model.fit(
    X_train,
    y_train,
    validation_split = 0.1,
    batch_size = 32,
    epochs = 5,
    callbacks = [es]
)

Epoch 1/5
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m368s[0m 160ms/step - accuracy: 0.7008 - loss: 0.5660 - val_accuracy: 0.7345 - val_loss: 0.5257
Epoch 2/5
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m353s[0m 157ms/step - accuracy: 0.7414 - loss: 0.5197 - val_accuracy: 0.7418 - val_loss: 0.5159
Epoch 3/5
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m335s[0m 149ms/step - accuracy: 0.7525 - loss: 0.5035 - val_accuracy: 0.7531 - val_loss: 0.5041
Epoch 4/5
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m355s[0m 158ms/step - accuracy: 0.7591 - loss: 0.4911 - val_accuracy: 0.7433 - val_loss: 0.5070
Epoch 5/5
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m374s[0m 155ms/step - accuracy: 0.7651 - loss: 0.4801 - val_accuracy: 0.7508 - val_loss: 0.5035


In [44]:
y_pred = (model.predict(X_test) > 0.5).astype(int)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 47ms/step
Accuracy: 0.75395
              precision    recall  f1-score   support

           0       0.76      0.74      0.75      9907
           1       0.75      0.76      0.76     10093

    accuracy                           0.75     20000
   macro avg       0.75      0.75      0.75     20000
weighted avg       0.75      0.75      0.75     20000



In [46]:
model.save("sentiment_bilstm_w2v.keras")
print("Model saved as sentiment_bilstm_w2v.keras")

Model saved as sentiment_bilstm_w2v.keras


# Wrap. Tuning Later