<a href="https://colab.research.google.com/github/kamijoseph/Twitter-Sentiment-Analysis/blob/main/DL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# deep learning variation of the twitter sentiment analysis

## dataset

In [2]:
!pip install kaggle



In [3]:
!pip install gensim



In [4]:
# configuring the path of kaggle.json file
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
# api to fetch the datasset from kaggle
!kaggle datasets download kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)


In [6]:
# extracting the data
from zipfile import ZipFile
dataset = "/content/sentiment140.zip"
with ZipFile(dataset, "r") as zip:
  zip.extractall()
  print("the dataset is extracted")

the dataset is extracted


In [25]:
# depencies
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.preprocessing.text import text_to_word_sequence

In [8]:
# downloading nltk resources
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# loading the dataset
column_names = ["target", "id", "date", "flag", "user", "text"]
data = pd.read_csv("/content/training.1600000.processed.noemoticon.csv", names=column_names, encoding="ISO-8859-1")
data.shape

(1600000, 6)

In [10]:
data.head(10)

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


In [11]:
# missing values
data.isnull().sum()

Unnamed: 0,0
target,0
id,0
date,0
flag,0
user,0
text,0


In [12]:
data.duplicated().sum()

0

In [13]:
data["target"].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


In [14]:
# replacing positive target (4) to 1
data["target"] = data["target"].replace(4, 1)
data["target"].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
1,800000


## preprocessing

In [15]:
# feature text and label target only
data = data[["text", "target"]]
data.shape

(1600000, 2)

In [16]:
data.head(10)

Unnamed: 0,text,target
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0
5,@Kwesidei not the whole crew,0
6,Need a hug,0
7,@LOLTrish hey long time no see! Yes.. Rains a...,0
8,@Tatiana_K nope they didn't have it,0
9,@twittera que me muera ?,0


In [17]:
# english stop words
en_stopwords = stopwords.words("english")
print(en_stopwords)


['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [18]:
len(en_stopwords)

198

In [19]:
# lemmatization
lemmatizer = WordNetLemmatizer()
stop_words = set(en_stopwords)

In [20]:
# tweet preprocesser function
def preprocess_tweet(text):
  text = text.lower()
  text = re.sub(r"http\S+|www\S+|http\S+", "URL", text)
  text = re.sub(r"@\w+", "USER", text)
  text = re.sub(r"[^a-zA-Z\s]", "", text)
  tokens = text_to_word_sequence(text)
  tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words]
  return tokens

In [21]:
# applying preprocessing
data["tokens"] = data["text"].apply(preprocess_tweet)

## training the word2vec

In [22]:
data["tokens"].head()

Unnamed: 0,tokens
0,"[user, url, aurl, thats, bummer, shoulda, got,..."
1,"[upset, cant, update, facebook, texting, might..."
2,"[user, dived, many, time, ball, managed, save,..."
3,"[whole, body, feel, itchy, like, fire]"
4,"[user, behaving, im, mad, cant, see]"


In [23]:
# cores
import multiprocessing

num_cores = multiprocessing.cpu_count()
print("available cores: ", num_cores)

available cores:  2


In [27]:
tokens_list = data["tokens"].tolist()
w2v_model = Word2Vec(
    sentences = tokens_list,
    vector_size = 100,
    window = 5,
    min_count = 5,
    workers = num_cores
)
w2v_model.save("tweets_word2vec.model")
vector_size = w2v_model.vector_size