# `SENTIMENTAL ANALYSIS`

#### `IMPORTIN LIBRARIES`

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer
import gensim
import re
import pickle
import itertools
import string
import tensorflow as tf

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### `IMPORTING DATASET`

In [3]:
dataset_COL=["target","ids","date","flag","user","text"]

In [4]:
final_dataset=pd.read_csv('train.csv')

In [5]:
final_dataset.head(10)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before the...
6,7,0,@user camping tomorrow @user @user @user @use...
7,8,0,the next school year is the year for exams.ð...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...
9,10,0,@user @user welcome here ! i'm it's so #gr...


In [6]:
final_dataset.tail()

Unnamed: 0,id,label,tweet
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."
31961,31962,0,thank you @user for you follow


In [7]:
final_dataset=final_dataset.drop(labels='id',axis=1)

In [8]:
cols=final_dataset.columns.tolist()

In [9]:
cols

['label', 'tweet']

In [10]:
cols=['tweet','label']

In [11]:
final_dataset=final_dataset[cols]

In [12]:
final_dataset.head(20)

Unnamed: 0,tweet,label
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0
5,[2/2] huge fan fare and big talking before the...,0
6,@user camping tomorrow @user @user @user @use...,0
7,the next school year is the year for exams.ð...,0
8,we won!!! love the land!!! #allin #cavs #champ...,0
9,@user @user welcome here ! i'm it's so #gr...,0


### `PRE-PROCESSING`

##### `PRE PROCESSING VARIABLE SETTING`

In [13]:
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
stop_words=stopwords.words('english')
stemmer=SnowballStemmer('english')

In [14]:
def process_tweet(text,stem=False):
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    words = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                words.append(stemmer.stem(token))
            else:
                words.append(token)
    return words
  

In [15]:
final_dataset.tweet=final_dataset.tweet.apply(lambda x: process_tweet(x))

In [16]:
final_dataset.head()

Unnamed: 0,tweet,label
0,"[user, father, dysfunctional, selfish, drags, ...",0
1,"[user, thanks, lyft, credit, use, cause, offer...",0
2,"[bihday, majesty]",0
3,"[model, love, u, take, u, time, ur]",0
4,"[factsguide, society, motivation]",0


### `SPLITTING DATA FOR TRAINING AND TESTING`

In [17]:
from sklearn.model_selection import train_test_split
x=final_dataset['tweet']
y=final_dataset['label']
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=1,test_size=0.2)

In [18]:
y_train

2051     0
20151    0
6595     0
8676     0
13588    0
        ..
17289    0
5192     0
12172    0
235      0
29733    0
Name: label, Length: 25569, dtype: int64

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
vocabulary_size=1000
count_vector=CountVectorizer(max_features=vocabulary_size,preprocessor=lambda x:x,tokenizer=lambda x:x)
x_train=count_vector.fit_transform(x_train).toarray()
x_test=count_vector.fit_transform(x_test).toarray()

In [20]:
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
max_words = 1000
max_len=50

def tokenize_pad_sequences(text):
    tokenizer = Tokenizer(num_words=max_words, lower=True, split=' ')
    tokenizer.fit_on_texts(text)
    X = tokenizer.texts_to_sequences(text)
    X = pad_sequences(X, padding='post', maxlen=max_len)
    return X, tokenizer

print('Before Tokenization & Padding \n',final_dataset['tweet'])
X, tokenizer = tokenize_pad_sequences(final_dataset['tweet'])
print('After Tokenization & Padding \n', X[0])

Before Tokenization & Padding 
 0        [user, father, dysfunctional, selfish, drags, ...
1        [user, thanks, lyft, credit, use, cause, offer...
2                                        [bihday, majesty]
3                      [model, love, u, take, u, time, ur]
4                        [factsguide, society, motivation]
                               ...                        
31957                              [ate, user, isz, youuu]
31958    [see, nina, turner, airwaves, trying, wrap, ma...
31959    [listening, sad, songs, monday, morning, otw, ...
31960    [sikh, temple, vandalised, calgary, wso, conde...
31961                                [thank, user, follow]
Name: tweet, Length: 31962, dtype: object
After Tokenization & Padding 
 [  1  22 163 350   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0]


In [21]:
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [22]:
print(x_train.shape,y_train.shape)

(25569, 1000) (25569,)


In [23]:
print(x_train)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


#### `LSTM MODEL`

In [24]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout
from keras.metrics import Precision, Recall
from tensorflow.keras.optimizers import SGD
import keras.backend as K

In [25]:
vocab_size = 5000
embedding_size = 32
epochs=20
lr = 0.1
decay_rate = lr / epochs
momentum = 0.8

sgd = SGD(learning_rate=lr, momentum=momentum, decay=decay_rate, nesterov=False)
model= Sequential()
model.add(Embedding(vocab_size, embedding_size, input_length=max_words))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Bidirectional(tf.compat.v1.keras.layers.CuDNNLSTM(32)))
model.add(Dropout(0.2))
model.add(Dense(1, activation='softmax'))




In [26]:
print(model.summary())

model.compile(loss='categorical_crossentropy', optimizer=sgd, 
               metrics=['accuracy', Precision(), Recall()])


history = model.fit(x_train, y_train,
                      validation_data=(x_test, y_test),
                      batch_size=32, epochs=30, verbose=1)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1000, 32)          160000    
                                                                 
 conv1d (Conv1D)             (None, 1000, 32)          3104      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 500, 32)          0         
 )                                                               
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16896     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 1)                 6

InvalidArgumentError: No OpKernel was registered to support Op 'CudnnRNNV2' used by {{node sequential/bidirectional/forward_cu_dnnlstm/CudnnRNNV2}} with these attrs: [seed=0, dropout=0, input_mode="linear_input", T=DT_FLOAT, direction="unidirectional", rnn_mode="lstm", is_training=true, seed2=0]
Registered devices: [CPU]
Registered kernels:
  device='GPU'; T in [DT_HALF]
  device='GPU'; T in [DT_FLOAT]
  device='GPU'; T in [DT_DOUBLE]

	 [[sequential/bidirectional/forward_cu_dnnlstm/CudnnRNNV2]] [Op:__inference_train_function_2906]

In [1]:
!pip list

Package                      Version
---------------------------- -------------------
absl-py                      1.0.0
asttokens                    2.0.5
astunparse                   1.6.3
backcall                     0.2.0
beautifulsoup4               4.11.1
bs4                          0.0.1
cachetools                   5.0.0
certifi                      2021.10.8
charset-normalizer           2.0.12
click                        8.1.2
colorama                     0.4.4
configparser                 5.2.0
cycler                       0.11.0
Cython                       0.29.23
debugpy                      1.6.0
decorator                    5.1.1
entrypoints                  0.4
executing                    0.8.3
flatbuffers                  2.0
fonttools                    4.32.0
gast                         0.5.3
gensim                       4.1.2
google-auth                  2.6.3
google-auth-oauthlib         0.4.6
google-pasta                 0.2.0
grpcio                       1.44