# `SENTIMENT ANALYSIS`

#### `IMPORTING LIBRARIES`

In [20]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
import pickle
from nltk.corpus import stopwords
from textblob import Word
from keras.models import load_model

### `IMPORTING DATASET`

In [2]:
# Read the tweet CSV with explicit column names, then keep only the sentiment
# label and the tweet text.
column_names=["label", "ids", "date", "flag", "user", "tweet"]
ds=pd.read_csv("tweet_train.csv",encoding="ISO-8859-1",names=column_names)
ds=ds.drop(columns=["ids","date","flag","user"])

In [3]:
# Take the first 3000 rows of each class.
# NOTE: the variable names are the reverse of what they hold. Label 0 is the class this
# notebook later reports as "negative", and label 4 (recoded to 1 further down) is the
# one reported as "positive". The names are kept because later cells reference them.
# .copy() detaches each subset from `ds`, so later column assignments modify a real
# frame rather than a slice of `ds` (avoids pandas' SettingWithCopyWarning).
positive=ds[ds["label"]==0][:3000].copy()
neagtive=ds[ds["label"]==4][:3000].copy()

In [4]:
# Print the schema / memory summary of each class subset, with one blank line
# between the two reports.
for position, subset in enumerate((positive, neagtive)):
    if position > 0:
        print()
    subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000 entries, 0 to 2999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   3000 non-null   int64 
 1   tweet   3000 non-null   object
dtypes: int64(1), object(1)
memory usage: 70.3+ KB

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000 entries, 800000 to 802999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   3000 non-null   int64 
 1   tweet   3000 non-null   object
dtypes: int64(1), object(1)
memory usage: 70.3+ KB


In [5]:
# Preview the label-0 subset (the notebook renders a truncated view of the frame).
positive

Unnamed: 0,label,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
2995,0,@ZoeAimee I'm trying to research some dude for...
2996,0,"just got up, pshhh going on the trampoline, ev..."
2997,0,Homework
2998,0,@pmarnandus re: daily gossip.. well the twitte...


In [6]:
# Recode this subset's label from 4 to 1 so the two classes are encoded as 0 and 1.
# assign() builds a new frame instead of writing into a slice of `ds`, which avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
neagtive = neagtive.assign(label=1)
neagtive

Unnamed: 0,label,tweet
800000,1,I LOVE @Health4UandPets u guys r the best!!
800001,1,im meeting up with one of my besties tonight! ...
800002,1,"@DaRealSunisaKim Thanks for the Twitter add, S..."
800003,1,Being sick can be really cheap when it hurts t...
800004,1,@LovesBrooklyn2 he has that effect on everyone
...,...,...
802995,1,@lollyjay Love the Snoopy background
802996,1,@scenariogirl I know one dog friendly office ...
802997,1,got the magic numbers and is thanking y'all fo...
802998,1,new to tweetar


In [7]:
# Stack both class subsets and shuffle the rows.
# The fixed seed makes the shuffle, and everything derived from the row order,
# reproducible across "Restart Kernel -> Run All".
dataset=pd.concat([positive,neagtive])
dataset=dataset.sample(frac=1,random_state=42)
dataset

Unnamed: 0,label,tweet
801946,1,"ok, it's officially my birthday! crack a beer..."
2699,0,Literally spitting blood.
2551,0,MY FAST REMAINS UN BROKEN
802637,1,@dagfinnr any reason in particular?
801834,1,listening to songs
...,...,...
802034,1,"Putting my RESUME in at foodworks tomorrow, om..."
802003,1,"Wooohooo, amazon lowering mp3 prices. Fuck you..."
800338,1,@Laatina MONDAYS ARE WHAT YOU MAKE OF IT.
2776,0,My personal Web site was hacked. What shoul...


### `PRE-PROCESSING`

##### `PRE PROCESSING VARIABLE SETTING`

In [8]:
# Replace @mentions, URLs and every run of non-alphanumeric characters with a space.
# The pattern is a raw string so that `\S` reaches the regex engine as written instead
# of being an invalid Python string escape (which triggers a warning on current Pythons).
# NOTE: the `http?:\S` alternative can only fire on "htt:x"; anything starting with
# "http:" or "https:" is already consumed by `https?:\S+`. It is kept unchanged so this
# cleaning stays identical to the one performed in `processing`.
tweet_noise = re.compile(r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+")
dataset['tweet'] = dataset['tweet'].apply(lambda x: tweet_noise.sub(' ', x))

In [9]:
# Inspect the tweets after the regex clean-up.
dataset

Unnamed: 0,label,tweet
801946,1,ok it s officially my birthday crack a beer if...
2699,0,Literally spitting blood
2551,0,MY FAST REMAINS UN BROKEN
802637,1,any reason in particular
801834,1,listening to songs
...,...,...
802034,1,Putting my RESUME in at foodworks tomorrow omg...
802003,1,Wooohooo amazon lowering mp3 prices Fuck you i...
800338,1,MONDAYS ARE WHAT YOU MAKE OF IT
2776,0,My personal Web site was hacked What should I do


In [10]:
# Drop English stop words from every tweet.
# The stop-word list is built once and stored in a set: the previous version called
# stopwords.words("english") again for every single token and searched the resulting
# list linearly.
# NOTE: matching is case-sensitive, so capitalised stop words ("MY", "What") survive.
english_stop_words = set(stopwords.words("english"))
dataset["tweet"]=dataset["tweet"].apply(lambda x: " ".join([w for w in x.split() if w not in english_stop_words]))
dataset

Unnamed: 0,label,tweet
801946,1,ok officially birthday crack beer got one I
2699,0,Literally spitting blood
2551,0,MY FAST REMAINS UN BROKEN
802637,1,reason particular
801834,1,listening songs
...,...,...
802034,1,Putting RESUME foodworks tomorrow omg hope get...
802003,1,Wooohooo amazon lowering mp3 prices Fuck iTunes
800338,1,MONDAYS ARE WHAT YOU MAKE OF IT
2776,0,My personal Web site hacked What I


In [11]:
# Lemmatizing

def _lemmatize_tweet(text):
    """Return `text` with each whitespace-separated word replaced by `Word(...).lemmatize()`."""
    return " ".join(Word(token).lemmatize() for token in text.split())

dataset['tweet']=dataset['tweet'].apply(_lemmatize_tweet)
dataset

Unnamed: 0,label,tweet
801946,1,ok officially birthday crack beer got one I
2699,0,Literally spitting blood
2551,0,MY FAST REMAINS UN BROKEN
802637,1,reason particular
801834,1,listening song
...,...,...
802034,1,Putting RESUME foodworks tomorrow omg hope get...
802003,1,Wooohooo amazon lowering mp3 price Fuck iTunes
800338,1,MONDAYS ARE WHAT YOU MAKE OF IT
2776,0,My personal Web site hacked What I


### `SPLITTING DATA FOR TRAINING AND TESTING`

#### `LSTM MODEL`

In [12]:
# Vocabulary size handed to the Tokenizer; the Embedding layer reuses it as its input dimension.
max_fatures = 2000
tweet_texts = dataset['tweet'].values

# Fit the tokenizer on the cleaned tweets.
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(tweet_texts)

# Integer-encode every tweet and pad the sequences into one 2-D array.
X = pad_sequences(tokenizer.texts_to_sequences(tweet_texts))

In [13]:
embed_dim = 128
lstm_out = 196

# Embedding -> spatial dropout -> LSTM -> 2-way softmax classifier.
model = Sequential()
# The input length follows the width of the padded training matrix X.
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# appended a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 27, 128)           256000    
                                                                 
 spatial_dropout1d (SpatialD  (None, 27, 128)          0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 196)               254800    
                                                                 
 dense (Dense)               (None, 2)                 394       
                                                                 
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
# Persist the fitted tokenizer to disk, then read it straight back from the same file.
tokenizer_path = 'tokenizer.pickle'

with open(tokenizer_path, 'wb') as out_file:
    pickle.dump(tokenizer, out_file, protocol=pickle.HIGHEST_PROTOCOL)

with open(tokenizer_path, 'rb') as in_file:
    tokenizer = pickle.load(in_file)

In [15]:
# train_test_split is already imported in the notebook's import cell, so the
# duplicate import that used to sit here was dropped.
# One-hot encode the labels: column 0 is label 0, column 1 is label 1.
# The explicit numeric dtype keeps the targets usable by Keras on pandas versions
# whose get_dummies returns boolean columns by default.
Y=pd.get_dummies(dataset["label"],dtype="float32").values
# Hold out 10% of the rows for testing; the seed keeps the split repeatable.
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.1,random_state=42)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(5400, 27) (5400, 2)
(600, 27) (600, 2)


In [16]:
# Train on the training split for 10 epochs in mini-batches of `batch_size`.
batch_size = 32
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=10,
    verbose=2,
)

Epoch 1/10
169/169 - 39s - loss: 0.6458 - accuracy: 0.6122 - 39s/epoch - 228ms/step
Epoch 2/10
169/169 - 35s - loss: 0.5036 - accuracy: 0.7537 - 35s/epoch - 208ms/step
Epoch 3/10
169/169 - 31s - loss: 0.4431 - accuracy: 0.7985 - 31s/epoch - 184ms/step
Epoch 4/10
169/169 - 22s - loss: 0.3978 - accuracy: 0.8180 - 22s/epoch - 130ms/step
Epoch 5/10
169/169 - 22s - loss: 0.3565 - accuracy: 0.8398 - 22s/epoch - 129ms/step
Epoch 6/10
169/169 - 24s - loss: 0.3115 - accuracy: 0.8650 - 24s/epoch - 143ms/step
Epoch 7/10
169/169 - 22s - loss: 0.2865 - accuracy: 0.8770 - 22s/epoch - 131ms/step
Epoch 8/10
169/169 - 21s - loss: 0.2603 - accuracy: 0.8904 - 21s/epoch - 125ms/step
Epoch 9/10
169/169 - 23s - loss: 0.2259 - accuracy: 0.9057 - 23s/epoch - 138ms/step
Epoch 10/10
169/169 - 23s - loss: 0.2118 - accuracy: 0.9141 - 23s/epoch - 135ms/step


<keras.callbacks.History at 0x2948935cfa0>

In [17]:
twt = ['Dear All,Kindly carry pendrive for your tomorrows EDI presentation.Thank you..']
# NOTE: the text is fed to the tokenizer as-is; it does not go through the regex /
# stop-word / lemmatization steps that were applied to the training tweets.
# Vectorize the tweet with the tokenizer fitted on the training tweets.
twt = tokenizer.texts_to_sequences(twt)
# Pad to the width of the training matrix (the model's input length) instead of a
# hard-coded 27, so the shapes stay in sync if the data changes.
twt = pad_sequences(twt, maxlen=X.shape[1], dtype='int32', value=0)
print(twt)
sentiment = model.predict(twt,batch_size=1,verbose = 2)[0]
# Output column i corresponds to label i of the one-hot targets:
# 0 -> negative, 1 -> positive. With two outputs argmax is always 0 or 1.
print(("negative", "positive")[int(np.argmax(sentiment))])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0  500  300  455  431 1313  120   43]]
1/1 - 0s - 221ms/epoch - 221ms/step
positive


In [18]:
# Score the held-out split; the model was compiled with one loss and the
# 'accuracy' metric, so the result is [loss, accuracy].
model.evaluate(x=X_test, y=y_test)



[1.0306850671768188, 0.6483333110809326]

In [22]:
# Load a saved Keras model from disk into `m1`.
# NOTE(review): no visible cell writes "lstm_model.h5" (the only save call is commented
# out and targets "lstm2_model.h5"), so this presumably relies on a file produced by an
# earlier run — confirm it exists before Restart & Run All.
m1=load_model("lstm_model.h5")



In [24]:
def processing(text):
    """Clean one raw tweet with the same steps used on the training tweets.

    Steps, in order:
      1. replace @mentions, URLs and runs of non-alphanumeric characters with a space;
      2. drop English stop words (case-sensitive match, so "MY" or "What" are kept);
      3. lemmatize every remaining word with TextBlob's ``Word.lemmatize``.

    Args:
        text: the raw tweet as a string.

    Returns:
        The cleaned tweet, words joined by single spaces.
    """
    # Raw string: `\S` must reach the regex engine, not Python's string-escape parser.
    text=re.sub(r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+",' ',text)
    # Build the stop-word set once per call rather than once per token.
    stop_words=set(stopwords.words("english"))
    # Filter and lemmatize in a single pass over the tokens.
    return " ".join(Word(w).lemmatize() for w in text.split() if w not in stop_words)

In [25]:
def a1(twt, maxlen=28):
    """Turn raw text into a padded integer matrix using the fitted tokenizer.

    Args:
        twt: a list of tweet strings, or a single string (wrapped into a
            one-element list so it is not tokenized character by character).
        maxlen: width to pad/truncate every sequence to. Defaults to 28, the
            value previously hard-coded here. NOTE: the model built in this
            notebook has an input length of 27 (see its summary), so pass the
            input length of whichever model will consume the result.

    Returns:
        An int32 array of shape (number of texts, maxlen), padded with 0.
    """
    if isinstance(twt, str):
        twt = [twt]
    # Vectorize the text with the pre-fitted tokenizer instance.
    twt = tokenizer.texts_to_sequences(twt)
    # Pad so every row has exactly `maxlen` entries.
    twt = pad_sequences(twt, maxlen=maxlen, dtype='int32', value=0)
    return twt

SyntaxError: invalid syntax (1258052265.py, line 1)

In [19]:
# model.save("lstm2_model.h5")