
## **Case Study 04 Machine Learning to predict public sentiment from text data.**

In [11]:
import numpy as np
import pandas as pd

In [12]:
data=pd.read_csv('/content/judge-1377884607_tweet_product_company.csv',encoding='ISO-8859-1')

In [13]:
data.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [14]:
data=data[['tweet_text','is_there_an_emotion_directed_at_a_brand_or_product']]

In [15]:
data.head()

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion


In [16]:
data.is_there_an_emotion_directed_at_a_brand_or_product.value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [17]:
data[data['is_there_an_emotion_directed_at_a_brand_or_product']=='No emotion toward brand or product']='neutral'
data[data['is_there_an_emotion_directed_at_a_brand_or_product']=="I can't tell"]='no_idea'
data[data['is_there_an_emotion_directed_at_a_brand_or_product']=='Positive emotion']='positive'
data[data['is_there_an_emotion_directed_at_a_brand_or_product']=='Negative emotion']='negative'

In [18]:
data.is_there_an_emotion_directed_at_a_brand_or_product.value_counts()

neutral     5389
positive    2978
negative     570
no_idea      156
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [19]:
pd.set_option('display.max_colwidth',None)
messages=data[['tweet_text','is_there_an_emotion_directed_at_a_brand_or_product']]
messages.columns=['text','label']
messages.head()

Unnamed: 0,text,label
0,negative,negative
1,positive,positive
2,positive,positive
3,negative,negative
4,positive,positive


In [20]:
X=messages['text']
y=messages['label'].values

In [21]:
y = y.reshape(-1,1)

In [22]:
#onehot encoding of target
from sklearn.preprocessing import OneHotEncoder
one=OneHotEncoder()
y = one.fit_transform(y)
y = y.toarray()

In [23]:
from keras.preprocessing import text
tokenizer = text.Tokenizer() 
tokenizer.fit_on_texts(list(messages['text']))
tokenized_texts = tokenizer.texts_to_sequences(messages['text'])

In [24]:
from keras.utils import pad_sequences
X=pad_sequences(tokenized_texts,maxlen=100)

In [25]:
tokenizer.word_index

{'neutral': 1, 'positive': 2, 'negative': 3, 'no': 4, 'idea': 5}

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [27]:
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding,SimpleRNN,Dropout

In [28]:
model=Sequential()

model.add(Embedding(input_dim=len(tokenizer.word_index)+1,output_dim=128,input_length=100))
model.add(SimpleRNN(10))
model.add(Dense(50,activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(4,activation='softmax'))

In [29]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [30]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          768       
                                                                 
 simple_rnn (SimpleRNN)      (None, 10)                1390      
                                                                 
 dense (Dense)               (None, 50)                550       
                                                                 
 dropout (Dropout)           (None, 50)                0         
                                                                 
 dense_1 (Dense)             (None, 4)                 204       
                                                                 
Total params: 2,912
Trainable params: 2,912
Non-trainable params: 0
_________________________________________________________________


In [31]:
model.fit(X_train,y_train,epochs=10,validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7feb4ec93940>

In [32]:
y_pred=model.predict(X_test)



In [33]:
#we have encoded the labels.

#so lets do the inverse one hot encoding for predicted labels
pred = []
for i in range(len(y_pred)):
    pred.append(np.argmax(y_pred[i]))
    
#also inverse encoding for actual labels

test = []
for i in range(len(y_test)):
    test.append(np.argmax(y_test[i]))

In [34]:
from sklearn.metrics import accuracy_score
accuracy_score(test,pred)

1.0