# Assignment 8 - Deep Learning for NLP

[Dataset](https://drive.google.com/file/d/1HEcyWf93HxbJeC5kE0nLQH8fPevEiUhM/view?usp=sharing)

In [None]:
import numpy as np
import pandas as pd
import re

import gensim
from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Show full cell contents (no truncation) so whole tweets are visible in displayed frames.
pd.options.display.max_colwidth = None

1. Importing dataset

In [None]:
import chardet

# Sniff the CSV's character encoding from its raw bytes before handing the file to pandas.
with open('/content/drive/MyDrive/Data/tweet_product_company.csv', 'rb') as raw_file:
    detection = chardet.detect(raw_file.read())

# chardet returns a dict; only the 'encoding' entry is needed downstream.
encoding = detection['encoding']
print(encoding)

MacRoman


In [None]:
# Load the tweets into a DataFrame, decoding the file with the `encoding` value.
data = pd.read_csv(
    '/content/drive/MyDrive/Data/tweet_product_company.csv',
    encoding=encoding,
)

In [None]:
data.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative emotion
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative emotion
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Google,Positive emotion


In [None]:
data.columns

Index(['tweet_text', 'emotion_in_tweet_is_directed_at',
       'is_there_an_emotion_directed_at_a_brand_or_product'],
      dtype='object')

In [None]:
data.isna().sum()

tweet_text                                               1
emotion_in_tweet_is_directed_at                       5802
is_there_an_emotion_directed_at_a_brand_or_product       0
dtype: int64

In [None]:
data['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

is_there_an_emotion_directed_at_a_brand_or_product
No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: count, dtype: int64

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


2. Creating 'df' with necessary attributes from 'data'

In [None]:
# Keep only the tweet text and its label. The explicit .copy() makes `df` an independent
# frame rather than a slice of `data`, so assigning to its columns later does not raise
# SettingWithCopyWarning.
df = data[['tweet_text', 'is_there_an_emotion_directed_at_a_brand_or_product']].copy()

In [None]:
df.head()

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",Negative emotion
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,Positive emotion
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,Negative emotion
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Positive emotion


3. Renaming columns in df

In [None]:
# Shorten the two column names; the order matches the columns selected into df
# (tweet text first, then the emotion label).
new_columns = ['tweet','sentiment']
df.columns = new_columns
df.head()

Unnamed: 0,tweet,sentiment
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",Negative emotion
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,Positive emotion
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,Negative emotion
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Positive emotion


4. Mapping 'sentiment' values to numeric values

In [None]:
# Map each sentiment label to a numeric class.
# "I can't tell" is folded into the neutral class (0) together with "No emotion ...".
sentiment_mapping = {
    "No emotion toward brand or product": 0,
    "Positive emotion": 1,
    "Negative emotion": -1,
    "I can't tell": 0
}

# Build a new frame with assign() instead of writing into a column of `df` in place:
# when `df` is a slice of `data`, the in-place write raises SettingWithCopyWarning.
# astype('int64') makes the resulting dtype explicit and fails loudly if a label
# outside the mapping is still present after the replacement.
df = df.assign(
    sentiment=df['sentiment'].replace(sentiment_mapping).astype('int64')
)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = df['sentiment'].replace(sentiment_mapping)


Unnamed: 0,tweet,sentiment
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",-1
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",1
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,1
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,-1
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",1


5. Checking for empty cells

In [None]:
df.isna().sum()

tweet        1
sentiment    0
dtype: int64

6. Dropping empties

In [None]:
# Remove rows containing any missing value, then renumber the index from 0.
df = df.dropna().reset_index(drop=True)

# Confirm that no missing values remain.
df.isna().sum()

tweet        0
sentiment    0
dtype: int64

In [None]:
df.head()

Unnamed: 0,tweet,sentiment
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",-1
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",1
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,1
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,-1
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",1


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9092 entries, 0 to 9091
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet      9092 non-null   object
 1   sentiment  9092 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 142.2+ KB


7. Preprocessing the tweets using gensim's `simple_preprocess`

In [None]:
# Turn each tweet into a list of tokens with gensim's simple_preprocess.
df['cleaned_text'] = df['tweet'].apply(gensim.utils.simple_preprocess)
df.head()

Unnamed: 0,tweet,sentiment,cleaned_text
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",-1,"[wesley, have, iphone, after, hrs, tweeting, at, rise_austin, it, was, dead, need, to, upgrade, plugin, stations, at, sxsw]"
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",1,"[jessedee, know, about, fludapp, awesome, ipad, iphone, app, that, you, ll, likely, appreciate, for, its, design, also, they, re, giving, free, ts, at, sxsw]"
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,1,"[swonderlin, can, not, wait, for, ipad, also, they, should, sale, them, down, at, sxsw]"
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,-1,"[sxsw, hope, this, year, festival, isn, as, crashy, as, this, year, iphone, app, sxsw]"
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",1,"[sxtxstate, great, stuff, on, fri, sxsw, marissa, mayer, google, tim, reilly, tech, books, conferences, amp, matt, mullenweg, wordpress]"


8. Vocabulary size

In [None]:
from collections import Counter

# Tally every token across all tweets; the number of distinct keys is the vocabulary size.
words_count = Counter()
for tokens in df['cleaned_text']:
    words_count.update(tokens)

vocab_size = len(words_count)
print("Vocabulary Size:", vocab_size)


Vocabulary Size: 9333


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

9. Tokenization

  1. Creates a `Tokenizer` object with the vocabulary size set to the number of unique words in the corpus.
  2. Fits the tokenizer on the `cleaned_text` column of the `df` dataframe. This creates a mapping between words and integers.
  3. Converts the `cleaned_text` column into a list of sequences of integers, where each integer represents a word in the vocabulary.

In [None]:
# Tokenization: learn a word -> integer index mapping from the token lists,
# then encode every tweet as a sequence of those indices.
# NOTE(review): per the Keras docs, num_words keeps only indices below num_words, so the
# single rarest word is presumably dropped; the Embedding's input_dim relies on this
# range — confirm before changing either.
token_lists = df['cleaned_text']
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(token_lists)
X_sequences = tokenizer.texts_to_sequences(token_lists)

10. Padding

Pads or truncates every sequence to a fixed length of 100 (shorter sequences are zero-padded at the end, longer ones are cut at the end), so that all sequences have the same length. This is necessary for feeding the data into a neural network.

This preprocessed data can now be used to train a machine learning model for sentiment analysis.


In [None]:
# Bring every sequence to exactly 100 entries: shorter ones are padded at the end,
# longer ones are cut at the end.
X_padded = pad_sequences(
    X_sequences,
    maxlen=100,
    padding='post',
    truncating='post',
)

11. Encoding the labels using LabelEncoder()

In [None]:
# Integer-encode the sentiment labels with LabelEncoder (this is label encoding,
# not one-hot): each distinct sentiment value becomes one integer class id.
label_encoder = LabelEncoder().fit(df['sentiment'])
y_encoded = label_encoder.transform(df['sentiment'])

12. Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the tweets for evaluation. The classes are heavily imbalanced
# (negative tweets are a small minority), so stratify on the labels to keep the same
# class proportions in both splits. random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

13. Building model

In [None]:
from keras.layers import Dense, LSTM, Embedding
from keras.models import Sequential

In [None]:
from keras.layers import Dropout

# Stacked-LSTM classifier over the padded token-id sequences.
model = Sequential()

# mask_zero=True marks index 0 (the value used for padding) as "no token", so the
# LSTM layers skip the padded tail of each sequence instead of processing it as input.
# input_length matches the padded sequence length of 100.
model.add(Embedding(input_dim=vocab_size, output_dim=5, input_length=100, mask_zero=True))

# First LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(50, return_sequences=True))
model.add(Dropout(0.2))

# Second LSTM collapses the sequence to a single vector per tweet.
model.add(LSTM(60))
model.add(Dense(50, activation='relu'))

# Three output units: one per encoded sentiment class.
model.add(Dense(3, activation='softmax'))

In [None]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 5)            46665     
                                                                 
 lstm_4 (LSTM)               (None, 100, 50)           11200     
                                                                 
 dropout_1 (Dropout)         (None, 100, 50)           0         
                                                                 
 lstm_5 (LSTM)               (None, 60)                26640     
                                                                 
 dense_4 (Dense)             (None, 50)                3050      
                                                                 
 dense_5 (Dense)             (None, 3)                 153       
                                                                 
Total params: 87708 (342.61 KB)
Trainable params: 8770

In [None]:
# The sparse categorical loss takes integer class ids directly (no one-hot targets needed).
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

14. Training the model

In [None]:
# Train for 10 epochs. Note that the held-out test split is also passed as the validation
# data, so the per-epoch validation metrics are computed on the test set itself.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=128,
    validation_data=(X_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
