# Sentiment Analysis

In [6]:
import pandas as pd
import nltk
from bs4 import BeautifulSoup
import string

# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split 

In [7]:
# Custom Preferences
# Notebook-wide switches that later cells read to pick the preprocessing step and the classifier.

# Token normalisation: 'stemming' uses PorterStemmer, 'lemmatization' uses WordNetLemmatizer.
normalize = 'lemmatization' # 'stemming' or 'lemmatization'
# Classifier: 'svc' trains sklearn's SVC, 'linear_svc' trains sklearn's LinearSVC.
model = 'linear_svc' # 'svc' or 'linear_svc'

In [8]:
# Load the labelled dataset; later cells use its 'review' (text) and 'sentiment' (label) columns.
dataset = pd.read_csv('Twitter_Data.csv')

# Preview the first rows and the number of examples per sentiment label.
print(dataset.head())
print(dataset['sentiment'].value_counts())

                                              review sentiment
0  when modi promised “minimum government maximum...  negative
1  talk all the nonsense and continue all the dra...   neutral
2  what did just say vote for modi  welcome bjp t...  positive
3  asking his supporters prefix chowkidar their n...  positive
4  answer who among these the most powerful world...  positive
sentiment
positive    72250
neutral     55213
negative    35510
Name: count, dtype: int64


In [9]:
# Report the overall row count, then display how many rows carry the 'positive' label.
total_rows = len(dataset)
print(f"Total length: {total_rows}")
positive_mask = dataset['sentiment'] == 'positive'
len(dataset.loc[positive_mask])

Total length: 162980


72250

In [10]:
# ## Preprocessing
# Discard rows that lack either the text or the label, then separate features from targets.
dataset.dropna(subset=['review', 'sentiment'], how='any', inplace=True)
X, y = dataset['review'], dataset['sentiment']

# Optional: remove neutral reviews
# X = X[y != 'neutral']
# y = y[y != 'neutral']

# Sanity check: after the dropna above, both selections should print as empty Series.
for column in (X, y):
    print(column[column.isnull()])

# Features and labels must have the same number of rows.
for label, column in (("X", X), ("Y", y)):
    print(f"{label} length: {len(column)}")

Series([], Name: review, dtype: object)
Series([], Name: sentiment, dtype: object)
X length: 162969
Y length: 162969


In [11]:
# Remove html tags using beautiful soup
def remove_html_tags(text):
    """Return the text content of `text` after parsing it with BeautifulSoup's html.parser."""
    return BeautifulSoup(text, "html.parser").get_text()

# Clean every review in place of the raw series.
X = X.apply(remove_html_tags)

# Translation table that deletes every character in string.punctuation.
# Built once so it is reused for every review instead of being recomputed per call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def remove_punctuations(text):
    """Strip ASCII punctuation from a string.

    Every character listed in ``string.punctuation`` is deleted in a single
    pass over the text. Characters outside that set (for example the curly
    quotes “ ”) are left untouched, exactly as before.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without ASCII punctuation characters.
    """
    # One translate() pass replaces the previous loop of one replace() per punctuation character.
    return text.translate(_PUNCTUATION_TABLE)
# Run the punctuation filter over every review.
X = X.apply(remove_punctuations)

# Spot-check both ends of the cleaned series.
print(X.head()) 
print(X.tail())

  soup = BeautifulSoup(text, "html.parser")


0    when modi promised “minimum government maximum...
1    talk all the nonsense and continue all the dra...
2    what did just say vote for modi  welcome bjp t...
3    asking his supporters prefix chowkidar their n...
4    answer who among these the most powerful world...
Name: review, dtype: object
162975    why these 456 crores paid neerav modi not reco...
162976    dear rss terrorist payal gawar what about modi...
162977    did you cover her interaction forum where she ...
162978    there big project came into india modi dream p...
162979    have you ever listen about like gurukul where ...
Name: review, dtype: object


In [12]:
# Tokenize
# NLTK's word_tokenize turns each cleaned review string into a list of token strings.
X_tokens = X.apply(word_tokenize)

In [13]:
# Preview the token lists produced for the first few reviews.
print(X_tokens.head())

0    [when, modi, promised, “, minimum, government,...
1    [talk, all, the, nonsense, and, continue, all,...
2    [what, did, just, say, vote, for, modi, welcom...
3    [asking, his, supporters, prefix, chowkidar, t...
4    [answer, who, among, these, the, most, powerfu...
Name: review, dtype: object


In [14]:

# Remove stopwords
# Held as a set so the membership test in the filter below is a hash lookup.
stop_words = set(stopwords.words('english'))
print(stop_words)


def _drop_stop_words(tokens):
    """Return the tokens of one review that are not in `stop_words`."""
    return [token for token in tokens if token not in stop_words]

X_tokens = X_tokens.apply(_drop_stop_words)
print(X_tokens.head())

{'mustn', "it's", "isn't", 'am', 'up', 'too', 'he', 'after', 'himself', 'what', 'o', "hadn't", 'weren', 'themselves', 'until', 'those', 'so', 'while', 'both', 'but', 'such', 'her', 'and', 't', "mustn't", 'his', 'these', 'has', 'all', 'this', 'nor', 'above', 've', 'here', "you've", "you're", 'them', 'now', 'how', "needn't", 'isn', 'we', 'yourselves', 'were', 'did', 'wouldn', 'into', 'do', "that'll", 'd', 'have', 'before', 'when', 'no', 'm', 'at', "mightn't", "aren't", 'a', 'being', "doesn't", 'in', 'been', 'ma', 'hadn', 'which', 'very', 'with', 'is', 'be', 'there', 'your', "you'll", 'don', 'they', 'for', 'if', 'shan', 'was', 'to', 'or', 'from', 'ain', 'couldn', 'their', 's', 'y', 'yourself', 'where', 'whom', 'over', 'will', 'few', 'hasn', 'll', 're', 'any', 'haven', "shan't", 'own', 'can', "shouldn't", "she's", 'that', 'who', 'didn', 'its', 'by', 'during', 'as', 'ours', 'does', 'doing', 'it', 'shouldn', 'she', 'itself', 'about', 'of', 'further', 'out', 'once', 'off', 'mightn', 'why', 'd

Stemming identifies the common root form of a word by removing or replacing word suffixes (e.g. “flooding” is stemmed as “flood”).

In [15]:
# #Stemming
from nltk.stem import PorterStemmer

# Runs only when stemming was chosen in the preferences cell. `ps` is kept at
# notebook level because predict_sentiment uses it for new input.
if normalize == 'stemming':
    ps = PorterStemmer()
    X_tokens = X_tokens.apply(lambda tokens: list(map(ps.stem, tokens)))

Lemmatization is a text pre-processing technique used in natural language processing (NLP) models to reduce a word to its root meaning so that related word forms can be identified as similar. For example, a lemmatization algorithm would reduce the word “better” to its root word, or lemma, “good”.

In [16]:
# Lemmatization

from nltk.stem import WordNetLemmatizer

# Runs only when lemmatization was chosen in the preferences cell. `lemmatizer`
# is kept at notebook level because predict_sentiment uses it for new input.
if normalize == 'lemmatization':
    lemmatizer = WordNetLemmatizer()

    def _lemmatize_tokens(tokens):
        """Lemmatize each token of one review with the lemmatizer's default settings."""
        return list(map(lemmatizer.lemmatize, tokens))

    X_tokens = X_tokens.apply(_lemmatize_tokens)

print(X_tokens.head())

0    [modi, promised, “, minimum, government, maxim...
1        [talk, nonsense, continue, drama, vote, modi]
2    [say, vote, modi, welcome, bjp, told, rahul, m...
3    [asking, supporter, prefix, chowkidar, name, m...
4    [answer, among, powerful, world, leader, today...
Name: review, dtype: object


We will split the dataset into training and test sets in a 4:1 ratio.

In [17]:
# split into train and test
# test_size=0.2 holds out 20% of the rows for evaluation; random_state=42 makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X_tokens, y, test_size=0.2, random_state=42)

In [18]:
# Vectorization
# CountVectorizer works on strings, so every token list is rendered with str();
# the vectorizer then splits that text again using its default settings.
X_train_seq = list(map(str, X_train))
X_test_seq = list(map(str, X_test))

# The vocabulary is learned from the training split only and reused for the test split.
cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train_seq)
X_test_cv = cv.transform(X_test_seq)


In [19]:
# Shapes are (number of reviews, vocabulary size); both splits share the same column count.
print(X_train_cv.shape)
print(X_test_cv.shape)

# Number of distinct terms in the fitted vocabulary.
len(cv.get_feature_names_out())

(130375, 88384)
(32594, 88384)


88384

## USING SVM

In [20]:

# Let's now use SVM to train our model
from sklearn.svm import SVC, LinearSVC

# Map each supported value of the `model` preference to its estimator class.
_SVM_CLASSES = {'svc': SVC, 'linear_svc': LinearSVC}

if model not in _SVM_CLASSES:
    # Fail right here with a clear message; previously an unsupported value skipped
    # training silently and surfaced as a NameError on `SVM` in a later cell.
    raise ValueError(f"Unknown model {model!r}; expected one of {sorted(_SVM_CLASSES)}")

# Instantiate with default hyper-parameters and fit on the vectorized training split.
SVM = _SVM_CLASSES[model]()
SVM.fit(X_train_cv, Y_train)




In [21]:

from sklearn.metrics import classification_report

# Predict labels for the held-out split and print per-class precision, recall and F1.
predicted_SVM  = SVM.predict(X_test_cv)
print(classification_report(Y_test, predicted_SVM))


              precision    recall  f1-score   support

    negative       0.84      0.81      0.82      7152
     neutral       0.89      0.94      0.91     11067
    positive       0.91      0.89      0.90     14375

    accuracy                           0.89     32594
   macro avg       0.88      0.88      0.88     32594
weighted avg       0.89      0.89      0.89     32594



In [28]:
# TEST A CUSTOM STRING

def predict_sentiment(custom_string):
    """Predict the sentiment label for one raw piece of text.

    The text goes through the same steps that were applied to the training
    data: HTML removal, punctuation removal, tokenization, stop-word
    filtering, the normalisation selected by `normalize`, and finally the
    fitted CountVectorizer `cv`.

    Args:
        custom_string: The raw text to classify.

    Returns:
        The array returned by `SVM.predict` for this single sample;
        element 0 is the predicted label.
    """
    # str.replace('<.*?>', '') treats the pattern as literal text, so it never
    # stripped any markup. Reuse the training-time cleaners instead, which also
    # adds the punctuation removal that was missing at inference time.
    cleaned = remove_punctuations(remove_html_tags(custom_string))

    tokens = [token for token in word_tokenize(cleaned) if token not in stop_words]

    # `ps` / `lemmatizer` exist only when the matching normalisation cell ran,
    # so each is touched only under the same condition that created it.
    if normalize == 'stemming':
        tokens = [ps.stem(token) for token in tokens]
    elif normalize == 'lemmatization':
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Same str(list) rendering that was used to build the training matrix.
    features = cv.transform([str(tokens)])
    return SVM.predict(features)

# Smoke tests: one clearly positive and one clearly negative review.
positive_review = "I loved this movie. It was so good. I would recommend it to everyone. I would watch it again and again. I loved the acting and the story. It was so good"
negative_review = "I hated this movie. It was so bad. I would not recommend it to anyone. I would not watch it again and again. I hated the acting and the story. It was so bad"

# Predicting a positive review
test_positive = predict_sentiment(positive_review)
assert test_positive[0] == 'positive'

# Predicting a negative review
test_negative = predict_sentiment(negative_review)
assert test_negative[0] == 'negative'

print(test_positive)
print(test_negative)

# Interactive loop: classify typed reviews until the user enters "exit" or an empty line.
while True:
    custom_string = input("Enter your review: ")
    if custom_string in ("exit", ""):
        break

    print("Input: ", custom_string)
    sentiment = predict_sentiment(custom_string)[0]
    print("Sentiment: " + sentiment + "\n")

['positive']
['negative']
