<a href="https://colab.research.google.com/github/harshithakolipaka/Textual_Sentiment_Analysis/blob/main/Textual_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading dataset

In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read by path.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the tweet dataset from the mounted Drive folder.
df = pd.read_csv("/content/drive/MyDrive/PROJECTS/Twitter_Data.csv")
# Drop rows with a missing tweet or a missing label. Filling absent labels
# with 0 would silently relabel them as class 0, and a missing tweet would
# later be stringified to the literal text "nan".
df = df.dropna(subset=["clean_text", "category"]).reset_index(drop=True)
df["category"] = df["category"].astype(int)
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1
1,talk all the nonsense and continue all the dra...,0
2,what did just say vote for modi welcome bjp t...,1
3,asking his supporters prefix chowkidar their n...,1
4,answer who among these the most powerful world...,1


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the tweets for evaluation. A fixed random_state makes the
# split (and every downstream metric) reproducible across kernel restarts, and
# stratifying keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"].apply(str),
    df["category"],
    test_size=0.2,
    random_state=42,
    stratify=df["category"],
)

In [None]:
# Convert only the training partition to plain lists. Rebuilding these from
# the full DataFrame would put every held-out test tweet back into the
# training data and invalidate the evaluation. list(...) is used so the cell
# can be re-run safely once the names already hold lists.
X_train = list(X_train)
y_train = list(y_train)
print(X_train[:4])
print(y_train[:4])

['when modi promised “minimum government maximum governance” expected him begin the difficult job reforming the state why does take years get justice state should and not business and should exit psus and temples', 'talk all the nonsense and continue all the drama will vote for modi ', 'what did just say vote for modi  welcome bjp told you rahul the main campaigner for modi think modi should just relax', 'asking his supporters prefix chowkidar their names modi did great service now there confusion what read what not now crustal clear what will crass filthy nonsensical see how most abuses are coming from chowkidars']
[-1, 0, 1, 1]


# Data Cleaning

In [None]:
from nltk.tokenize import RegexpTokenizer

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [None]:
import re

import nltk

# Fetch the NLTK stop-word corpus that stopwords.words("english") reads from.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Porter stemmer used to reduce each token to its stem.
ps = PorterStemmer()
# English stop words, held in a set for fast membership tests.
english_stopwords = set(stopwords.words("english"))
# Tokenizer that keeps only runs of word characters, dropping punctuation and symbols.
tokenizer = RegexpTokenizer(r'\w+')

In [None]:
def getCleanedText(text):
    """Normalise one tweet into a space-joined string of stemmed tokens.

    Steps, in order: remove decimal-number fragments, lowercase, tokenize with
    the module-level ``tokenizer``, drop tokens found in ``english_stopwords``,
    and stem the remaining tokens with ``ps``.

    Args:
        text: The tweet text as a ``str``.

    Returns:
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    # Raw string literal: "\d" inside a normal string is an invalid escape
    # sequence that newer Python versions warn about.
    text = re.sub(r"(\d*\.\d+)|(\d+\.[0-9 ]+)", "", text)

    text = text.lower()

    # Tokenize, then filter out stop words.
    tokens = tokenizer.tokenize(text)
    kept_tokens = [token for token in tokens if token not in english_stopwords]

    # Stemming (the loop variable no longer reuses the name `tokens`).
    stemmed_tokens = [ps.stem(token) for token in kept_tokens]

    return " ".join(stemmed_tokens)

In [None]:
# Clean every training tweet.
X_clean = list(map(getCleanedText, X_train))

In [None]:
# Preview the first two cleaned training tweets.
X_clean[:2]

['modi promis minimum govern maximum govern expect begin difficult job reform state take year get justic state busi exit psu templ',
 'talk nonsens continu drama vote modi']

In [None]:
# Clean the held-out tweets with the same function used for the training text.
Xt_clean = list(map(getCleanedText, X_test))
print(Xt_clean[:2])

['say moment pride indian narendra modi said countri fourth countri space power congratul', 'modi kick elect campaign promis new india']


# Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Count-based bag-of-words vectorizer over unigrams and bigrams (ngram_range=(1, 2)).
cv = CountVectorizer(ngram_range = (1, 2))

In [None]:
# Learn the vocabulary from the first 1000 cleaned training tweets and turn
# them into a dense count matrix. The same [:1000] slice is applied to y_train
# when the classifier is fitted, so the two must be changed together.
X_vec = cv.fit_transform(X_clean[:1000]).toarray()

In [None]:
# Inspect the training count matrix.
X_vec

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# List the n-gram vocabulary learned by the vectorizer.
cv.get_feature_names_out()

array(['100', '100 buck', '100 legal', ..., 'नऊआ', 'नऊआ alway', 'शबच'],
      dtype=object)

In [None]:
# Encode the test tweets with the vocabulary fitted on the training subset.
# The result is kept sparse: a dense copy of the whole test set times the
# n-gram vocabulary can exhaust memory, and MultinomialNB.predict accepts
# sparse input directly.
Xt_vect = cv.transform(Xt_clean)

# Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Multinomial Naive Bayes classifier constructed with its default arguments.
mn = MultinomialNB()

In [None]:
# Fit on the labels of the same first 1000 samples that X_vec was built from.
mn.fit(X_vec, y_train[:1000])

MultinomialNB()

In [None]:
# Predict a label for every vectorized test tweet and keep the predictions as a Python list.
y_pred = list(mn.predict(Xt_vect))

In [None]:
# Show only a short preview; printing every prediction floods the notebook output.
print(y_pred[:20])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 0, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, -1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, -1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, -1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1

In [None]:
def sentiment(y_pred, margin=3):
    """Summarise a batch of per-tweet predictions as one overall label.

    A polarity wins only when its count exceeds the counts of BOTH other
    classes by more than ``margin``; otherwise the batch is called neutral.
    Comparing against both other classes prevents a batch dominated by
    negative predictions from being reported as positive merely because it
    contains more positives than neutrals.

    Args:
        y_pred: Iterable of predicted labels, where 1 counts as positive,
            0 as neutral and -1 as negative. Any iterable works (list,
            NumPy array, pandas Series).
        margin: Lead a polarity needs over each of the other two classes.
            Defaults to 3.

    Returns:
        "Postive", "negative" or "neutral". The label strings, including the
        existing spelling of the first one, are kept unchanged so comparisons
        against them elsewhere keep working.
    """
    # Materialise once so non-list iterables (which lack .count) are supported.
    labels = list(y_pred)
    positives = labels.count(1)
    neutrals = labels.count(0)
    negatives = labels.count(-1)

    if positives > max(negatives, neutrals) + margin:
        return "Postive"
    if negatives > max(positives, neutrals) + margin:
        return "negative"
    return "neutral"

In [None]:
# Overall sentiment label for the whole set of test predictions.
sentiment(y_pred)

'Postive'

In [None]:
import sklearn
# Import the metric explicitly: a bare `import sklearn` is not guaranteed to
# load the `sklearn.metrics` submodule, so `sklearn.metrics.accuracy_score`
# only resolves when some other import happened to load it first.
from sklearn.metrics import accuracy_score

# Percentage of held-out tweets whose predicted label matches the true label.
accuracy_score(y_test, y_pred) * 100

51.064547797275736