In [1]:
# Importing libraries

import numpy as np
import pandas as pd


In [2]:
# Load the tweet dataset from a CSV file in the working directory and preview it.
# The preview shows a free-text column (clean_text) and a numeric label (category).

df = pd.read_csv('Twitter_Data.csv')
df

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0
162977,did you cover her interaction forum where she ...,0.0
162978,there big project came into india modi dream p...,0.0


In [3]:
# Check the dimensions of the raw data as (rows, columns)

df.shape

(162980, 2)

In [4]:
# Preview the first five rows to confirm the columns were parsed as expected

df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [5]:
# Summarise column dtypes and non-null counts; any count below the row total
# means that column contains missing values

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162976 non-null  object 
 1   category    162973 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.5+ MB


In [6]:
# Count the missing values in each column

df.isnull().sum()

clean_text    4
category      7
dtype: int64

In [7]:
# Inspect the rows whose tweet text (clean_text) is missing.
# Nothing is modified here; the rows are only displayed.

df[df['clean_text'].isna()]

Unnamed: 0,clean_text,category
148,,0.0
158694,,-1.0
159443,,0.0
160560,,1.0


In [8]:
# Inspect the rows whose sentiment label (category) is missing
df[df['category'].isna()]

Unnamed: 0,clean_text,category
130448,the foundation stone northeast gas grid inaugu...,
155642,dear terrorists you can run but you cant hide ...,
155698,offense the best defence with mission shakti m...,
155770,have always heard politicians backing out thei...,
158693,modi government plans felicitate the faceless ...,
159442,chidambaram gives praises modinomics,
160559,the reason why modi contested from seats 2014 ...,


In [9]:
# Drop every row that has a missing tweet text or a missing label. Only a handful of
# rows are affected (4 without text, 7 without a category), so removing them does not
# meaningfully change the data.
#
# The rows are selected with dropna() instead of a hard-coded list of row labels:
# after reset_index() those labels point at different, valid rows, so re-running a
# label-based drop would silently delete good data. dropna() is idempotent and keeps
# working if the CSV is updated or re-ordered.

df = df.dropna(subset=['clean_text', 'category']).reset_index(drop=True)

In [10]:
# Verify the drop: no row should have a missing clean_text any more (expect an empty frame)
df[df['clean_text'].isna()]

Unnamed: 0,clean_text,category


In [11]:
# Verify the drop: no row should have a missing category any more (expect an empty frame)
df[df['category'].isna()]

Unnamed: 0,clean_text,category


In [12]:
# Final check: the per-column missing-value counts should all be zero
df.isna().sum()

clean_text    0
category      0
dtype: int64

In [13]:
# Label encoding used in the category column:
#    1 -> positive
#    0 -> neutral
#   -1 -> negative
#
# Count the tweets in each class to see how balanced the labels are.

df['category'].value_counts()

category
 1.0    72249
 0.0    55211
-1.0    35509
Name: count, dtype: int64

In [14]:
# Split the frame into the raw inputs and the targets:
#   x -> tweet text, y -> sentiment label (both as NumPy arrays)

x, y = df['clean_text'].values, df['category'].values

In [15]:
# importing libraries for model building

import re
import nltk

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Porter stemmer and English stop-word list for text preprocessing.
# NOTE(review): stopwords.words() needs the NLTK 'stopwords' corpus to be available
# locally (e.g. via nltk.download('stopwords')) - confirm on a fresh environment.
ps = PorterStemmer()
all_stopwords=stopwords.words('english')

In [16]:
# Shape after removing the rows with missing values
print(df.shape)

(162969, 2)


In [17]:
# Build the cleaned corpus, one normalised string per tweet:
#   1. replace every non-letter character with a space
#   2. lower-case and split on whitespace
#   3. drop English stop words and Porter-stem the remaining tokens
#   4. re-join the stems with single spaces
#
# The loop iterates over the column itself instead of a hard-coded row count, so it
# stays correct when the number of rows changes. The stop words go into a set for
# constant-time membership tests, and the regex is compiled once outside the loop.
# NOTE(review): the NLTK English stop-word list contains negations such as "not" and
# "no"; removing them may hurt sentiment classification - worth confirming.
stopword_set = set(all_stopwords)
non_letters = re.compile("[^a-zA-Z]")

corpus = []
for text in df['clean_text']:
    tokens = non_letters.sub(" ", text).lower().split()
    corpus.append(' '.join(ps.stem(token) for token in tokens if token not in stopword_set))

In [18]:
# Inspect the cleaned corpus.
# NOTE(review): this echoes the whole list (one string per tweet); a slice such as
# corpus[:5] would keep the notebook output readable.
corpus

['modi promis minimum govern maximum govern expect begin difficult job reform state take year get justic state busi exit psu templ',
 'talk nonsens continu drama vote modi',
 'say vote modi welcom bjp told rahul main campaign modi think modi relax',
 'ask support prefix chowkidar name modi great servic confus read crustal clear crass filthi nonsens see abus come chowkidar',
 'answer among power world leader today trump putin modi may',
 'kiya tho refresh maarkefir comment karo',
 'surat women perform yagna seek divin grace narendra modi becom',
 'come cabinet scholar like modi smriti hema time introspect',
 'upcom elect india saga go import pair look current modi lead govt elect deal brexit combin weekli look juici bear imho',
 'gandhi gay modi',
 'thing like demonetis gst good servic tax upper cast would sort either view favour say need give time cast like dalit muslim modi constitu',
 'hope tuthukudi peopl would prefer honest well behav nationalist courag likli minist modi cabinet vo

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer whose vocabulary is capped at 1420 terms (max_features keeps
# the terms with the highest frequency across the corpus).
# NOTE(review): 1420 is an unexplained magic number - consider a named constant.
cv= CountVectorizer(max_features = 1420)

In [20]:
# Learn the vocabulary and turn each document into a row of term counts, then
# convert the sparse result to a dense array (the classifier is fitted on a dense matrix).
x = cv.fit_transform(corpus)
x = x.toarray()

# Targets: the last column of the frame (category)
y = df.iloc[:, -1].values

In [21]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible across runs.

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=0,
)

In [22]:
from sklearn.naive_bayes import GaussianNB


# Fit a Gaussian Naive Bayes classifier on the training split.
# NOTE(review): GaussianNB models every feature as normally distributed; for word-count
# features MultinomialNB is the more common choice - worth comparing.
classifier = GaussianNB()
classifier.fit(x_train, y_train)

In [23]:
# Evaluate the classifier on the held-out test split.
from sklearn.metrics import accuracy_score, confusion_matrix

y_pred = classifier.predict(x_test)

# Confusion matrix: rows are true labels, columns are predicted labels,
# both in sorted label order (-1, 0, 1).
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Overall fraction of test tweets classified correctly
accuracy_score(y_test, y_pred)


[[4350 1588 1094]
 [ 821 9649  545]
 [3054 2889 8604]]


0.6934711910167516