# Twitter Sentiment Analysis using NLP and Naive Bayes Classifier

### Keshava G
### 21ME63R40

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from textblob import TextBlob

In [2]:
from wordcloud import WordCloud

In [3]:
df = pd.read_csv('tweets_raw.csv')

In [7]:
df = df.drop(['Unnamed: 0','Unnamed: 0.1','Location','Username','Created at'],axis = 1)

In [8]:
df.head()

Unnamed: 0,Content,Retweet-Count,Favorites
0,innovate an innovative approach #quoteoftheday...,0,0
1,The pandemic is raising concerns about how tee...,0,0
2,STI: Staying Education-ready in the New Normal...,0,0
3,Digital Learning Through Digital RCRT\n.\n.\nR...,0,0
4,"Upswing Classroom: Out and Out Virtual School,...",1,0


In [9]:
import re
def clean(text):
    text = re.sub(r'@[^a-zA-Z0-9]+','',text)
    text = re.sub(r'#','',text)
    text = re.sub(r'\/ + n','',text)
    text = re.sub(r'RT[\s]+','',text)
    text = re.sub(r'https?:\/\/\S+','',text)
    text = text.replace('\n', ' ')
    text = text.replace('.', ' ')
    
    return text

In [10]:
df['Content'] = df['Content'].apply(clean)
df['Content']

0         innovate an innovative approach quoteoftheday ...
1         The pandemic is raising concerns about how tee...
2         STI: Staying Education-ready in the New Normal...
3         Digital Learning Through Digital RC    Registr...
4         Upswing Classroom: Out and Out Virtual School,...
                                ...                        
202640    New post by our amazing author, @DrJacieMaslyk...
202641    When you move into a new office on Monday and ...
202642    Looking for advice on how two full time workin...
202643    Join me tomorrow!!  MNEA NEA BetterTogether ed...
202644    First day of virtual school starts tomorrow! G...
Name: Content, Length: 202645, dtype: object

In [12]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, len(df['Content'])):
  review = re.sub('[^a-zA-Z]', ' ', df['Content'][i])
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to C:\Users\Keshava
[nltk_data]     G\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
df['corpus'] = pd.DataFrame(corpus)
df.head()

Unnamed: 0,Content,Retweet-Count,Favorites,corpus
0,innovate an innovative approach quoteoftheday ...,0,0,innov innov approach quoteoftheday digitalmark...
1,The pandemic is raising concerns about how tee...,0,0,pandem rais concern teen use technolog still l...
2,STI: Staying Education-ready in the New Normal...,0,0,sti stay educ readi new normal inquir technolo...
3,Digital Learning Through Digital RC Registr...,0,0,digit learn digit rc registr open soon websit ...
4,"Upswing Classroom: Out and Out Virtual School,...",1,0,upsw classroom virtual school not classroom ed...


In [14]:
def getsubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

def getpolority(text):
    return TextBlob(text).sentiment.polarity

In [16]:
df['subjectivity'] = df['corpus'].apply(getsubjectivity)

In [17]:
df['polority'] = df['corpus'].apply(getpolority)

In [18]:
df.head()

Unnamed: 0,Content,Retweet-Count,Favorites,corpus,subjectivity,polority
0,innovate an innovative approach quoteoftheday ...,0,0,innov innov approach quoteoftheday digitalmark...,0.0,0.0
1,The pandemic is raising concerns about how tee...,0,0,pandem rais concern teen use technolog still l...,0.0,0.0
2,STI: Staying Education-ready in the New Normal...,0,0,sti stay educ readi new normal inquir technolo...,0.552273,0.143182
3,Digital Learning Through Digital RC Registr...,0,0,digit learn digit rc registr open soon websit ...,0.5,0.0
4,"Upswing Classroom: Out and Out Virtual School,...",1,0,upsw classroom virtual school not classroom ed...,0.0,0.0


In [19]:
df['Content'][4]

'Upswing Classroom: Out and Out Virtual School, Not Just a Classroom! - EdTechReview  education edtech educators students edchat learning teachers classroom DigitalLearning Teachers'

In [20]:
def getany(score):
    if score<0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'
df['Analysis'] = df['polority'].apply(getany)

In [21]:
df['Analysis'].unique()

array(['Neutral', 'Positive', 'Negative'], dtype=object)

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(df['corpus']).toarray()
y = df.iloc[:, -1].values

NameError: name 'df' is not defined

In [30]:
y

array(['Neutral', 'Neutral', 'Positive', ..., 'Positive', 'Neutral',
       'Positive'], dtype=object)

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [32]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB()

In [33]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[['Negative' 'Positive']
 ['Positive' 'Positive']
 ['Neutral' 'Positive']
 ...
 ['Neutral' 'Neutral']
 ['Neutral' 'Neutral']
 ['Neutral' 'Neutral']]


In [34]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[ 4578   550  1004]
 [  618 16044   306]
 [ 4146  1103 12180]]


0.8093463939401416