In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing
import os
# import tweepy as tw #for accessing Twitter API


#For Preprocessing
import re    # RegEx for removing non-letter characters
# import nltk  #natural language processing
# nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import *

# For Building the model
from sklearn.model_selection import train_test_split
# import tensorflow as tf
# import seaborn as sns

# #For data visualization
# import matplotlib.pyplot as plt
# import matplotlib.patches as mpatches
# %matplotlib inline

# pd.options.plotting.backend = "plotly"

In [2]:
# Read the labelled tweets from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='Twitter_Data.csv')
# Preview the first five rows
df.head(n=5)

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [3]:
# Report how many missing values each column holds before cleaning
print(df.isna().sum())

# Remove, in place, every row that has a missing value in any column
df.dropna(axis='index', inplace=True)

clean_text    4
category      7
dtype: int64


In [4]:
# Human-readable names for the numeric sentiment codes
category_names = {-1.0: 'Negative', 0.0: 'Neutral', 1.0: 'Positive'}

# replace() leaves any value that is not a key untouched, so re-running this
# cell is a no-op. Series.map() would instead turn the already-mapped strings
# into NaN on a second run, silently destroying the labels.
df['category'] = df['category'].replace(category_names)

# Preview the first five rows
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,Negative
1,talk all the nonsense and continue all the dra...,Neutral
2,what did just say vote for modi welcome bjp t...,Positive
3,asking his supporters prefix chowkidar their n...,Positive
4,answer who among these the most powerful world...,Positive


In [5]:
# Built once when the cell runs. Previously the stopword list was rebuilt (and
# scanned linearly) for every word, and a new stemmer was created per word,
# which dominates the runtime when the function is mapped over the whole dataset.
_NON_ALPHANUMERIC = re.compile(r"[^a-zA-Z0-9]")
_ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
_STEMMER = PorterStemmer()


def tweet_to_words(tweet):
    ''' Convert tweet text into a list of stemmed, stopword-free tokens.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Lower-cased tokens with every character other than ASCII letters and
        digits treated as a separator, English stopwords removed, and the
        Porter stemmer applied to each remaining token. Order is preserved.
    '''
    # Lowercase, then blank out everything that is not an ASCII letter or digit
    # (digits are kept; punctuation and non-ASCII characters become spaces)
    text = _NON_ALPHANUMERIC.sub(" ", tweet.lower())
    # Tokenize on whitespace, drop stopwords, stem what is left
    return [
        _STEMMER.stem(word)
        for word in text.split()
        if word not in _ENGLISH_STOPWORDS
    ]

# Show the effect of the preprocessing on the first tweet in the frame.
# .iloc[0] is positional: the index is not reset after rows are dropped, so a
# label lookup with [0] would raise KeyError if the row labelled 0 were removed.
sample_tweet = df['clean_text'].iloc[0]
print("\nOriginal tweet ->", sample_tweet)
print("\nProcessed tweet ->", tweet_to_words(sample_tweet))




Original tweet -> when modi promised “minimum government maximum governance” expected him begin the difficult job reforming the state why does take years get justice state should and not business and should exit psus and temples

Processed tweet -> ['modi', 'promis', 'minimum', 'govern', 'maximum', 'govern', 'expect', 'begin', 'difficult', 'job', 'reform', 'state', 'take', 'year', 'get', 'justic', 'state', 'busi', 'exit', 'psu', 'templ']


In [6]:
# Apply data processing to each tweet
X = list(map(tweet_to_words, df['clean_text']))

from sklearn.preprocessing import LabelEncoder

# Encode target labels
le = LabelEncoder()
Y = le.fit_transform(df['category'])

# Apply data processing to each tweet
X = list(map(tweet_to_words, df['clean_text']))

In [7]:
# Hold out 20% of the tweets for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

# Report the size of the full set and of each partition, one per line
print(
    'Number of tweets in the total set : {}'.format(len(X)),
    'Number of tweets in the training set : {}'.format(len(X_train)),
    'Number of tweets in the testing set : {}'.format(len(X_test)),
    sep='\n',
)


Number of tweets in the total set : 162969
Number of tweets in the training set : 130375
Number of tweets in the testing set : 32594


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Keep only the most frequent tokens as features
vocabulary_size = 5000

# The tweets are already lists of preprocessed tokens, so identity functions are
# passed for the preprocessor and tokenizer steps. token_pattern=None states
# explicitly that the default regex is not used, which avoids scikit-learn's
# "token_pattern will not be used" warning when a custom tokenizer is supplied.
# (Pass ngram_range=(1, 2) here to add bigram features as well.)
count_vector = CountVectorizer(
    max_features=vocabulary_size,
    preprocessor=lambda x: x,
    tokenizer=lambda x: x,
    token_pattern=None,
)

# Learn the vocabulary on the training tweets only, then encode both splits.
# The results are kept as SciPy sparse matrices: a dense copy of the training
# matrix alone (130375 rows x up to 5000 int64 columns) would need about 5 GB,
# and scikit-learn estimators such as MultinomialNB accept sparse input directly.
X_train = count_vector.fit_transform(X_train)
X_test = count_vector.transform(X_test)

In [10]:
# Multinomial Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB

# Create the classifier with its default settings
model = MultinomialNB()

# Train on the vectorized training tweets and their encoded labels
model.fit(X_train, y_train)

MultinomialNB()

In [11]:
# Evaluation helpers
from sklearn.metrics import accuracy_score, confusion_matrix

# Predict a label for every tweet in the held-out test set
predicted = model.predict(X_test)

# Accuracy: share of test tweets whose predicted label equals the true label
Accuracy = accuracy_score(y_true=y_test, y_pred=predicted)
Accuracy

0.7438485610848622

In [12]:
# Dimensions of df as (rows, columns), after the rows with missing values were dropped in place earlier
df.shape

(162969, 2)