In [1]:
## Import Modules 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
import math
import re
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
## Read in dataset and select only positive or negative tweets denoted by Ran 
df = pd.read_csv('final.csv',encoding="ISO-8859-1")
df['Sentiment'].unique()
df = df[df.S]
df.shape

KeyError: 'Ran'

In [7]:
## Clean the tweet message
def clean_tweet(tweet):
    '''
    Utility function to clean the text in a tweet by removing 
    links and special characters using regex.
    '''
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())
# Update the text by the cleaned tweet
for index, row in df.iterrows():
    #print(row.text)
    row.text = TextBlob(clean_tweet(row.text))

In [54]:
## Split dataset as training and test data with ratio 0.2 (80% training and 20% test)
from sklearn.model_selection import train_test_split
X = df.text
y = df.Ran
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [55]:
## Covert test as one hot vector 
## "max_features = 1000" is a tunable parameter, we can either increase it or decrease it considering the size of the dataset,
## typically increase it for large data and decrease it for small data
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(max_features=1000, binary=True)
X_train_vect = vect.fit_transform(X_train)
## SMOTE function is to balance dataset, for example if there is too many positive tweets than negative tweets, SMOTE helps you oversample the 
## negative tweets in order to keep the numbers of the two types in the same level (or equal) 
from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_train_res, y_train_res = sm.fit_sample(X_train_vect, y_train)
## Train naive bayes model on the oversampled training dataset
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_res, y_train_res)
nb.score(X_train_res, y_train_res)

In [60]:
## Fit model on the test data
X_test_vect = vect.transform(X_test)
y_pred = nb.predict(X_test_vect)
y_pred

array(['NG', 'NG', 'P', 'NG', 'NG', 'P', 'P', 'P', 'NG', 'P', 'P', 'NG'],
      dtype='<U2')

In [63]:
## Accuracy and confusion matrix on test dataset 
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("\nCOnfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 75.00%

COnfusion Matrix:
 [[3 0]
 [3 6]]


In [65]:
df = pd.read_csv('final.csv',encoding="ISO-8859-1")
df.head()

Unnamed: 0,TweetID,replyToTweet,fromUser,toUser,mentions,text,createdAt,latitude,longitude,Sentiment
0,2.63e+17,,247102965,,,Sandy can you come in hot #please #noschool,Sun Oct 28 19:08:57 +0000 2012,39.346585,-76.623635,1
1,2.63e+17,,838309267,,,Wegmans before Hurricane Sandy's storm effects...,Sun Oct 28 16:26:21 +0000 2012,38.851924,-77.347214,1
2,2.63e+17,,756132696,,,Thank you #hurricanesandy for making me walk t...,Mon Oct 29 13:54:52 +0000 2012,42.111231,-77.944133,0
3,2.63e+17,,243472700,,,Power has been out now for three hours in Hazl...,Mon Oct 29 21:49:53 +0000 2012,40.432716,-74.170199,0
4,2.63e+17,,19210782,,,I'm at Frankenstorm Apocalypse - Hurricane San...,Mon Oct 29 06:31:25 +0000 2012,40.784701,-73.786926,0
