In [6]:
import nltk
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pylab as pl

In [7]:
def preprocess(tweet):
    
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    
    #Convert @username to __USERHANDLE
    tweet = re.sub('@[^\s]+','__USERHANDLE',tweet)  
    
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    
    #trim
    tweet = tweet.strip('\'"')
    
    # Repeating words like hellloooo
    repeat_char = re.compile(r"(.)\1{1,}", re.IGNORECASE)
    tweet = repeat_char.sub(r"\1\1", tweet)
    
    #Emoticons
    emoticons = \
    [
     ('__positive__',[ ':-)', ':)', '(:', '(-:', \
                       ':-D', ':D', 'X-D', 'XD', 'xD', \
                       '<3', ':\*', ';-)', ';)', ';-D', ';D', '(;', '(-;', ] ),\
     ('__negative__', [':-(', ':(', '(:', '(-:', ':,(',\
                       ':\'(', ':"(', ':((','D:' ] ),\
    ]

    def replace_parenthesis(arr):
       return [text.replace(')', '[)}\]]').replace('(', '[({\[]') for text in arr]
    
    def join_parenthesis(arr):
        return '(' + '|'.join( arr ) + ')'

    emoticons_regex = [ (repl, re.compile(join_parenthesis(replace_parenthesis(regx))) ) \
            for (repl, regx) in emoticons ]
    
    for (repl, regx) in emoticons_regex :
        tweet = re.sub(regx, ' '+repl+' ', tweet)

     #Convert to lower case
    tweet = tweet.lower()
    
    return tweet


In [8]:
#Stemming of Tweets

def stem(tweet):
        stemmer = nltk.stem.PorterStemmer()
        tweet_stem = ''
        words = [word if(word[0:2]=='__') else word.lower() \
                    for word in tweet.split() \
                    if len(word) >= 3]
        words = [stemmer.stem(w) for w in words] 
        tweet_stem = ' '.join(words)
        return tweet_stem

In [9]:
import pandas as pd


dataset = pd.read_csv('Training.csv',encoding='ISO-8859-1',header=None)
X=dataset.iloc[:,5].values
X=pd.Series(X)
y=dataset.iloc[:,0].values
'''
for row in range(0,1600000):
    if y[row]==4:
        y[row]=1
    else:
        y[row]=0
'''

'\nfor row in range(0,1600000):\n    if y[row]==4:\n        y[row]=1\n    else:\n        y[row]=0\n'

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20, random_state=5)

X_train = [stem(preprocess(tweet)) for tweet in X_train]
X_test = [stem(preprocess(tweet)) for tweet in X_test]

vec = TfidfVectorizer(min_df=5, max_df=0.95, sublinear_tf = True,use_idf = True,ngram_range=(1, 2))
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)
nb = MultinomialNB()

In [11]:
nb.fit(X_train_vec, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
nb_predicted= nb.predict(X_test_vec)

In [13]:

nb_score = round(nb.score(X_train_vec, y_train) * 100, 2)
nb_score_test = round(nb.score(X_test_vec, y_test) * 100, 2)

In [15]:
print('Naive Bayes  Training Score: \n', nb_score)
print('Naive bayes Test Score: \n', nb_score_test)
print('Coefficient: \n', nb.coef_)
print('Intercept: \n', nb.intercept_)
print('Accuracy: \n', metrics.accuracy_score(y_test,nb_predicted))
print('Confusion Matrix: \n', metrics.confusion_matrix(y_test,nb_predicted))
print('Classification Report: \n', metrics.classification_report(y_test,nb_predicted))

('Naive Bayes  Training Score: \n', 82.87)
('Naive bayes Test Score: \n', 79.83)
('Coefficient: \n', array([[ -9.51111953, -12.27963753, -14.75549296, ..., -13.19106124,
        -12.85046565, -12.6598296 ]]))
('Intercept: \n', array([-0.69347692]))
('Accuracy: \n', 0.798309375)
('Confusion Matrix: \n', array([[128419,  31370],
       [ 33171, 127040]]))
('Classification Report: \n', u'              precision    recall  f1-score   support\n\n           0       0.79      0.80      0.80    159789\n           4       0.80      0.79      0.80    160211\n\n   micro avg       0.80      0.80      0.80    320000\n   macro avg       0.80      0.80      0.80    320000\nweighted avg       0.80      0.80      0.80    320000\n')
