## Naive Bayes

This notebook has been updated to match our newest preprocessing and subsampling.

In [1]:
import pandas as pd

# Training Set:
!gdown https://drive.google.com/uc?id=1BLT_kqE1Uz0-NNDU1_BfoSPyV_EXGOxT
# Test Set:
!gdown https://drive.google.com/uc?id=1bF3gAkMFprumMgRWoNW3PFWXsxBY1Fqj

tweets_train = pd.read_excel('Tweet_Train_Set.xlsx', encoding='UTF-8')
tweets_test = pd.read_excel('Tweet_Test_Set.xlsx', encoding='UTF-8')


Downloading...
From: https://drive.google.com/uc?id=1BLT_kqE1Uz0-NNDU1_BfoSPyV_EXGOxT
To: /content/Tweet_Train_Set.xlsx
100% 1.22M/1.22M [00:00<00:00, 77.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1bF3gAkMFprumMgRWoNW3PFWXsxBY1Fqj
To: /content/Tweet_Test_Set.xlsx
100% 535k/535k [00:00<00:00, 87.0MB/s]


In [0]:
# Import pandas and sklearn:

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

#### Preprocessing

---
This section uses our latest preprocessing.

In [0]:
def preprocess(tweets):
  """Clean, tokenize, stop-word-filter and stem the ``text`` column of a tweet frame.

  The frame is modified in place (three columns are added) and the same object
  is returned, so ``df = preprocess(df)`` and a bare ``preprocess(df)`` both work.

  Columns written:
    punct_removed -- lower-cased text with @mentions replaced by a placeholder,
                     URLs removed, and every character other than letters,
                     digits, '!', '(', ')' and space stripped.
    tokenized     -- NLTK word tokens of ``punct_removed`` minus English stop words.
    stemmed       -- Snowball-stemmed version of ``tokenized``.

  Args:
    tweets: pandas DataFrame with a string column named ``text``.

  Returns:
    The same DataFrame object, with the three columns above added.
  """
  import nltk
  from nltk.stem import SnowballStemmer
  from nltk.tokenize import word_tokenize
  from nltk.corpus import stopwords

  # Make sure the required NLTK resources are present (no-op when already cached).
  nltk.download('stopwords')
  nltk.download('punkt')

  ss = SnowballStemmer('english')

  # regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
  # literal string by default, which would silently skip all of this cleaning.
  cleaned = tweets['text'].str.replace(r'@\w+', r'at_user', regex=True)
  cleaned = cleaned.str.replace(r"http\S+", r'', regex=True)
  # Note: '_' is not in the allowed set, so the 'at_user' placeholder inserted
  # above ends up as 'atuser' after this step.
  cleaned = cleaned.str.replace(r'[^a-zA-Z0-9!)\( ]', r'', regex=True)

  tweets['punct_removed'] = cleaned.str.lower()
  tweets['tokenized'] = tweets['punct_removed'].apply(word_tokenize)

  # A set gives constant-time membership checks for the per-token filter.
  stops = set(stopwords.words('english'))
  tweets['tokenized'] = tweets['tokenized'].apply(
      lambda tokens: [tok for tok in tokens if tok not in stops])

  tweets['stemmed'] = tweets['tokenized'].apply(
      lambda tokens: [ss.stem(tok) for tok in tokens])

  return tweets




In [4]:
# Run the shared cleaning pipeline on both splits (train first, then test).
tweets_train, tweets_test = [
    preprocess(split) for split in (tweets_train, tweets_test)
]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
def do_nothing(input):
  """Return the argument unchanged.

  Used as a pass-through hook so already-tokenized documents reach the
  vectorizer exactly as they are.
  """
  return input

# Builds the document-term count matrix. The documents are already lists of
# stemmed tokens, so the analyzer simply passes each list through. A callable
# analyzer replaces the preprocessing and tokenizing steps entirely, so
# `preprocessor` / `tokenizer` are not passed: scikit-learn ignores them in
# that case and warns that they are unused.
cv = CountVectorizer(analyzer=do_nothing)

In [0]:
# Labels: fit the encoder on the training sentiments, then reuse the same
# class-to-integer mapping for the test sentiments.
train_labels = tweets_train['airline_sentiment']
test_labels = tweets_test['airline_sentiment']

le = LabelEncoder()
y_train = le.fit_transform(train_labels)
y_test = le.transform(test_labels)

# Features: learn the vocabulary from the training tokens only, then project
# the test tokens onto that same vocabulary.
X_train = cv.fit_transform(tweets_train['stemmed'])
X_test = cv.transform(tweets_test['stemmed'])

#### Init and Train the Naive Bayes Model
___

In [0]:
# The features are word counts, so use the multinomial variant of Naive Bayes.
# This only creates the untrained model; the hyperparameters are spelled out
# at their default values.
nb = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)

In [8]:
# X is the document-term count matrix, y the integer-encoded sentiment labels.
nb.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

#### Evaluate model

In [9]:
# Mean accuracy of the fitted model on the held-out test set.
nb.score(X=X_test, y=y_test)

0.7343676539727315

In [0]:
# Predicted (integer-encoded) sentiment class for every test tweet.
y_hat = nb.predict(X=X_test)

In [0]:
# create confusion matrix:
# Pin the label order to every class the encoder knows, so the matrix is always
# n_classes x n_classes and the row/column names line up even if some class is
# absent from both y_test and y_hat.
class_ids = le.transform(le.classes_)
cf = pd.DataFrame(confusion_matrix(y_test, y_hat, labels=class_ids),
                  index=le.classes_,
                  columns=le.classes_)

In [12]:
# Rows are the true labels (y_test), columns the predicted labels (y_hat).
cf

Unnamed: 0,negative,neutral,positive
negative,581,65,63
neutral,161,399,149
positive,72,55,582


In [13]:
# Macro averaging: each metric is computed per class and then averaged with
# equal weight per class.
prfs = precision_recall_fscore_support(y_test, y_hat, average='macro')

# Only the first three entries (precision, recall, F-score) are reported.
print("Precision:\t{0:.4f} \nRecall: \t{1:.4f} \nF Score: \t{2:.4f}".format(*prfs[:3]))

Precision:	0.7385 
Recall: 	0.7344 
F Score: 	0.7291
