In [53]:
import pandas as pd
import numpy as np
import unicodedata
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import string
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from numpy.linalg import norm
import matplotlib.pyplot as plt
from itertools import combinations
from sklearn.decomposition import PCA
import spacy
from spacy import displacy

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the tweet dataset; 'Tweets.csv' is resolved against the current working directory.
df = pd.read_csv('Tweets.csv')
# Preview the first rows (rendered as the cell output).
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


## Make X and y

In [3]:
# Features are the raw tweet texts; the target is the sentiment label of each tweet.
# Bracket access is used so the column lookup can never be shadowed by a DataFrame attribute.
X = df['text']
y = df['airline_sentiment']

## Create Bag of Words Matrix

In [6]:
# Lower-case every tweet and split it into tokens: one token list per tweet.
tokens = [word_tokenize(text.lower()) for text in X]

In [7]:
# Remove stop words and punctuation tokens
sw = stopwords.words('english')
pt = string.punctuation
# Membership in a frozenset is a hash lookup; testing against the list `sw`
# would rescan the whole stop-word list for every single token.
sw_lookup = frozenset(sw)
# NOTE: `token not in pt` is a substring test against the punctuation string,
# so a token is dropped only when it occurs verbatim inside `pt` (e.g. "!" or
# ","); a multi-character token such as "..." is kept.
filtered = [
    [token for token in row if token not in sw_lookup and token not in pt]
    for row in tokens
]

In [8]:
# Stemming: reduce every remaining token to its Snowball (English) stem.
# No lemmatization step is applied in this cell.
stemmer_snowball = SnowballStemmer('english')
tokens_stemsnowball = [
    [stemmer_snowball.stem(token) for token in row]
    for row in filtered
]

In [9]:
# Create N-grams: each document keeps its unigram stems and is extended with
# its bigrams, where the two stems of a bigram are joined by '-'.
# NOTE(review): a unigram that itself contains '-' would be indistinguishable
# from a bigram with the same spelling -- confirm whether that matters here.
documents = [
    row + ['-'.join(pair) for pair in ngrams(row, 2)]
    for row in tokens_stemsnowball
]

In [10]:
# Collect the distinct tokens, assign each one a column index, and allocate the
# (documents x vocabulary) count matrix, initially all zeros.
vocabulary = {token for row in documents for token in row}
# Sort before enumerating: the iteration order of a set of strings can differ
# between interpreter runs, which would reshuffle the columns on every restart.
vocabulary_lookup = {word: i for i, word in enumerate(sorted(vocabulary))}
# Dense float64 array: memory grows with len(documents) * len(vocabulary).
matrix = np.zeros((len(documents), len(vocabulary)))

In [11]:
# Fill in the counts: add one to the (document, token column) cell for every
# token occurrence. The update is in place, so re-running this cell without
# re-creating `matrix` first adds the counts a second time.
for doc_id, document in enumerate(documents):
    for word in document:
        matrix[doc_id, vocabulary_lookup[word]] += 1

In [13]:
# Shape of the raw text series (one entry per tweet).
X.shape

(14640,)

In [12]:
# Shape of the bag-of-words matrix: (number of documents, vocabulary size).
matrix.shape

(14640, 104804)

## Baseline Accuracy

In [32]:
# Number of tweets per sentiment label.
y.value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

In [37]:
# Baseline accuracy: the share of the most frequent class, i.e. the score of a
# model that always predicts that class. Taking the largest share avoids
# hard-coding which label happens to be the majority.
float(y.value_counts(normalize=True).max())

0.6269125683060109

## Naive Bayes

In [27]:
# Hold out 33% of the bag-of-words rows for testing; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    matrix,
    y,
    test_size=0.33,
    random_state=21,
)

In [30]:
# Fit multinomial Naive Bayes on the training counts and report its score
# (mean accuracy) on the held-out rows.
(
    MultinomialNB()
    .fit(X_train, y_train)
    .score(X_test, y_test)
)

0.7580711920529801

## Word Vectors (spaCy `en_core_web_lg`)

In [41]:
# Load spaCy's large English pipeline by package name; the "en_core_web_lg"
# model package must already be installed in the kernel's environment.
nlp = spacy.load("en_core_web_lg")

In [43]:
# One vector per tweet. `Doc.vector` defaults to the average of the token
# vectors, so tokenising with `make_doc` is sufficient; calling `nlp(t)` would
# also run every other pipeline component on each tweet without the result
# being used here.
X_vec = [nlp.make_doc(t).vector for t in X]

## KNN

In [44]:
# Same 33% hold-out and seed as the bag-of-words split, applied to the
# per-tweet vectors.
KX_train, KX_test, Ky_train, Ky_test = train_test_split(
    X_vec,
    y,
    test_size=0.33,
    random_state=21,
)

In [49]:
# 3-nearest-neighbours with scikit-learn's default distance metric
# (Minkowski with p=2, i.e. Euclidean); the cell output is the test accuracy.
(
    KNeighborsClassifier(n_neighbors=3)
    .fit(KX_train, Ky_train)
    .score(KX_test, Ky_test)
)

0.7251655629139073

In [54]:
# Cosine distance (1 - cosine similarity), usable as a custom KNN metric
def cosine_similarity(a, b):
    """Return the cosine *distance* between two 1-D vectors.

    Despite the name, the value returned is ``1 - cos(a, b)``: 0 for vectors
    pointing in the same direction, 1 for orthogonal vectors and 2 for
    opposite ones. The name is kept so existing callers keep working.

    Parameters
    ----------
    a, b : array-like supporting ``@`` and ``numpy.linalg.norm``
        The two vectors to compare; they must have the same length.

    Returns
    -------
    float
        Distance in ``[0, 2]``. If either vector has zero norm the cosine is
        undefined and 1 is returned.
    """
    denominator = norm(a) * norm(b)
    if denominator == 0:
        return 1
    distance = 1 - (a @ b) / denominator
    # Floating-point rounding can push the ratio slightly outside [-1, 1];
    # clamp so the result is never a (tiny) negative distance or above 2.
    return min(max(distance, 0.0), 2.0)

# 3-nearest-neighbours using the `cosine_similarity` callable as the distance
# metric; the cell output is the test accuracy.
# NOTE(review): scikit-learn also accepts metric='cosine' as a built-in, which
# is likely much faster than a Python callable -- confirm the scores match
# before switching.
(
    KNeighborsClassifier(n_neighbors=3, metric=cosine_similarity)
    .fit(KX_train, Ky_train)
    .score(KX_test, Ky_test)
)

0.722682119205298