#### Imports

In [1]:
import nltk
import string
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from prettytable import PrettyTable

#### Load files, useful linguistic lists and tools

In [2]:
stop = set(stopwords.words('english'))
punct = set(string.punctuation)
punct.add('..')
punct.add('...')
lemma = WordNetLemmatizer()
tknzr = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

CONST_COMMONWORDS_ALL = "data/4000-most-common-english-words.csv"
# https://github.com/pkLazer/password_rank/blob/master/4000-most-common-english-words-csv.csv
common_words = pd.read_csv(CONST_COMMONWORDS_ALL)
common_words = list(common_words["words"])[:2000]

#### Load dataset

In [3]:
CONST_DATA_ALL = "data/Tweets.csv"
df = pd.read_csv(CONST_DATA_ALL)
cols = [10, 1]
df = df[df.columns[cols]]
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 2 columns):
text                 14640 non-null object
airline_sentiment    14640 non-null object
dtypes: object(2)
memory usage: 228.8+ KB
None


#### Useful functions

In [4]:
#check whether a sentence is in English or not using a 15% threshold
def isEnglish(sent):
    length = len(sent)
    cnt_of_english_word = 0
    for word in sent:
        if (word in common_words):
            cnt_of_english_word += 1
    if(cnt_of_english_word / length >= 0.15):
        return True
    return False

#### Process & Filter the document

In [5]:
#remove retweets
mask = (~(df["text"].str.contains("RT")))
df = df.loc[mask]

#remove tweets less than 20 characters in length
mask = ((df["text"].str.len() > 20))
df = df.loc[mask]

#case fold
df["text"] = df.apply(lambda row: row["text"].casefold(), axis=1)

#remove links
df["text"] = df.apply(lambda row: re.sub(r'http\S+', '', row["text"]), axis=1)

#remove punctuation
# df["text"] = df.apply(lambda row: re.sub(r'[^\w\s]','',row["text"]), axis=1)

#tokenize
df["text"] = df.apply(lambda row: [x for x in tknzr.tokenize(row["text"])], axis=1)

#remove non-english tweets
df['isEnglish'] = df['text'].apply(lambda row: isEnglish(row))
mask = (df['isEnglish'] == True)
df = df.loc[mask]

#remove stop words
df["text"] = df.apply(lambda row: [i for i in row["text"] if i not in stop], axis=1)

#remove punctuation
df["text"] = df.apply(lambda row: [s for s in row["text"] if s not in punct], axis=1)

#lemmatize
df["text"] = df.apply(lambda row: [lemma.lemmatize(word) for word in row["text"]], axis=1)

#### Train & test the models

In [6]:
#rejoin sentences (for TF-IDF to work)
df["text"] = df.apply(lambda row: " ".join(row["text"]), axis=1)

#split the dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["airline_sentiment"], test_size=0.2, random_state=0)

#build tf-idf index
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

#multinomial naive bayes
clf_nb = MultinomialNB()
clf_nb.fit(X_train, y_train)
y_res_nb = clf_nb.predict(X_test)
filtered_nb_f1 = f1_score(y_test, y_res_nb, average="micro")
print("MultinomialNB\n", filtered_nb_f1)

# k-nearest neighbors classifier
clf_k = KNeighborsClassifier()
clf_k.fit(X_train, y_train)
y_res_k = clf_k.predict(X_test)
filtered_k_f1 = f1_score(y_test, y_res_k, average="micro")
print("KNeighborsClassifier\n", filtered_k_f1)

#random forest classifier
clf_rf = RandomForestClassifier(random_state=0)
clf_rf.fit(X_train, y_train)
y_res_rf = clf_rf.predict(X_test)
filtered_rf_f1 = f1_score(y_test, y_res_rf, average="micro")
print("RandomForestClassifier\n", filtered_rf_f1)

MultinomialNB
 0.6879712746858169
KNeighborsClassifier
 0.6420107719928186
RandomForestClassifier
 0.7490125673249551


#### Load dataset

In [7]:
CONST_DATA_ALL = "data/Tweets.csv"
df = pd.read_csv(CONST_DATA_ALL)
cols = [10, 1]
df = df[df.columns[cols]]
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 2 columns):
text                 14640 non-null object
airline_sentiment    14640 non-null object
dtypes: object(2)
memory usage: 228.8+ KB
None


#### Process the document (No filtering)

In [8]:
#case fold
df["text"] = df.apply(lambda row: row["text"].casefold(), axis=1)

#remove links
df["text"] = df.apply(lambda row: re.sub(r'http\S+', '', row["text"]), axis=1)  

#remove punctuation
# df["text"] = df.apply(lambda row: re.sub(r'[^\w\s]','',row["text"]), axis=1)

#tokenize
df["text"] = df.apply(lambda row: [x for x in tknzr.tokenize(row["text"])], axis=1)

#remove stop words
df["text"] = df.apply(lambda row: [i for i in row["text"] if i not in stop], axis=1)

#remove punctuation
df["text"] = df.apply(lambda row: [s for s in row["text"] if s not in punct], axis=1)

#lemmatize
df["text"] = df.apply(lambda row: [lemma.lemmatize(word) for word in row["text"]], axis=1)

#### Train & test the model

In [9]:
#rejoin sentences (for TF-IDF to work)
df["text"] = df.apply(lambda row: " ".join(row["text"]), axis=1)

#split the dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["airline_sentiment"], test_size=0.2, random_state=0)

#build tf-idf index
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

#multinomial naive bayes
clf_nb = MultinomialNB()
clf_nb.fit(X_train, y_train)
y_res_nb = clf_nb.predict(X_test)
unfiltered_nb_f1 = f1_score(y_test, y_res_nb, average="micro")
print("MultinomialNB\n", unfiltered_nb_f1)

# k-nearest neighbors classifier
clf_k = KNeighborsClassifier()
clf_k.fit(X_train, y_train)
y_res_k = clf_k.predict(X_test)
unfiltered_k_f1 = f1_score(y_test, y_res_k, average="micro")
print("KNeighborsClassifier\n", unfiltered_k_f1)

#random forest classifier
clf_rf = RandomForestClassifier(random_state=0)
clf_rf.fit(X_train, y_train)
y_res_rf = clf_rf.predict(X_test)
unfiltered_rf_f1 = f1_score(y_test, y_res_rf, average="micro")
print("RandomForestClassifier\n", unfiltered_rf_f1)

MultinomialNB
 0.6933060109289617
KNeighborsClassifier
 0.4074453551912569
RandomForestClassifier
 0.7547814207650273


#### Print the results

In [10]:
results = PrettyTable(["Method","Filtered F1", "Unfiltered F1"])
results.add_row(["MultinomialNB", filtered_nb_f1, unfiltered_nb_f1])
results.add_row(["KNeighborsClassifier", filtered_k_f1, unfiltered_k_f1])
results.add_row(["RandomForestClassifier", filtered_rf_f1, unfiltered_rf_f1])
print(results)

+------------------------+--------------------+--------------------+
|         Method         |    Filtered F1     |   Unfiltered F1    |
+------------------------+--------------------+--------------------+
|     MultinomialNB      | 0.6879712746858169 | 0.6933060109289617 |
|  KNeighborsClassifier  | 0.6420107719928186 | 0.4074453551912569 |
| RandomForestClassifier | 0.7490125673249551 | 0.7547814207650273 |
+------------------------+--------------------+--------------------+
