#### Imports

In [1]:
import nltk
import string
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

#### Load files, useful lingquistic lists and tools

In [2]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
tknzr = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

CONST_DATA_ALL = "data/Tweets.csv"
CONST_COMMONWORDS_ALL = "data/4000-most-common-english-words.csv"
# https://github.com/first20hours/google-10000-english/blob/master/google-10000-english.txt
common_words = pd.read_csv(CONST_COMMONWORDS_ALL)
common_words = list(common_words["words"])[:2000]

df = pd.read_csv(CONST_DATA_ALL)
cols = [10, 1]
df = df[df.columns[cols]]
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 2 columns):
text                 14640 non-null object
airline_sentiment    14640 non-null object
dtypes: object(2)
memory usage: 228.8+ KB
None


#### Useful functions

In [3]:
#check wheether a sentence is in Egnlish or not using a 15% threshold
def isEnglish(sent):
    length = len(sent)
    cnt_of_english_word = 0
    for word in sent:
        if (word in common_words):
            cnt_of_english_word += 1
    if(cnt_of_english_word / length >= 0.15):
        return True
    return False

#### Process Tweets ( Process twitter-airline-sentiment dataset) (Filtered)

In [4]:
# Remove retweets
mask = (~(df["text"].str.contains("RT")))
df = df.loc[mask]

#remove tweets less than 20 characters in length
mask = ((df["text"].str.len() > 20))
df = df.loc[mask]

df["text"] = df['text'].str.replace('[^\w\s]','')


#case fold
df["text"] = df.apply(lambda row: row["text"].casefold(), axis=1)

#remove links
df["text"] = df.apply(lambda row: re.sub(r'http\S+', '', row["text"]), axis=1)  

#tokenize
df["text"] = df.apply(lambda row: [x for x in tknzr.tokenize(row["text"])], axis=1)

#remove non-english tweets
df['isEnglish'] = df['text'].apply(lambda row: isEnglish(row))
mask = (df['isEnglish'] == True)
df = df.loc[mask]

#remove stop words
df["text"] = df.apply(lambda row: [i for i in row["text"] if i not in stop], axis=1)

#remove punctuation
df["text"] = df.apply(lambda row: [ch for ch in row["text"] if ch not in exclude], axis=1)

#lemmatize
df["text"] = df.apply(lambda row: [lemma.lemmatize(word) for word in row["text"]], axis=1)



#rejoin sentences (for TF-IDF to work)
df["text"] = df.apply(lambda row: " ".join(row["text"]), axis=1)

tfidf = TfidfVectorizer(norm=None)

X_train, X_test, y_train, y_test = train_test_split(df["text"], df["airline_sentiment"], test_size=0.2, random_state=0)

X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

clf_nb = MultinomialNB()
clf_nb.fit(X_train, y_train)

y_res_nb = clf_nb.predict(X_test)

print("MultinomialNB\n", f1_score(y_test, y_res_nb, average="micro"))

clf_k = KNeighborsClassifier()
clf_k.fit(X_train, y_train)

y_res_k = clf_k.predict(X_test)

print("KNeighborsClassifier\n", f1_score(y_test, y_res_k, average="micro"))

clf_rf = RandomForestClassifier(random_state=0)
clf_rf.fit(X_train, y_train)

y_res_rf = clf_rf.predict(X_test)

print("RandomForestClassifier\n", f1_score(y_test, y_res_rf, average="micro"))

MultinomialNB
 0.7470609191307446
KNeighborsClassifier
 0.6134663341645885




RandomForestClassifier
 0.7377983612397577


In [5]:
CONST_DATA_ALL = "data/Tweets.csv"
df = pd.read_csv(CONST_DATA_ALL)
cols = [10, 1]
df = df[df.columns[cols]]
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 2 columns):
text                 14640 non-null object
airline_sentiment    14640 non-null object
dtypes: object(2)
memory usage: 228.8+ KB
None


In [6]:
#case fold
df["text"] = df.apply(lambda row: row["text"].casefold(), axis=1)

#remove links
df["text"] = df.apply(lambda row: re.sub(r'http\S+', '', row["text"]), axis=1)  

#tokenize
df["text"] = df.apply(lambda row: [x for x in tknzr.tokenize(row["text"])], axis=1)

#remove non-english tweets
df['isEnglish'] = df['text'].apply(lambda row: isEnglish(row))
mask = (df['isEnglish'] == True)
df = df.loc[mask]

#remove stop words
df["text"] = df.apply(lambda row: [i for i in row["text"] if i not in stop], axis=1)

#remove punctuation
df["text"] = df.apply(lambda row: [ch for ch in row["text"] if ch not in exclude], axis=1)

#lemmatize
df["text"] = df.apply(lambda row: [lemma.lemmatize(word) for word in row["text"]], axis=1)



In [7]:
#rejoin sentences (for TF-IDF to work)
df["text"] = df.apply(lambda row: " ".join(row["text"]), axis=1)

cv = CountVectorizer()
tfidf = TfidfVectorizer(norm=None)

X_train, X_test, y_train, y_test = train_test_split(df["text"], df["airline_sentiment"], test_size=0.2, random_state=0)



X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

clf_nb = MultinomialNB()
clf_nb.fit(X_train, y_train)

y_res_nb = clf_nb.predict(X_test)
print("Naive Bayed\n", f1_score(y_test, y_res_nb, average="micro"))

clf_k = KNeighborsClassifier()
clf_k.fit(X_train, y_train)

y_res_k = clf_k.predict(X_test)
print("nKNeighborsClassifier\n", f1_score(y_test, y_res_k, average="micro"))


clf_rf = RandomForestClassifier(random_state=0)
clf_rf.fit(X_train, y_train)

y_res_rf = clf_rf.predict(X_test)
print("RandomForestClassifier\n", f1_score(y_test, y_res_rf, average="micro"))

Naive Bayed
 0.7340162486753797
nKNeighborsClassifier
 0.5549275874249382




RandomForestClassifier
 0.7290709996467679
