In [1]:
# Importing required libraries
import string
import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Loading training and testing data
train = pd.read_csv('./training_data.csv')
test = pd.read_csv('./test_data.csv')

# Data Preprocessing

In [3]:
train.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


In [4]:
train.isnull().sum()

tweet_id                      0
airline_sentiment             0
airline                       0
airline_sentiment_gold    10949
name                          0
negativereason_gold       10956
retweet_count                 0
text                          0
tweet_coord               10204
tweet_created                 0
tweet_location             3550
user_timezone              3577
dtype: int64

In [5]:
train_data = np.array(train['text'])
train_output = np.array(train['airline_sentiment'])
test_data = np.array(test['text'])

In [6]:
# Using Lemmatizer to group togetger forms of inflected forms
lemmatizer = WordNetLemmatizer()

In [7]:
# To remove Airline names from tweets
not_reqd = set(train['airline'])
not_reqd

{'American', 'Delta', 'Southwest', 'US Airways', 'United', 'Virgin America'}

In [8]:
# Importing stopwords using nltk and adding punctuations as well as few more words
stops = set(stopwords.words("english"))
punctuations = string.punctuation
stops = list(stops) + list(punctuations) + list(not_reqd) + ['@','http','americanair', 'JetBlue','SouthwestAir', 'USAirways']

In [9]:
'i' in stops

True

In [10]:
# Function to return POS in the format required by lemmatizer
def simple_pos(word):
    if word.startswith('J'):
        return wordnet.ADJ
    if word.startswith("V"):
        return wordnet.VERB
    if word.startswith("R"):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# Function to remove stops(stopwords) from the tweets
def clean_word(words):
    output_words = []
    for w in words:
        if (w.lower() not in stops) and (w.isnumeric() == False):
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = simple_pos(pos[0][1]))
            output_words.append(clean_word)
    return output_words

In [11]:
train_data

array(['@SouthwestAir I am scheduled for the morning, 2 days after the fact, yes..not sure why my evening flight was the only one Cancelled Flightled',
       '@SouthwestAir seeing your workers time in and time out going above and beyond is why I love flying with you guys. Thank you!',
       '@united Flew ORD to Miami and back and  had great crew, service on both legs. THANKS',
       ..., '@usairways the. Worst. Ever. #dca #customerservice',
       '@nrhodes85: look! Another apology. DO NOT FLY @USAirways',
       '@united you are by far the worst airline. 4 plane delays on 1 round trip flight. How is that possible.'],
      dtype=object)

In [12]:
# Joining the words in sentence after cleaning the words
train = [' '.join(clean_word(word_tokenize(word))) for word in train_data]

In [13]:
# Splitting the data
x_train1, x_train2, y_train1, y_train2 = train_test_split(train, train_output, random_state = 0)

In [14]:
# Applying Count Vectorizer on training data
count_vec = CountVectorizer(max_features=4000)
x_train1_features = count_vec.fit_transform(x_train1)
x_train1_features

<8235x4000 sparse matrix of type '<class 'numpy.int64'>'
	with 73305 stored elements in Compressed Sparse Row format>

In [15]:
# Applying Count Vectorizer on testing data
x_train2_features = count_vec.transform(x_train2)
x_train2_features

<2745x4000 sparse matrix of type '<class 'numpy.int64'>'
	with 24225 stored elements in Compressed Sparse Row format>

In [16]:
# List of features selected
count_vec.get_feature_names()

In [17]:
# Applying SVM classifier on data
from sklearn.svm import SVC

svc = SVC()
svc.fit(x_train1_features, y_train1)
svc.score(x_train2_features, y_train2)

0.7679417122040073

In [18]:
# Applying Multinomial Naive Bayes classifier on data

nb = MultinomialNB()
nb.fit(x_train1_features, y_train1)
nb.score(x_train2_features, y_train2)

0.7581056466302368

In [19]:
# Applying Random Forest classifier on data
rf = RandomForestClassifier()
rf.fit(x_train1_features, y_train1)
rf.score(x_train2_features, y_train2)

0.7548269581056466

In [20]:
# Applying Logistic Regression classifier on data
lr = LogisticRegression(max_iter=200)
lr.fit(x_train1_features, y_train1)
lr.score(x_train2_features, y_train2)

0.7763205828779599