In [20]:
import pandas as pd

## Loading Data

In [21]:
train_data = pd.read_csv("x_y_train.csv")
test_data = pd.read_csv("x_test.csv")

In [24]:
## Training data
x_train_data = train_data["text"]
y_train_data = train_data["airline_sentiment"]

## Testing data
x_test_data = test_data["text"]

## Tokenising Words in text

In [29]:
from nltk.tokenize import word_tokenize

In [162]:
## Training data
x_train = []
for i in range(len(x_train_data)):
    x_train.append(word_tokenize(x_train_data[i].lower()))
    
## Testing data
x_test = []
for i in range(len(x_test_data)):
    x_test.append(word_tokenize(x_test_data[i].lower()))

## Data Cleaning: Removing twitter-handles, stop-words, punctuations and taking care of uppercase

In [163]:
## Remove the twitter handles
for x in x_train:
    handles = []
    for i in range(len(x)):
        if x[i] == str('@'):
            handles.append(x[i])
            handles.append(x[i+1])
#     print(handles)  
    for p in handles:
        x.remove(p)

In [164]:
## Importing stopwords
from nltk.corpus import stopwords
stop = stopwords.words("english")

## Importing punctuations
import string
punctuations = string.punctuation

## Adding Punctuations to our stop words
stop += punctuations

In [170]:
from nltk.corpus import wordnet

## Function to change the pos_tag to simpler values which can be passed to lemmatize

def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [172]:
## Importing Lemmatizer
from  nltk  import  pos_tag
from  nltk.stem  import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [174]:
pos_tag(["appple"])

[('appple', 'NN')]

In [175]:
## Function to clean the words by removing stopwords, taking care of cases and lemmatizing the word to give root words only
def cleaned(words):
    output_words = []
    for w in words:
        if w.lower() not in stop:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [176]:
x_train_doc = [(cleaned(x_train[i]), y_train[i] ) for i in range(len(x_train))]

## Count Vectoriser

In [186]:
from sklearn.model_selection import train_test_split
x_train_train, x_train_test, y_train_train, y_train_test = train_test_split(x_train_data, y_train_data, random_state = 0, test_size = 0.25)

In [183]:
from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer(max_features=100, ngram_range=(1,2), stop_words=stop)

In [187]:
## Fit transform the training data
train_features = count_vec.fit_transform(x_train_train)

## Only transform the testing data according to the features which was fit using x_train
test_features = count_vec.transform(x_train_test)

In [194]:
count_vec.get_feature_names()

['agent',
 'airline',
 'airport',
 'americanair',
 'amp',
 'another',
 'back',
 'bag',
 'baggage',
 'bags',
 'call',
 'cancelled',
 'cancelled flighted',
 'cancelled flightled',
 'change',
 'check',
 'co',
 'could',
 'customer',
 'customer service',
 'day',
 'delay',
 'delayed',
 'dm',
 'due',
 'email',
 'even',
 'ever',
 'first',
 'flight',
 'flighted',
 'flightled',
 'flights',
 'fly',
 'flying',
 'gate',
 'get',
 'getting',
 'go',
 'going',
 'good',
 'got',
 'great',
 'guys',
 'help',
 'hold',
 'home',
 'hour',
 'hours',
 'http',
 'http co',
 'jetblue',
 'know',
 'last',
 'late',
 'late flight',
 'like',
 'lost',
 'love',
 'luggage',
 'make',
 'minutes',
 'much',
 'need',
 'never',
 'new',
 'next',
 'one',
 'people',
 'phone',
 'plane',
 'please',
 'really',
 'seat',
 'service',
 'someone',
 'southwestair',
 'still',
 'take',
 'thank',
 'thanks',
 'ticket',
 'time',
 'today',
 'told',
 'tomorrow',
 'travel',
 'trying',
 'united',
 'us',
 'usairways',
 'virginamerica',
 'wait',
 'wai

In [192]:
## Applying SVC
from sklearn.svm import SVC
svc = SVC()
svc.fit(train_features, y_train_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [195]:
svc.score(test_features, y_train_test)

0.69071038251366124

In [198]:
## Applying Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(train_features, y_train_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [199]:
rf.score(test_features, y_train_test)

0.68160291438979959

In [205]:
## Applying Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
bayes = MultinomialNB()
bayes.fit(train_features, y_train_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [206]:
bayes.score(test_features, y_train_test)

0.70236794171220396