### Importing libraries

In [1]:
import numpy as np
import pandas as pd

### Loading and analysing the data

In [2]:
x_y_train = pd.read_csv("twitter_x_y_train.csv")

In [3]:
x_y_train

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)
5,569677636613439488,negative,United,,bobgiolito,,0,@united Why did you load us in this flying sar...,,2015-02-22 17:58:27 -0800,"Los Angeles, CA",Pacific Time (US & Canada)
6,569658903044218880,negative,Delta,,aaronkinnari,,0,@JetBlue that is a stock response. Delays not ...,,2015-02-22 16:44:00 -0800,Gotham,Quito
7,568542766860541952,positive,Delta,,TimothySays,,0,@JetBlue That'd be nice! Hoping to rack up eno...,,2015-02-19 14:48:53 -0800,"Burlington, MA",Eastern Time (US & Canada)
8,570116209263427584,negative,United,,lindaSWC,,1,@united frankly worse customer service ever. P...,,2015-02-23 23:01:11 -0800,,
9,568870144891600896,positive,Southwest,,amyums,,0,@SouthwestAir yeah haha. Never been in one. It...,,2015-02-20 12:29:46 -0800,,Central Time (US & Canada)


In [4]:
x_y_train.columns

Index(['tweet_id', 'airline_sentiment', 'airline', 'airline_sentiment_gold',
       'name', 'negativereason_gold', 'retweet_count', 'text', 'tweet_coord',
       'tweet_created', 'tweet_location', 'user_timezone'],
      dtype='object')

In [5]:
x_y_train.drop(['tweet_id', 'airline', 'airline_sentiment_gold',
       'name', 'negativereason_gold', 'retweet_count', 'tweet_coord',
       'tweet_created', 'tweet_location', 'user_timezone'], axis = 1, inplace = True)

In [6]:
x_y_train

Unnamed: 0,airline_sentiment,text
0,negative,"@SouthwestAir I am scheduled for the morning, ..."
1,positive,@SouthwestAir seeing your workers time in and ...
2,positive,@united Flew ORD to Miami and back and had gr...
3,negative,@SouthwestAir @dultch97 that's horse radish 😤🐴
4,negative,@united so our flight into ORD was delayed bec...
5,negative,@united Why did you load us in this flying sar...
6,negative,@JetBlue that is a stock response. Delays not ...
7,positive,@JetBlue That'd be nice! Hoping to rack up eno...
8,negative,@united frankly worse customer service ever. P...
9,positive,@SouthwestAir yeah haha. Never been in one. It...


In [7]:
x_y_train.shape

(10980, 2)

In [8]:
 x_y_train["text"][222]

"@united wont transfer flight ticket to accompany an 11 yr old who's active military mom had to have emergency brain surgery? WOW!!"

### Preparing the training and test dataset according to how we need them

In [9]:
y = x_y_train["airline_sentiment"]

In [10]:
y.shape

(10980,)

In [11]:
x_y_train.text[2]

'@united Flew ORD to Miami and back and  had great crew, service on both legs. THANKS'

In [12]:
def Punctuation(string): # removing punctuations in text only before splitting them
  
    # punctuation marks 
    punctuationss = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
  
    # traverse the given string and if any punctuation 
    # marks occur replace it with null 
    for x in string.lower(): 
        if x in punctuationss: 
            string = string.replace(x, "") 
    return string

In [13]:
text_documents = []
for text_doc in x_y_train.text:
    text_documents.append(Punctuation(text_doc).split(" "))

In [14]:
len(text_documents)

10980

In [15]:
text_documents[10979]

['united',
 'you',
 'are',
 'by',
 'far',
 'the',
 'worst',
 'airline',
 '4',
 'plane',
 'delays',
 'on',
 '1',
 'round',
 'trip',
 'flight',
 'How',
 'is',
 'that',
 'possible']

#### Importing NLTK libraries

In [16]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


####  Collecting all the stop words

In [17]:
from string import punctuation

In [18]:
stops = stopwords.words("english")
stops = stops+ list(punctuation) + [" ",""] # also removing blanks and spaces

In [19]:
stops

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

#### Cleaning the tweets

In [20]:
def get_pos_tag(pos):
    if pos.startswith("J"):
        return wordnet.ADJ
    elif pos.startswith("V"):
        return wordnet.VERB
    elif pos.startswith("N"):
        return wordnet.NOUN
    elif pos.startswith("R"):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [21]:
lemmatizer = WordNetLemmatizer()
def clean_words(words):
    clean_words_list = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w]) # pass an array in pos_tag always importantttttt
            lemma = lemmatizer.lemmatize(w,pos = get_pos_tag(pos[0][1]))
            clean_words_list.append(lemma.lower())
    return clean_words_list

In [22]:
text_documents[2]

['united',
 'Flew',
 'ORD',
 'to',
 'Miami',
 'and',
 'back',
 'and',
 '',
 'had',
 'great',
 'crew',
 'service',
 'on',
 'both',
 'legs',
 'THANKS']

In [23]:
clean_words(text_documents[7777]) 

['southwestair',
 'fly',
 'delta',
 'week',
 'gogo',
 'good',
 'tweet',
 'sent',
 'didnt',
 'even',
 'send',
 'land',
 'thats',
 'bad']

### Making features using count vectorizer

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [25]:
texts_join = [" ".join(clean_words(text_doc)) for text_doc in text_documents ]


In [26]:
len(texts_join)

10980

In [27]:
texts_join[2]

'united flew ord miami back great crew service leg thanks'

In [28]:
x_train,x_test,y_train,y_test = train_test_split(texts_join,y)

In [29]:
count_vec = CountVectorizer(max_features = 3000,ngram_range = (1,2))

In [30]:
sklearn_x_train = count_vec.fit_transform(x_train)

In [31]:
sklearn_x_test = count_vec.transform(x_test)

In [32]:
print(sklearn_x_train.todense())
print(count_vec.get_feature_names())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
['10', '10 day', '10 hour', '10 hr', '10 min', '10 minute', '100', '1000', '10000', '11', '1130', '12', '12 hour', '12 hr', '13', '130', '136', '14', '140', '140 character', '15', '15 hour', '15 hr', '15 min', '15 minute', '150', '16', '17', '18', '1800', '19', '1hr', '1k', '1st', '1st class', '1st time', '20', '20 min', '20 minute', '200', '200 fee', '2015', '21', '219', '22', '222', '224', '23', '24', '24 hour', '24 hr', '24hrs', '25', '25 hour', '25 hr', '25 min', '25 minute', '26', '27', '2day', '2days', '2hrs', '2nd', '2nd time', '2pm', '30', '30 min', '30 minute', '300', '32', '34', '35', '35 min', '39', '3am', '3rd', '3rd time', '40', '40 min', '40 minute', '400', '45', '45 min', '45 minute', '4th', '50', '50 min', '500', '530', '5th', '60', '600', '630', '728', '75', '7am', '80', '800', '800 number', '90', '90 min', 'a320', 'aa', 'abc', 'ability', 'able', 'abl

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

### Using Random Forest

In [81]:
clf1 = RandomForestClassifier()
clf1.fit(sklearn_x_train ,y_train)
clf1.score(sklearn_x_test,y_test)



0.7289617486338797

### SVM

In [82]:
clf2 = SVC()
clf2.fit(sklearn_x_train ,y_train)
clf2.score(sklearn_x_test,y_test)



0.6240437158469946

### KNN

In [83]:
clf3 = KNeighborsClassifier()
clf3.fit(sklearn_x_train ,y_train)
clf3.score(sklearn_x_test,y_test)

0.47795992714025504

### Naive_bayes

In [84]:
clf4 = MultinomialNB()
clf4.fit(sklearn_x_train ,y_train)
clf4.score(sklearn_x_test,y_test)

0.7562841530054645

In [54]:
clf5 = LogisticRegression()
clf5.fit(sklearn_x_train ,y_train)
clf5.score(sklearn_x_test,y_test)

0.775591985428051

## Getting the test data ready

In [35]:
x_test_pd = pd.read_csv("twitter_x_test.csv")

In [36]:
x_test_pd

Unnamed: 0,tweet_id,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,569682010270101504,American,,zsalim03,,0,@AmericanAir In car gng to DFW. Pulled over 1h...,,2015-02-22 18:15:50 -0800,Texas,Central Time (US & Canada)
1,569608307184242688,American,,sa_craig,,0,"@AmericanAir after all, the plane didn’t land ...",,2015-02-22 13:22:57 -0800,"College Station, TX",Central Time (US & Canada)
2,567879304593408001,Southwest,,DanaChristos,,1,@SouthwestAir can't believe how many paying cu...,,2015-02-17 18:52:31 -0800,CT,Eastern Time (US & Canada)
3,569757651539660801,US Airways,,rossj987,,0,@USAirways I can legitimately say that I would...,,2015-02-22 23:16:24 -0800,"Washington, D.C.",Eastern Time (US & Canada)
4,569900705852608513,American,,tranpham18,,0,@AmericanAir still no response from AA. great ...,,2015-02-23 08:44:51 -0800,New York City,Eastern Time (US & Canada)
5,569235062862036992,United,,setfive,,0,@united we have developers flying down tmrw mo...,,2015-02-21 12:39:49 -0800,"Central Sq. Cambridge, MA",Eastern Time (US & Canada)
6,569824422279950336,US Airways,,earthXplorer,,1,@USAirways hello??? Anyone there?,,2015-02-23 03:41:43 -0800,"Miami, Fl. USA",Eastern Time (US & Canada)
7,567880416201293824,US Airways,,AliNHamdani,,0,@USAirways @husainhaqqani Mr. Husain u shld pr...,,2015-02-17 18:56:56 -0800,Islamabad,Islamabad
8,570021943899877377,US Airways,,BradleyPollock,,0,"@USAirways not likely, flightaware says plane ...",,2015-02-23 16:46:36 -0800,,
9,569690664029462529,American,,kaps12,,0,@AmericanAir they don't even give an option to...,,2015-02-22 18:50:13 -0800,,


In [37]:
x_test_pd.drop(['tweet_id', 'airline', 'airline_sentiment_gold',
       'name', 'negativereason_gold', 'retweet_count', 'tweet_coord',
       'tweet_created', 'tweet_location', 'user_timezone'], axis = 1, inplace = True)

In [38]:
x_test_pd

Unnamed: 0,text
0,@AmericanAir In car gng to DFW. Pulled over 1h...
1,"@AmericanAir after all, the plane didn’t land ..."
2,@SouthwestAir can't believe how many paying cu...
3,@USAirways I can legitimately say that I would...
4,@AmericanAir still no response from AA. great ...
5,@united we have developers flying down tmrw mo...
6,@USAirways hello??? Anyone there?
7,@USAirways @husainhaqqani Mr. Husain u shld pr...
8,"@USAirways not likely, flightaware says plane ..."
9,@AmericanAir they don't even give an option to...


In [39]:
text_documents_test = []
for text_doc in x_test_pd.text:
    text_documents_test.append(Punctuation(text_doc).split(" "))

In [40]:
text_documents_test[2]

['SouthwestAir',
 'cant',
 'believe',
 'how',
 'many',
 'paying',
 'customers',
 'you',
 'left',
 'high',
 'and',
 'dry',
 'with',
 'no',
 'reason',
 'for',
 'flight',
 'Cancelled',
 'Flightlations',
 'Monday',
 'out',
 'of',
 'BDL',
 'Wow']

In [41]:
len(text_documents_test)

3660

In [42]:
texts_join_test = [" ".join(clean_words(text_doc)) for text_doc in text_documents_test ]


In [43]:
sklearn_x_test_final = count_vec.transform(texts_join_test)

In [44]:
sklearn_x_test_final.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Predicting the values

### Using random forest

In [103]:
y_pred_rfc = clf1.predict(sklearn_x_test_final)

In [111]:
len(y_pred_rfc)

3660

In [115]:
np.savetxt("twitter_pred_rfc",y_pred_rfc,delimiter = ",",fmt = "%s")

### Using Naive_bayes

In [104]:
y_pred_NB = clf4.predict(sklearn_x_test_final)

In [114]:
np.savetxt("twitter_pred_NB",y_pred_NB,delimiter = ",",fmt = "%s")

### Logistic Regression

In [45]:
y_pred_LR = clf5.predict(sklearn_x_test_final)

In [46]:
np.savetxt("twitter_pred_LR",y_pred_LR,delimiter = ",",fmt = "%s")