Given Twitter US Airline Sentiment Dataset, which contains data for over 14000 tweets, your task is to predict the sentiment of the tweet i.e. positive, negative or neutral.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv(r'training_twitter_x_y_train.csv')
test = pd.read_csv(r'test_twitter_x_test.csv')

In [3]:
train.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


In [4]:
train.count()

tweet_id                  10980
airline_sentiment         10980
airline                   10980
airline_sentiment_gold       31
name                      10980
negativereason_gold          24
retweet_count             10980
text                      10980
tweet_coord                 776
tweet_created             10980
tweet_location             7430
user_timezone              7403
dtype: int64

In [5]:
#Remove the unnecessary columns from training data

train.drop('tweet_id', inplace = True, axis = 1)
train.drop('airline_sentiment_gold', inplace = True, axis = 1)
train.drop('negativereason_gold', inplace = True, axis = 1)
train.drop('tweet_coord', inplace = True, axis = 1)
train.drop('tweet_created',inplace = True, axis = 1)

In [8]:
train.head()

Unnamed: 0,airline_sentiment,airline,name,retweet_count,text,tweet_location,user_timezone
0,negative,Southwest,ColeyGirouard,0,"@SouthwestAir I am scheduled for the morning, ...",Washington D.C.,Atlantic Time (Canada)
1,positive,Southwest,WalterFaddoul,0,@SouthwestAir seeing your workers time in and ...,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,positive,United,LocalKyle,0,@united Flew ORD to Miami and back and had gr...,Illinois,Central Time (US & Canada)
3,negative,Southwest,amccarthy19,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,Atlantic Time (Canada)
4,negative,United,J_Okayy,0,@united so our flight into ORD was delayed bec...,,Eastern Time (US & Canada)


In [9]:
print(train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10980 entries, 0 to 10979
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   airline_sentiment  10980 non-null  object
 1   airline            10980 non-null  object
 2   name               10980 non-null  object
 3   retweet_count      10980 non-null  int64 
 4   text               10980 non-null  object
 5   tweet_location     7430 non-null   object
 6   user_timezone      7403 non-null   object
dtypes: int64(1), object(6)
memory usage: 600.6+ KB
None


### Filling the missing Values

In [10]:
mood_count = train['airline_sentiment'].value_counts()
mood_count  #this returns the values of the type of the tweets

negative    6851
neutral     2327
positive    1802
Name: airline_sentiment, dtype: int64

In [11]:
train['airline'].value_counts()  #to get the all values count

United            2928
US Airways        2152
American          2078
Southwest         1817
Delta             1639
Virgin America     366
Name: airline, dtype: int64

### Converting the categorical data to numerical data

In [12]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train['airline_sentiment'] = le.fit_transform(train['airline_sentiment'])    

In [13]:
train.head()  #0 means - negative, 1- neutral, 2- positive

Unnamed: 0,airline_sentiment,airline,name,retweet_count,text,tweet_location,user_timezone
0,0,Southwest,ColeyGirouard,0,"@SouthwestAir I am scheduled for the morning, ...",Washington D.C.,Atlantic Time (Canada)
1,2,Southwest,WalterFaddoul,0,@SouthwestAir seeing your workers time in and ...,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,2,United,LocalKyle,0,@united Flew ORD to Miami and back and had gr...,Illinois,Central Time (US & Canada)
3,0,Southwest,amccarthy19,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,Atlantic Time (Canada)
4,0,United,J_Okayy,0,@united so our flight into ORD was delayed bec...,,Eastern Time (US & Canada)


### Getting the words as features

In [15]:
import nltk 
from nltk.corpus import stopwords
import re

In [16]:
#first I splitted the text into words, then remove the stopwords and then combine all the meaningful words into text again, return this text.
def words_from_tweet(tweet):
    only_letters = re.sub("[^a-zA-Z]", " ", tweet) 
    words = only_letters.lower().split()                             
    stops = set(stopwords.words("english"))                  
    meaningful_words = [w for w in words if not w in stops] 
    return( " ".join( meaningful_words ))

In [19]:
#Cleaning the train data
train['clean_tweet'] = train['text'].apply(lambda x: words_from_tweet(x))

In [20]:
#Cleaning the test data
test['clean_tweet'] = test['text'].apply(lambda x: words_from_tweet(x))

### Getting all the tweets from the training and testing data so as to pass this text to CountVectorizer.

In [21]:
#for cleaning the training tweets
train_clean_tweet=[]
for tweet in train['clean_tweet']:
    train_clean_tweet.append(tweet)
  
#for clearing the testing tweets
test_clean_tweet=[]
for tweet in test['clean_tweet']:
    test_clean_tweet.append(tweet)

In [23]:
train_clean_tweet  #contains all the cleaned train tweets

['southwestair scheduled morning days fact yes sure evening flight one cancelled flightled',
 'southwestair seeing workers time time going beyond love flying guys thank',
 'united flew ord miami back great crew service legs thanks',
 'southwestair dultch horse radish',
 'united flight ord delayed air force one last flight sbn mins landed',
 'united load us flying sardine knew pilots hours late flight incompetent beyond belief',
 'jetblue stock response delays frustrating poor cust serv amp told ppl wait amp come back',
 'jetblue nice hoping rack enough miles take trip seattle enjoy perfect latte city coffee',
 'united frankly worse customer service ever problems happen deal defines company never united',
 'southwestair yeah haha never one expensive much fun destinationdragons',
 'southwestair mco gt dca flight almost full people screwed msy dca cancelled flightation united usairways cancelled flight swa mistake',
 'jetblue easiest way get ticket receipt get one check get one online tha

### Applying Count Vectorizer to get the features required by sklearn model

In [90]:
from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer()
train_features = count_vec.fit_transform(train_clean_tweet)
test_features = count_vec.transform(test_clean_tweet)

In [91]:
train_features

<10980x11471 sparse matrix of type '<class 'numpy.int64'>'
	with 107004 stored elements in Compressed Sparse Row format>

In [31]:
count_vec.get_feature_names()

['aa',
 'aaaand',
 'aadvantage',
 'aafail',
 'aakjumxa',
 'aal',
 'aaron',
 'aarp',
 'aau',
 'aavvoreph',
 'aay',
 'ab',
 'abandon',
 'abandoned',
 'abandonment',
 'abassinet',
 'abbreve',
 'abc',
 'abcnetwork',
 'abcnews',
 'abducted',
 'abi',
 'abigailedge',
 'abilities',
 'ability',
 'able',
 'aboard',
 'aboout',
 'abounds',
 'abq',
 'abroad',
 'absolute',
 'absolutely',
 'absorber',
 'absoulutely',
 'absurd',
 'absurdity',
 'absurdly',
 'abt',
 'abtwf',
 'abundance',
 'abuse',
 'abused',
 'abysmal',
 'ac',
 'acarl',
 'acc',
 'accelerate',
 'accept',
 'acceptable',
 'accepted',
 'accepting',
 'acces',
 'access',
 'accessibility',
 'accessible',
 'accessing',
 'accident',
 'accidentally',
 'accidents',
 'accomidating',
 'accommodate',
 'accommodated',
 'accommodating',
 'accommodation',
 'accommodations',
 'accompaniments',
 'accompany',
 'accomplished',
 'according',
 'accordingly',
 'account',
 'accountability',
 'accounts',
 'accruing',
 'acct',
 'accts',
 'accumulation',
 'accura

### Applying Multinomial Naive Bayes as here 3 classes are present.

In [34]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

In [35]:
nb = MultinomialNB()

In [36]:
nb.fit(train_features,train['airline_sentiment'])

accuracy = accuracy_score(nb.predict(train_features),train['airline_sentiment'])
print(accuracy)

0.8437158469945355


In [37]:
test_features

<3660x11471 sparse matrix of type '<class 'numpy.int64'>'
	with 33563 stored elements in Compressed Sparse Row format>

In [38]:
test_features.todense()

matrix([[2, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [40]:
test_pred = nb.predict(test_features)

In [42]:
test_pred

array([0, 0, 0, ..., 1, 2, 1])

In [43]:
#Converting back to the labels in test_pred
test_pred = le.inverse_transform(test_pred)

In [44]:
test_pred

array(['negative', 'negative', 'negative', ..., 'neutral', 'positive',
       'neutral'], dtype=object)

In [None]:
#np.savetxt("test_pred.csv",test_pred,fmt='%s')

### Applying RandomForestClassifier

In [45]:
from sklearn.ensemble import RandomForestClassifier

In [46]:
clf = RandomForestClassifier(n_estimators = 200)

In [47]:
train_features

<10980x11471 sparse matrix of type '<class 'numpy.int64'>'
	with 107004 stored elements in Compressed Sparse Row format>

In [48]:
clf.fit(train_features,train['airline_sentiment'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [49]:
accuracy = accuracy_score(clf.predict(train_features),train['airline_sentiment'])
print(accuracy)

0.995719489981785


In [50]:
test_pred1 = clf.predict(test_features)

In [52]:
test_pred1 = le.inverse_transform(test_pred1)

In [53]:
print(test_pred1)

['negative' 'neutral' 'negative' ... 'neutral' 'positive' 'negative']


In [54]:
#Since we have more accuracy from RandomForestClassifier than Multinomial Naive Bayes, so save its predictions
np.savetxt("test_pred.csv",test_pred1, fmt='%s')