# Sentiment Analysis on Twitter US Airline Sentiment Dataset

### Load Data

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
import pandas as pd

In [3]:
train = pd.read_csv('twitter_x_y_train.csv')
test = pd.read_csv('twitter_x_test.csv')
print("Tweets in training data:{}".format(train.shape[0]))
print("Tweets in testing data:{}".format(test.shape[0]))
print("Total Tweets --> ",train.shape[0]+test.shape[0])

Tweets in training data:10980
Tweets in testing data:3660
Total Tweets -->  14640


In [4]:
#print(train.info())

In [5]:
#train.head(5)

In [6]:
print(train.airline_sentiment.value_counts())

negative    6851
neutral     2327
positive    1802
Name: airline_sentiment, dtype: int64


In [7]:
train.columns

Index(['tweet_id', 'airline_sentiment', 'airline', 'airline_sentiment_gold',
       'name', 'negativereason_gold', 'retweet_count', 'text', 'tweet_coord',
       'tweet_created', 'tweet_location', 'user_timezone'],
      dtype='object')

# Clean data

In [8]:
drop_cols = ['tweet_id', 'airline', 'airline_sentiment_gold',
       'name', 'negativereason_gold', 'retweet_count','tweet_coord',
       'tweet_created', 'tweet_location', 'user_timezone']
train.drop(drop_cols, axis = 1, inplace=True)
test.drop(drop_cols, axis = 1, inplace=True)

In [9]:
train

Unnamed: 0,airline_sentiment,text
0,negative,"@SouthwestAir I am scheduled for the morning, ..."
1,positive,@SouthwestAir seeing your workers time in and ...
2,positive,@united Flew ORD to Miami and back and had gr...
3,negative,@SouthwestAir @dultch97 that's horse radish 😤🐴
4,negative,@united so our flight into ORD was delayed bec...
...,...,...
10975,neutral,@AmericanAir followback
10976,positive,@united thanks for the help. Wish the phone re...
10977,negative,@usairways the. Worst. Ever. #dca #customerser...
10978,negative,@nrhodes85: look! Another apology. DO NOT FLY ...


In [10]:
train_data = (train["text"])
target_data = set(train['airline_sentiment'])
test_data = (test["text"])

In [11]:
tweets_train = []
tweets_test = []
for sentiment in target_data:
    
    #---->Training data (text,sentiment)
    sentiment_rows_train = (train['airline_sentiment'] == sentiment)
    current_train=train[sentiment_rows_train]
    current_train.reset_index(drop=True,inplace=True)
    for tweet in (list(current_train['text'])):
        tweets_train.append(((word_tokenize(tweet)), sentiment))    
    
#---->Testing data (text)
for j in test_data:
    tweets_test.append((word_tokenize(j)))

In [12]:
print(len(tweets_train))
print(len(tweets_test))

10980
3660


In [13]:
#tweets_train[:5]
#tweets_test[:5]

In [14]:
import random
random.shuffle(tweets_train)
#tweets_train[0:5]

### WordNetLemmatizer and Pos Tag

In [15]:
lemmatizer = WordNetLemmatizer()

In [16]:
def get_simple_pos(tag):
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [17]:
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)

others = ['http']

stops.update(punctuations)
stops.update(others)
#stops

In [18]:
def get_clean_words(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [19]:
tweets_train = [(get_clean_words(tweet),sentiment) for tweet,sentiment in tweets_train]
len(tweets_train)

10980

In [20]:
tweets_test = [get_clean_words(tweet) for tweet in tweets_test]
len(tweets_test)

3660

In [21]:
print(len(tweets_train))
print(len(tweets_test))

10980
3660


In [22]:
all_words = []

for tweet in tweets_train:
    all_words += tweet[0]
    
#all_words 

In [23]:
#Gives freq of each words
freq = nltk.FreqDist(all_words)
common = freq.most_common(3000)
features = [i[0] for i in common]
#features

In [24]:
def get_features_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features:
        current_features[w] = w in words_set
    return current_features

In [25]:
tweets_train = [(get_features_dict(tweet), sentiment) for tweet,sentiment in tweets_train]
len(tweets_train)

10980

In [26]:
#list of dictionary  <----- Not needed right now
x_train = []
for tweet in tweets_train:
    x_train.append(tweet[0])
#x_train[:2]

In [27]:
y_train = train['airline_sentiment']

In [28]:
tweets_test = [(get_features_dict(tweet)) for tweet in tweets_test]
len(tweets_test)

3660

# Train NLTK data format and using inbuilt sklearn classifier

In [29]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.metrics import accuracy_score

 # Naive Bayes

In [30]:
from nltk import NaiveBayesClassifier

In [31]:
classifier_nb = NaiveBayesClassifier.train(tweets_train)

In [32]:
classifier_nb.show_most_informative_features(15)

Most Informative Features
                   kudos = True           positi : negati =     51.9 : 1.0
               fantastic = True           positi : negati =     34.2 : 1.0
                favorite = True           positi : negati =     34.2 : 1.0
             outstanding = True           positi : negati =     29.1 : 1.0
                 helpful = True           positi : neutra =     25.4 : 1.0
                   flyfi = True           positi : negati =     24.1 : 1.0
                   thank = True           positi : negati =     23.0 : 1.0
                 awesome = True           positi : negati =     22.6 : 1.0
                    hold = True           negati : positi =     22.3 : 1.0
               beautiful = True           positi : negati =     21.5 : 1.0
                      hr = True           negati : positi =     21.4 : 1.0
                  street = True           neutra : negati =     20.6 : 1.0
                   daily = True           neutra : negati =     20.6 : 1.0

In [33]:
y_pred=[]
length=len(tweets_test)
for i in range(0,length):
    y_pred.append(classifier_nb.classify(tweets_test[i]))

In [34]:
df_nb = pd.DataFrame(y_pred)
df_nb.to_csv('twitter_naivebayes_pred.csv',header=False,index=False)
df_nb.shape

(3660, 1)

# Random Forest Classifier

In [35]:
from sklearn.ensemble import RandomForestClassifier

In [39]:
rfc = RandomForestClassifier()
classifier_rf = SklearnClassifier(rfc)
classifier_rf.train(tweets_train)

<SklearnClassifier(RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False))>

In [40]:
y_pred2=[]
length2=len(tweets_test)
for i in range(0,length2):
    y_pred2.append(classifier_rf.classify(tweets_test[i]))
    print(classifier_rf.classify(tweets_test[i]))

e
negative
negative
negative
negative
negative
neutral
negative
negative
negative
negative
negative
negative
negative
negative
negative
positive
negative
negative
negative
neutral
negative
negative
negative
negative
neutral
negative
neutral
negative
negative
positive
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
positive
negative
negative
neutral
negative
positive
negative
negative
negative
neutral
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
positive
negative
negative
neutral
neutral
negative
positive
negative
negative
positive
negative
neutral
negative
neutral
negative
positive
negative
negative
negative
negative
negative
negative
neutral
negative
positive
negative
neutral
negative
neutral
negative
negative
neutral
negative
neutral
positive
negative
negative
negative
negative
negative
negative
negative
negative
neutral
negati

In [41]:
df_sv = pd.DataFrame(y_pred)
df_sv.to_csv('twitter_randomforest_pred.csv',header=False,index=False)
df_sv.shape

(3660, 1)