In [1]:
import pandas as pd
import numpy as np
import nltk
import io

In [2]:
# Load the tweet dataset from "Airline Tweets.csv"; the path is relative, so
# the file is looked up in the notebook's working directory.
df = pd.read_csv("Airline Tweets.csv")

In [3]:
'''Textblob sentiment analysis
Code was used from:
https://towardsdatascience.com/having-fun-with-textblob-7e9eed783d3f
https://stackoverflow.com/questions/43485469/apply-textblob-in-for-each-row-of-a-dataframe
'''

from textblob import TextBlob

# Swap missing tweets for empty strings before handing them to TextBlob.
noNull = df['Text'].fillna("")

# One TextBlob sentiment result per tweet, stored as a single new column.
df['sentiment'] = noNull.map(lambda text: TextBlob(text).sentiment)

df

Unnamed: 0.1,Unnamed: 0,number,keyword,Text,Tidy_Tweets,Unnamed: 5,"0-apologize, 1-rectify, 2-supportive/helpful, 3-appreciate, 4 - misc.",Rating,"0-neg, 1-neu, 2-pos",sentiment
0,0.943430,65789,TT,Hi there the best team to assist you is our ...,there best team assist contact centr fee appli...,,,H,,"(1.0, 0.3)"
1,0.970177,49400,NK,We re awfully sorry to hear about that Pleas...,aw sorri hear about that pleas your reserv inf...,,-,U,,"(-0.5, 1.0)"
2,0.143541,60959,TOM,RT In #Namibia Melissa and Anna Marie found...,#namibia melissa anna mari found immedi after ...,,,N,,"(0.0, 0.0)"
3,0.360187,77607,VA,We re sorry to hear this Peter We would reco...,sorri hear thi peter would recommend phone bag...,,,U,,"(-0.04999999999999999, 0.7)"
4,0.779308,58487,TOM,RT The Q #TUIResults are out and show signi...,#tuiresult show signific growth custom number ...,,,N,,"(0.375, 0.875)"
...,...,...,...,...,...,...,...,...,...,...
318,0.510298,15909,AS,I ll work on that for ya Laura,work that laura,,,H,,"(0.0, 0.0)"
319,0.703480,63020,TOM,RT Tui Travel reports record results pc ju...,travel report record result jump profit peter ...,,,N,,"(-0.05, 0.4)"
320,0.085105,58046,TOM,#TUIresults in detail focus on #cruises Stro...,#tuiresult detail focu #cruis strong growth pa...,,,N,,"(0.5916666666666667, 0.8416666666666666)"
321,0.260572,42866,LH,Please advise your full name email and posta...,pleas advis your full name email postal addres...,,,H,,"(0.16499999999999998, 0.6599999999999999)"


In [4]:
'''Vader from nltk for sentiment analysis
Code was used from:
http://www.nltk.org/howto/sentiment.html
https://stackoverflow.com/questions/57803412/applying-sentimentintensityanalyzer-function-on-each-row-of-the-dataframe-prov
http://t-redactyl.io/blog/2017/04/using-vader-to-handle-sentiment-analysis-with-social-media-text.html
https://stackoverflow.com/questions/52323299/python-return-elif-statement
'''


# Fetch the VADER lexicon used by SentimentIntensityAnalyzer; nltk reports
# "already up-to-date" and does nothing further when it is present locally.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer


# Shared analyzer instance. It is created on first use so that the VADER
# lexicon is loaded once, instead of once per scored tweet.
_vader_analyzer = None


def _get_vader_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _vader_analyzer
    if _vader_analyzer is None:
        _vader_analyzer = SentimentIntensityAnalyzer()
    return _vader_analyzer


def get_sentiment(row, **kwargs):
    """Return one VADER polarity score for a piece of text.

    Parameters
    ----------
    row : str
        The text to score (one tweet when used with ``Series.apply``).
    **kwargs
        ``k`` selects which score to return: ``'positive'``, ``'negative'``
        or ``'neutral'``. Any other value, or omitting ``k`` altogether,
        returns the compound score.

    Returns
    -------
    float
        The requested entry of ``polarity_scores(row)``.
    """
    sentiment_score = _get_vader_analyzer().polarity_scores(row)

    # .get() rather than ['k']: a missing selector behaves like an
    # unrecognised one and falls through to the compound score.
    requested = kwargs.get('k')
    if requested == 'positive':
        return sentiment_score['pos']
    if requested == 'negative':
        return sentiment_score['neg']
    if requested == 'neutral':
        return sentiment_score['neu']
    return sentiment_score['compound']

# Replace missing tweets with empty strings so every row can be split and scored.
noNull = df['Text'].fillna("")

# Normalise whitespace: split each tweet on whitespace and re-join the pieces
# with single spaces. Doing both steps inside one apply() keeps the result
# aligned with the Series' own index instead of assuming the index labels are
# exactly 0..n-1, which the previous positional loop relied on.
tokenized_tweet = noNull.apply(lambda tweet: ' '.join(tweet.split()))

# One column per VADER score.
df['Positive'] = tokenized_tweet.apply(get_sentiment, k='positive')
df['Negative'] = tokenized_tweet.apply(get_sentiment, k='negative')
df['Neutral'] = tokenized_tweet.apply(get_sentiment, k='neutral')
df['Compound'] = tokenized_tweet.apply(get_sentiment, k='compound')

df

[nltk_data] Downloading package vader_lexicon to C:\Users\Karthik
[nltk_data]     Doddi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0.1,Unnamed: 0,number,keyword,Text,Tidy_Tweets,Unnamed: 5,"0-apologize, 1-rectify, 2-supportive/helpful, 3-appreciate, 4 - misc.",Rating,"0-neg, 1-neu, 2-pos",sentiment,Positive,Negative,Neutral,Compound
0,0.943430,65789,TT,Hi there the best team to assist you is our ...,there best team assist contact centr fee appli...,,,H,,"(1.0, 0.3)",0.174,0.000,0.826,0.6369
1,0.970177,49400,NK,We re awfully sorry to hear about that Pleas...,aw sorri hear about that pleas your reserv inf...,,-,U,,"(-0.5, 1.0)",0.096,0.067,0.837,0.1796
2,0.143541,60959,TOM,RT In #Namibia Melissa and Anna Marie found...,#namibia melissa anna mari found immedi after ...,,,N,,"(0.0, 0.0)",0.000,0.000,1.000,0.0000
3,0.360187,77607,VA,We re sorry to hear this Peter We would reco...,sorri hear thi peter would recommend phone bag...,,,U,,"(-0.04999999999999999, 0.7)",0.066,0.034,0.899,0.2960
4,0.779308,58487,TOM,RT The Q #TUIResults are out and show signi...,#tuiresult show signific growth custom number ...,,,N,,"(0.375, 0.875)",0.180,0.000,0.820,0.5267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318,0.510298,15909,AS,I ll work on that for ya Laura,work that laura,,,H,,"(0.0, 0.0)",0.000,0.000,1.000,0.0000
319,0.703480,63020,TOM,RT Tui Travel reports record results pc ju...,travel report record result jump profit peter ...,,,N,,"(-0.05, 0.4)",0.116,0.000,0.884,0.4404
320,0.085105,58046,TOM,#TUIresults in detail focus on #cruises Stro...,#tuiresult detail focu #cruis strong growth pa...,,,N,,"(0.5916666666666667, 0.8416666666666666)",0.431,0.000,0.569,0.8442
321,0.260572,42866,LH,Please advise your full name email and posta...,pleas advis your full name email postal addres...,,,H,,"(0.16499999999999998, 0.6599999999999999)",0.120,0.000,0.880,0.6124


In [5]:
## Cross Validation

from sklearn.model_selection import KFold

# Features are the tweet texts and labels are the 'Rating' column.
# Missing tweets are replaced with "" (as in the sentiment cells) because the
# scikit-learn text vectorizers refuse NaN documents.
X = np.array(df['Text'].fillna(""))
y = np.array(df['Rating'])

# 10-fold split. With the defaults (shuffle=False) the folds are taken in row
# order, without shuffling.
kf = KFold(n_splits=10)

print(kf)

KFold(n_splits=10, random_state=None, shuffle=False)


In [6]:
#Function to Train & Run the model
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer

# Word-count features: English stop words removed, unigrams only, and terms
# appearing in more than 80% of documents dropped.
vect = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    max_df=0.80,
    min_df=1,
)

# TF-IDF features restricted to terms found in at least 15 documents.
vectorizer = TfidfVectorizer(min_df=15)
def run_model(model,modelType):
    """Cross-validate ``model`` on the tweet texts and print per-fold results.

    Relies on notebook-level names: ``kf`` (fold splitter), ``X`` (texts),
    ``y`` (labels), ``vect`` (CountVectorizer), ``vectorizer``
    (TfidfVectorizer) and ``metrics`` (sklearn.metrics).

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(features, labels)`` and ``predict(features)``.
        It is re-fitted on every fold.
    modelType : str
        ``"KNN"`` selects the TF-IDF features (``vectorizer``); any other
        value selects the count features (``vect``).

    Returns
    -------
    list
        Accuracy in percent for each fold, in fold order.
    """
    scores = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # The vectorizer is re-fitted on the training fold only, so the
        # vocabulary never sees the held-out texts.
        featurizer = vectorizer if modelType == "KNN" else vect
        X_train_dtm = featurizer.fit_transform(X_train)
        X_test_dtm = featurizer.transform(X_test)

        # Fit on the training fold, predict the held-out fold.
        model.fit(X_train_dtm, y_train)
        y_pred = model.predict(X_test_dtm)

        fold_accuracy = metrics.accuracy_score(y_test, y_pred) * 100
        scores.append(fold_accuracy)

        print('Accuracy Score: ',fold_accuracy,'%',sep='')
        print('Confusion Matrix: ',metrics.confusion_matrix(y_test,y_pred), sep = '\n')
    return scores

In [7]:
'''KNN model'''
from sklearn.neighbors import KNeighborsClassifier

# Single-nearest-neighbour classifier; the "KNN" tag makes run_model use the
# TF-IDF features.
KNN = KNeighborsClassifier(n_neighbors=1)
scores = run_model(KNN, "KNN")

# Summary line: mean of the per-fold accuracies.
print('\nK Nearest Neighbors (NN = 1)')
print('Accuracy Score: ' + str(np.mean(scores)) + '%')

Accuracy Score: 78.78787878787878%
Confusion Matrix: 
[[10  0  2]
 [ 1  6  1]
 [ 1  2 10]]
Accuracy Score: 51.515151515151516%
Confusion Matrix: 
[[7 4 1]
 [4 2 2]
 [3 2 8]]
Accuracy Score: 66.66666666666666%
Confusion Matrix: 
[[10  2  0]
 [ 2  3  3]
 [ 4  0  9]]
Accuracy Score: 65.625%
Confusion Matrix: 
[[10  4  2]
 [ 1  3  0]
 [ 3  1  8]]
Accuracy Score: 56.25%
Confusion Matrix: 
[[7 3 3]
 [2 3 1]
 [3 2 8]]
Accuracy Score: 78.125%
Confusion Matrix: 
[[14  2  3]
 [ 1  6  0]
 [ 1  0  5]]
Accuracy Score: 65.625%
Confusion Matrix: 
[[5 4 2]
 [1 7 0]
 [3 1 9]]
Accuracy Score: 65.625%
Confusion Matrix: 
[[8 1 5]
 [3 4 0]
 [1 1 9]]
Accuracy Score: 62.5%
Confusion Matrix: 
[[10  1  5]
 [ 3  4  0]
 [ 1  2  6]]
Accuracy Score: 71.875%
Confusion Matrix: 
[[ 9  2  3]
 [ 1  4  1]
 [ 2  0 10]]

K Nearest Neighbors (NN = 1)
Accuracy Score: 66.2594696969697%


In [8]:
'''Logistic Regression'''

from sklearn.linear_model import LogisticRegression

# Logistic regression with the library's default settings, scored on the
# count features (any tag other than "KNN").
LR = LogisticRegression()
scores = run_model(LR, "LR")

# Summary line: mean of the per-fold accuracies.
print('\nLogistic Regression')
print('Accuracy Score: ' + str(np.mean(scores)) + '%')

Accuracy Score: 87.87878787878788%
Confusion Matrix: 
[[11  0  1]
 [ 2  6  0]
 [ 1  0 12]]
Accuracy Score: 78.78787878787878%
Confusion Matrix: 
[[11  1  0]
 [ 3  5  0]
 [ 2  1 10]]
Accuracy Score: 75.75757575757575%
Confusion Matrix: 
[[10  2  0]
 [ 2  5  1]
 [ 3  0 10]]
Accuracy Score: 90.625%
Confusion Matrix: 
[[14  2  0]
 [ 1  3  0]
 [ 0  0 12]]
Accuracy Score: 84.375%
Confusion Matrix: 
[[11  2  0]
 [ 2  4  0]
 [ 1  0 12]]
Accuracy Score: 87.5%
Confusion Matrix: 
[[17  1  1]
 [ 1  6  0]
 [ 1  0  5]]
Accuracy Score: 87.5%
Confusion Matrix: 
[[10  1  0]
 [ 2  6  0]
 [ 1  0 12]]
Accuracy Score: 90.625%
Confusion Matrix: 
[[14  0  0]
 [ 0  7  0]
 [ 1  2  8]]
Accuracy Score: 78.125%
Confusion Matrix: 
[[12  3  1]
 [ 2  5  0]
 [ 1  0  8]]
Accuracy Score: 96.875%
Confusion Matrix: 
[[13  1  0]
 [ 0  6  0]
 [ 0  0 12]]

Logistic Regression
Accuracy Score: 85.80492424242424%


In [9]:
'''Support Vector Machine'''
from sklearn.svm import LinearSVC

# Linear SVM with an L2 penalty and squared hinge loss.
# random_state is pinned so repeated runs give the same fit, matching the
# seeded Random Forest cell; without it LinearSVC's solver may shuffle the
# data differently on each run.
SVM = LinearSVC(penalty='l2', loss='squared_hinge', random_state=0)

scores=run_model(SVM,"SVM")
print('\nSupport Vector Machine')
print('Accuracy Score: ',np.mean(scores),'%',sep='')

Accuracy Score: 75.75757575757575%
Confusion Matrix: 
[[ 9  1  2]
 [ 2  6  0]
 [ 3  0 10]]
Accuracy Score: 75.75757575757575%
Confusion Matrix: 
[[10  2  0]
 [ 3  5  0]
 [ 2  1 10]]
Accuracy Score: 81.81818181818183%
Confusion Matrix: 
[[10  2  0]
 [ 1  6  1]
 [ 2  0 11]]
Accuracy Score: 87.5%
Confusion Matrix: 
[[13  2  1]
 [ 1  3  0]
 [ 0  0 12]]
Accuracy Score: 81.25%
Confusion Matrix: 
[[ 9  4  0]
 [ 1  5  0]
 [ 1  0 12]]
Accuracy Score: 81.25%
Confusion Matrix: 
[[15  3  1]
 [ 1  6  0]
 [ 1  0  5]]
Accuracy Score: 84.375%
Confusion Matrix: 
[[ 9  1  1]
 [ 2  6  0]
 [ 1  0 12]]
Accuracy Score: 90.625%
Confusion Matrix: 
[[14  0  0]
 [ 0  7  0]
 [ 1  2  8]]
Accuracy Score: 81.25%
Confusion Matrix: 
[[12  3  1]
 [ 1  6  0]
 [ 1  0  8]]
Accuracy Score: 93.75%
Confusion Matrix: 
[[12  1  1]
 [ 0  6  0]
 [ 0  0 12]]

Support Vector Machine
Accuracy Score: 83.33333333333333%


In [10]:
'''Random Forest Classifier'''
from sklearn.ensemble import RandomForestClassifier

# Three-tree forest with a fixed seed, scored on the count features.
RFC = RandomForestClassifier(n_estimators=3, random_state=0)
scores = run_model(RFC, "RFC")

# Summary line: mean of the per-fold accuracies.
print('\n Random Forest Classifier')
print('Accuracy Score: ' + str(np.mean(scores)) + '%')

Accuracy Score: 69.6969696969697%
Confusion Matrix: 
[[11  0  1]
 [ 6  2  0]
 [ 3  0 10]]
Accuracy Score: 72.72727272727273%
Confusion Matrix: 
[[10  1  1]
 [ 5  3  0]
 [ 0  2 11]]
Accuracy Score: 60.60606060606061%
Confusion Matrix: 
[[11  0  1]
 [ 7  1  0]
 [ 5  0  8]]
Accuracy Score: 78.125%
Confusion Matrix: 
[[11  3  2]
 [ 1  3  0]
 [ 1  0 11]]
Accuracy Score: 65.625%
Confusion Matrix: 
[[10  0  3]
 [ 4  1  1]
 [ 3  0 10]]
Accuracy Score: 59.375%
Confusion Matrix: 
[[10  4  5]
 [ 1  5  1]
 [ 2  0  4]]
Accuracy Score: 78.125%
Confusion Matrix: 
[[11  0  0]
 [ 7  1  0]
 [ 0  0 13]]
Accuracy Score: 78.125%
Confusion Matrix: 
[[12  2  0]
 [ 0  7  0]
 [ 3  2  6]]
Accuracy Score: 87.5%
Confusion Matrix: 
[[16  0  0]
 [ 4  3  0]
 [ 0  0  9]]
Accuracy Score: 68.75%
Confusion Matrix: 
[[9 5 0]
 [0 6 0]
 [5 0 7]]

 Random Forest Classifier
Accuracy Score: 71.8655303030303%


In [11]:
'''Naive Bayes Model'''

from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default settings, scored on the count features.
NB = MultinomialNB()
scores = run_model(NB, "NB")

# Summary line: mean of the per-fold accuracies.
print('\nNaive Bayes')
print('Accuracy Score: ' + str(np.mean(scores)) + '%')

Accuracy Score: 87.87878787878788%
Confusion Matrix: 
[[11  0  1]
 [ 2  5  1]
 [ 0  0 13]]
Accuracy Score: 66.66666666666666%
Confusion Matrix: 
[[ 9  0  3]
 [ 4  2  2]
 [ 1  1 11]]
Accuracy Score: 60.60606060606061%
Confusion Matrix: 
[[ 7  1  4]
 [ 3  2  3]
 [ 2  0 11]]
Accuracy Score: 75.0%
Confusion Matrix: 
[[ 9  0  7]
 [ 1  3  0]
 [ 0  0 12]]
Accuracy Score: 81.25%
Confusion Matrix: 
[[11  0  2]
 [ 3  2  1]
 [ 0  0 13]]
Accuracy Score: 65.625%
Confusion Matrix: 
[[13  1  5]
 [ 1  2  4]
 [ 0  0  6]]
Accuracy Score: 78.125%
Confusion Matrix: 
[[10  0  1]
 [ 5  2  1]
 [ 0  0 13]]
Accuracy Score: 87.5%
Confusion Matrix: 
[[12  0  2]
 [ 0  6  1]
 [ 1  0 10]]
Accuracy Score: 62.5%
Confusion Matrix: 
[[10  0  6]
 [ 1  2  4]
 [ 1  0  8]]
Accuracy Score: 90.625%
Confusion Matrix: 
[[14  0  0]
 [ 2  3  1]
 [ 0  0 12]]

Naive Bayes
Accuracy Score: 75.57765151515152%
