# Table of Contents
* [Import data and Recover the 5-Fold Validation Indices](#Import-data-and-Recover-the-5-Fold-Validation-Indices)
* [Sentiment Analysis with AFINN Lexicon](#Sentiment-Analysis-with-AFINN-Lexicon)
* [Sentiment Analysis with TextBlob Lexicon](#Sentiment-Analysis-with-TextBlob-Lexicon)

In [2]:
import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
    
import warnings
warnings.filterwarnings("ignore")

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.python.keras import models, layers, optimizers
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
import bz2
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import re
%matplotlib inline
# Input data files are available in the "../data/" directory.
# Running this will list the files in the input directory
import os
import statistics
from scipy import stats
from scipy.stats import t
from scipy.stats import norm
import seaborn as sns
import sklearn
import sqlite3
from sqlite3 import Error
import csv
import lightgbm

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import LinearSVC
from xgboost.sklearn import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer

from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_curve
#from sklearn.metrics import plot_precision_recall_curve

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

## Import data and Recover the 5-Fold Validation Indices

In [3]:
# All four CSVs live in the same sibling directory.
data_dir = "../python notebook v1"

# Feature tables are read with their header row.
X_train = pd.read_csv(f"{data_dir}/X_train.csv")
X_test = pd.read_csv(f"{data_dir}/X_test.csv")

# Label files are read without a header row, so the single column is named 0;
# pull it out as a Series.
y_train_0 = pd.read_csv(f"{data_dir}/y_train.csv", header=None)
y_test_0 = pd.read_csv(f"{data_dir}/y_test.csv", header=None)
y_train = y_train_0[0]
y_test = y_test_0[0]

In [4]:
# Report the dimensions of each loaded split.
for split_name, split_data in (("X_train", X_train), ("y_train", y_train),
                               ("X_test", X_test), ("y_test", y_test)):
    print(f"{split_name} shape: ", split_data.shape)

X_train shape:  (11640, 9)
y_train shape:  (11640,)
X_test shape:  (2911, 9)
y_test shape:  (2911,)


In [5]:
from sklearn.model_selection import KFold

# Shuffled 5-fold splitter; the fixed random_state makes the fold
# assignment repeatable between runs.
kf = KFold(n_splits=5, shuffle=True, random_state=123)

# Last expression of the cell: shows the number of folds.
kf.get_n_splits(X_train)

5

In [6]:
# Show the train/validation indices that each fold yields for X_train.
for i, (train_index, val_index) in enumerate(kf.split(X_train), start=1):
    print(f"Fold {i}: ", "Train:", train_index, "Validation:", val_index)

Fold 1:  Train: [    0     1     2 ... 11636 11638 11639] Validation: [    3     6     7 ... 11619 11620 11637]
Fold 2:  Train: [    1     3     5 ... 11637 11638 11639] Validation: [    0     2     4 ... 11591 11624 11629]
Fold 3:  Train: [    0     1     2 ... 11637 11638 11639] Validation: [    8    33    35 ... 11631 11635 11636]
Fold 4:  Train: [    0     2     3 ... 11636 11637 11638] Validation: [    1     5    18 ... 11632 11634 11639]
Fold 5:  Train: [    0     1     2 ... 11636 11637 11639] Validation: [   13    16    17 ... 11628 11633 11638]


## Sentiment Analysis with AFINN Lexicon

In [7]:
!pip install afinn



In [8]:
# initialize afinn sentiment analyzer
from afinn import Afinn
af = Afinn()

# One AFINN score per training text; the sign of the score decides the
# predicted category (> 0 positive, < 0 negative, exactly 0 neutral).
sentiment_scores_train = [af.score(text) for text in X_train.text]
sentiment_category_train = ['positive' if score > 0 else 'negative' if score < 0 else 'neutral'
                            for score in sentiment_scores_train]

# Build the frame from a dict of columns so each column keeps its own dtype.
# (Transposing a list of lists would coerce every column to `object`.)
# `list(y_train)` drops y_train's index so rows are aligned by position.
df_afinn_train = pd.DataFrame({
    'airline_sentiment': list(y_train),
    'afinn_score': sentiment_scores_train,
    'sentiment_category': sentiment_category_train,
}).astype({'afinn_score': 'float'})

display(df_afinn_train.head())

# Cross-tabulation: rows are the labelled sentiment codes, columns are the
# lexicon-predicted categories.
df = pd.crosstab(df_afinn_train.airline_sentiment, df_afinn_train.sentiment_category)
display(df)

# Turn the row labels into a regular column, then reorder the rows to 2, 0, 1
# so they line up with the negative / neutral / positive column order
# (the metrics cell pairs 0 with neutral, 1 with positive, 2 with negative).
df = df.reset_index()
df = df.reindex([2,0,1]) # use to match the orders of actual vs. predicted categories
display(df)

Unnamed: 0,airline_sentiment,afinn_score,sentiment_category
0,2,-2.0,negative
1,0,0.0,neutral
2,2,2.0,positive
3,2,-1.0,negative
4,1,0.0,neutral


sentiment_category,negative,neutral,positive
airline_sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,310,1200,931
1,71,243,1553
2,3116,2162,2054


sentiment_category,airline_sentiment,negative,neutral,positive
2,2,3116,2162,2054
0,0,310,1200,931
1,1,71,243,1553


Calculate the overall accuracy on the training data, along with the precision and recall for each sentiment class.

In [9]:
# `df` holds the confusion counts: one row per actual class (row labels
# 0, 1, 2) and one column per predicted category.
counts = df.loc[:, "negative":"positive"]
predicted_totals = counts.sum(axis=0)  # column sums -> precision denominators
actual_totals = counts.sum(axis=1)     # row sums    -> recall denominators

# Correctly classified counts; class 0 pairs with 'neutral', 1 with
# 'positive' and 2 with 'negative'.
hit_neutral = counts.loc[0, 'neutral']
hit_positive = counts.loc[1, 'positive']
hit_negative = counts.loc[2, 'negative']

# Accuracy: correctly classified rows over all rows.
train_acc = (hit_neutral + hit_positive + hit_negative) / predicted_totals.sum()
print("Accuracy on Training set: ", train_acc)

# Each tuple is (precision, recall) for one class.
b0_p_r_train = (hit_neutral / predicted_totals['neutral'],
                hit_neutral / actual_totals.loc[0])
print("precision and recall for neutral sentiment:", b0_p_r_train)

b1_p_r_train = (hit_positive / predicted_totals['positive'],
                hit_positive / actual_totals.loc[1])
print("precision and recall for positive sentiment:", b1_p_r_train)

b2_p_r_train = (hit_negative / predicted_totals['negative'],
                hit_negative / actual_totals.loc[2])
print("precision and recall for negative sentiment:", b2_p_r_train)

Accuracy on Training set:  0.5042096219931271
precision and recall for neutral sentiment: (0.332871012482663, 0.4916018025399426)
precision and recall for positive sentiment: (0.3422212428382547, 0.8318157471880021)
precision and recall for negative sentiment: (0.8910494709751215, 0.4249863611565739)


## Sentiment Analysis with TextBlob Lexicon

In [10]:
from textblob import TextBlob

# One TextBlob polarity (rounded to 3 decimals) per training text; the sign
# of the score decides the predicted category (> 0 positive, < 0 negative,
# exactly 0 neutral).
sentiment_scores_train = [round(TextBlob(text).sentiment.polarity, 3) for text in X_train.text]
sentiment_category_train = ['positive' if score > 0 else 'negative' if score < 0 else 'neutral'
                            for score in sentiment_scores_train]

# Build the frame from a dict of columns so each column keeps its own dtype.
# (Transposing a list of lists would coerce every column to `object`.)
# `list(y_train)` drops y_train's index so rows are aligned by position.
df_tb_train = pd.DataFrame({
    'airline_sentiment': list(y_train),
    'tb_score': sentiment_scores_train,
    'sentiment_category': sentiment_category_train,
}).astype({'tb_score': 'float'})
display(df_tb_train.head())

# Cross-tabulation: rows are the labelled sentiment codes, columns are the
# lexicon-predicted categories.
df = pd.crosstab(df_tb_train.airline_sentiment, df_tb_train.sentiment_category)
display(df)

# Turn the row labels into a regular column, then reorder the rows to 2, 0, 1
# so they line up with the negative / neutral / positive column order
# (the metrics cell pairs 0 with neutral, 1 with positive, 2 with negative).
df = df.reset_index()
df = df.reindex([2,0,1]) # use to match the orders of actual vs. predicted categories
display(df)

Unnamed: 0,airline_sentiment,tb_score,sentiment_category
0,2,0.0,neutral
1,0,0.0,neutral
2,2,0.0,neutral
3,2,0.05,positive
4,1,0.2,positive


sentiment_category,negative,neutral,positive
airline_sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,214,1494,733
1,74,578,1215
2,2218,3042,2072


sentiment_category,airline_sentiment,negative,neutral,positive
2,2,2218,3042,2072
0,0,214,1494,733
1,1,74,578,1215


In [11]:
# `df` holds the confusion counts: one row per actual class (row labels
# 0, 1, 2) and one column per predicted category.
counts = df.loc[:, "negative":"positive"]
predicted_totals = counts.sum(axis=0)  # column sums -> precision denominators
actual_totals = counts.sum(axis=1)     # row sums    -> recall denominators

# Correctly classified counts; class 0 pairs with 'neutral', 1 with
# 'positive' and 2 with 'negative'.
hit_neutral = counts.loc[0, 'neutral']
hit_positive = counts.loc[1, 'positive']
hit_negative = counts.loc[2, 'negative']

# Accuracy: correctly classified rows over all rows.
train_acc = (hit_neutral + hit_positive + hit_negative) / predicted_totals.sum()
print("Accuracy on Training set: ", train_acc)

# Each tuple is (precision, recall) for one class.
b0_p_r_train = (hit_neutral / predicted_totals['neutral'],
                hit_neutral / actual_totals.loc[0])
print("precision and recall for neutral sentiment:", b0_p_r_train)

b1_p_r_train = (hit_positive / predicted_totals['positive'],
                hit_positive / actual_totals.loc[1])
print("precision and recall for positive sentiment:", b1_p_r_train)

b2_p_r_train = (hit_negative / predicted_totals['negative'],
                hit_negative / actual_totals.loc[2])
print("precision and recall for negative sentiment:", b2_p_r_train)

Accuracy on Training set:  0.42328178694158075
precision and recall for neutral sentiment: (0.2921392256550645, 0.6120442441622286)
precision and recall for positive sentiment: (0.30223880597014924, 0.6507766470273165)
precision and recall for negative sentiment: (0.8850758180367119, 0.30250954719039824)
