# Airline Sentiment Analysis

In [84]:
import numpy as np
import pandas as pd
import pickle

import re
import emoji
from nltk.stem import PorterStemmer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import tensorflow as tf

In [85]:
data = pd.read_csv("airline_sentiment_analysis.csv")

In [89]:
data

Unnamed: 0.1,Unnamed: 0,airline_sentiment,text
0,1,positive,@VirginAmerica plus you've added commercials t...
1,3,negative,@VirginAmerica it's really aggressive to blast...
2,4,negative,@VirginAmerica and it's a really big bad thing...
3,5,negative,@VirginAmerica seriously would pay $30 a fligh...
4,6,positive,"@VirginAmerica yes, nearly every time I fly VX..."
...,...,...,...
11536,14633,negative,@AmericanAir my flight was Cancelled Flightled...
11537,14634,negative,@AmericanAir right on cue with the delays👌
11538,14635,positive,@AmericanAir thank you we got on a different f...
11539,14636,negative,@AmericanAir leaving over 20 minutes Late Flig...


### Preprocessing

In [91]:
data.isna().sum().sum()

0

In [92]:
data["airline_sentiment"].value_counts()

negative    9178
positive    2363
Name: airline_sentiment, dtype: int64

In [93]:
sentiment_ordering = ['negative', 'positive']

data['airline_sentiment'] = data['airline_sentiment'].apply(lambda x: sentiment_ordering.index(x))

In [94]:
ps = PorterStemmer()

def process_tweet(tweet):       #simplify the text
    new_tweet = tweet.lower()  
    new_tweet = re.sub(r'@\w+', '', new_tweet)  #remove @s
    new_tweet = re.sub(r'#', '', new_tweet)    #remove hashtags
    new_tweet = re.sub(r':', ' ', emoji.demojize(new_tweet))    #Turn emojis into words
    new_tweet = re.sub(r'http\S+', '', new_tweet)    #remove URLs
    new_tweet = re.sub(r'\$\S+', 'dollar', new_tweet)   #change dollar amounts to dollar
    new_tweet = re.sub(r'[^a-z0-9\s]', '', new_tweet)   #remove punctation
    new_tweet = re.sub(r'[0-9]+', 'number', new_tweet)   #change number values to number
    new_tweet = new_tweet.split(" ")
    new_tweet = list(map(lambda x: ps.stem(x), new_tweet)) #Stemming the words
    new_tweet = list(map(lambda x: x.strip(), new_tweet))  #Stripping whitespace from the words
    if ' ' in new_tweet:
        new_tweet.remove(' ')
    if '' in new_tweet:
        new_tweet.remove('')
    return new_tweet

In [95]:
tweets = data['text'].apply(process_tweet)

labels = np.array(data['airline_sentiment'])

In [96]:
tweets

0        [plu, youv, ad, commerci, to, the, experi, tacki]
1        [it, realli, aggress, to, blast, obnoxi, enter...
2         [and, it, a, realli, big, bad, thing, about, it]
3        [serious, would, pay, dollar, a, flight, for, ...
4        [ye, nearli, everi, time, i, fli, vx, thi, ear...
                               ...                        
11536    [my, flight, wa, cancel, flightl, leav, tomorr...
11537           [right, on, cue, with, the, delay, hand, ]
11538    [thank, you, we, got, on, a, differ, flight, t...
11539    [leav, over, number, minut, late, flight, no, ...
11540    [you, have, my, money, you, chang, my, flight,...
Name: text, Length: 11541, dtype: object

In [97]:
# Get size of vocabulary
vocabulary = set()

for tweet in tweets:
    for word in tweet:
        if word not in vocabulary:
            vocabulary.add(word)
            
vocab_length = len(vocabulary)

# Get max length of a sequence
max_seq_length = 0

for tweet in tweets:
    if len(tweet) > max_seq_length:
        max_seq_length = len(tweet)

# Print results
print("Vocab length:", vocab_length)
print("Max sequence length:", max_seq_length)

Vocab length: 9508
Max sequence length: 90


In [98]:
# Dense encoding

tokenizer = Tokenizer(num_words=vocab_length)
tokenizer.fit_on_texts(tweets)

sequences = tokenizer.texts_to_sequences(tweets)

word_index = tokenizer.word_index

model_inputs = pad_sequences(sequences, maxlen=max_seq_length, padding='post')

In [99]:
sequences

[[537, 480, 610, 1224, 1, 3, 163, 4011],
 [15, 126, 2476, 1, 1742, 3010, 753, 14, 17, 1743, 960, 62, 57, 22, 460, 1908],
 [10, 15, 7, 126, 425, 195, 273, 87, 15],
 [359,
  85,
  206,
  103,
  7,
  6,
  9,
  84,
  19,
  180,
  22,
  28,
  4012,
  126,
  3,
  124,
  195,
  273,
  87,
  63,
  1632],
 [181, 1171, 286, 42, 5, 63, 1909, 28, 2477, 4013, 197, 70, 449, 2, 2],
 [194, 5, 4014, 35, 5, 48, 2, 1281],
 [15, 21, 306, 10, 207, 41, 30, 339, 17, 171, 140, 1, 18],
 [5, 1069, 611, 3011, 45, 169, 200, 154, 4015, 4016, 2, 1281],
 [28,
  13,
  517,
  7,
  113,
  394,
  222,
  246,
  87,
  12,
  590,
  188,
  1,
  2,
  62,
  5,
  320,
  120,
  809,
  11,
  12,
  375,
  188,
  254,
  1910],
 [2,
  56,
  63,
  17,
  1517,
  4017,
  929,
  100,
  121,
  112,
  61,
  3,
  930,
  449,
  40,
  131,
  53,
  4018],
 [20],
 [4019, 388, 13, 66, 833],
 [45,
  774,
  9,
  12,
  164,
  728,
  754,
  6,
  323,
  1,
  775,
  114,
  524,
  230,
  31,
  113,
  273,
  87,
  961,
  1172,
  4020],
 [2,
  5,
  591

In [100]:
word_index

{'to': 1,
 '': 2,
 'the': 3,
 'number': 4,
 'i': 5,
 'flight': 6,
 'a': 7,
 'you': 8,
 'for': 9,
 'and': 10,
 'on': 11,
 'my': 12,
 'is': 13,
 'in': 14,
 'it': 15,
 'of': 16,
 'your': 17,
 'me': 18,
 'that': 19,
 'thank': 20,
 'wa': 21,
 'have': 22,
 'not': 23,
 'no': 24,
 'get': 25,
 'at': 26,
 'with': 27,
 'thi': 28,
 'be': 29,
 'hour': 30,
 'but': 31,
 'cancel': 32,
 'delay': 33,
 'servic': 34,
 'now': 35,
 'are': 36,
 'custom': 37,
 'we': 38,
 'help': 39,
 'from': 40,
 'an': 41,
 'time': 42,
 'been': 43,
 'just': 44,
 'so': 45,
 'call': 46,
 'wait': 47,
 'do': 48,
 'can': 49,
 'bag': 50,
 'up': 51,
 'hold': 52,
 'http': 53,
 'plane': 54,
 'what': 55,
 'im': 56,
 'they': 57,
 'out': 58,
 'us': 59,
 'will': 60,
 'all': 61,
 'amp': 62,
 'fli': 63,
 'whi': 64,
 'our': 65,
 'still': 66,
 'cant': 67,
 'when': 68,
 'tri': 69,
 'go': 70,
 'need': 71,
 'airlin': 72,
 'day': 73,
 'one': 74,
 'how': 75,
 'gate': 76,
 'flightl': 77,
 'there': 78,
 'had': 79,
 'back': 80,
 'if': 81,
 'dont': 82

In [101]:
model_inputs

array([[ 537,  480,  610, ...,    0,    0,    0],
       [  15,  126, 2476, ...,    0,    0,    0],
       [  10,   15,    7, ...,    0,    0,    0],
       ...,
       [  20,    8,   38, ...,    0,    0,    0],
       [ 212,   98,    4, ...,    0,    0,    0],
       [   8,   22,   12, ...,    0,    0,    0]])

In [102]:
model_inputs.shape

(11541, 90)

In [103]:
x_train, x_test, y_train, y_test = train_test_split(model_inputs, labels, train_size=0.7, random_state=22)  #training size about 70%

### Training 

In [104]:
embedding_dim = 32

inputs = tf.keras.Input(shape=(max_seq_length,))

embedding = tf.keras.layers.Embedding(
    input_dim=vocab_length,
    output_dim=embedding_dim,
    input_length=max_seq_length
)(inputs)


# Model A (just a Flatten layer)
flatten = tf.keras.layers.Flatten()(embedding)

# Model B (GRU with a Flatten layer)
gru = tf.keras.layers.GRU(units=embedding_dim)(embedding)
gru_flatten = tf.keras.layers.Flatten()(gru)

# Both A and B are fed into the output
concat = tf.keras.layers.concatenate([flatten, gru_flatten])

outputs = tf.keras.layers.Dense(3, activation='softmax')(concat)

model = tf.keras.Model(inputs, outputs)

In [105]:
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

batch_size = 32
epochs = 100

history = model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True,
            verbose=1
        ),
        tf.keras.callbacks.ReduceLROnPlateau()
    ]

)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 8: early stopping


In [106]:
model.evaluate(x_test, y_test)



[0.22161456942558289, 0.9116373062133789]

In [107]:
pickle.dump(model, open('model.pkl', 'wb'))          # save the model in disc

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\concatenate
......vars
...layers\dense
......vars
.........0
.........1
...layers\embedding
......vars
.........0
...layers\flatten
......vars
...layers\flatten_1
......vars
...layers\gru
......vars
...layers\gru\cell
......vars
.........0
.........1
.........2
...layers\input_layer
......vars
...metrics\mean
......vars
.........0
.........1
...metrics\mean_metric_wrapper
......vars
.........0
.........1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........2
.........3
.........4
.........5
.........6
.........7
.........8
.........9
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2022-12-24 12:02:29         3242
metadata.json                                  2022-12-24 12:02:29           64
variables.h5                                   2022-12-24 12:02:29    