# Sentiment Analysis with Airline Tweets

## Install Libraries

In [1]:
!pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
!pip install fastai
!pip install nltk
!pip install sklearn

Looking in links: https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html


## Import Libraries

In [2]:
import fastai
from fastai import *
from fastai.text import * 
import pandas as pd
import numpy as np
from functools import partial
import io
import os

## Import Dataset

Twitter Airline Sentiment Data from Kaggle

In [3]:
dataset = pd.read_csv('Tweets.csv')

In [4]:
df = pd.DataFrame({'sentiment':dataset.airline_sentiment, 'tweet':dataset.text})

In [5]:
df.head()

Unnamed: 0,sentiment,tweet
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [6]:
import re
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, len(df)):
	#replace anything that isn't a letter into a space
    text = re.sub('[^a-zA-Z]', ' ', df['tweet'][i])
    #make everything lowercase
    text = text.lower()
    #split into words
    text = text.split()
    #stopwords are irrelevant words (i.e. the)
    text = [word for word in text if not word in set(stopwords.words('english'))]
    #rejoin words by spaces
    df['tweet'][i] = ' '.join(text)
    corpus.append(' '.join(text))

In [8]:
from sklearn.model_selection import train_test_split

# split data into training and validation set
df_trn, df_val = train_test_split(df, stratify = df['sentiment'], test_size = 0.4, random_state = 12)
df_trn.shape, df_val.shape

((8784, 2), (5856, 2))

## Create Language Model Data and Classifier Model Data

In [9]:
# Language model data
data_lm = TextLMDataBunch.from_df(train_df = df_trn, valid_df = df_val, path = "")

# Classifier model data
data_clas = TextClasDataBunch.from_df(path = "", train_df = df_trn, valid_df = df_val, vocab=data_lm.train_ds.vocab, bs=32)

In [10]:
learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.7)

In [11]:
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,7.510352,6.321576,0.098899,02:03


In [12]:
learn.unfreeze()
learn.fit_one_cycle(1, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,6.023198,5.579774,0.159405,03:07


In [13]:
learn.predict("This is a tweet about", n_words=10)

'This is a tweet about yes painful republican makes lax thought explanation accommodations clearly xxbos'

In [14]:
learn.save_encoder('ft_enc')

In [15]:
learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5)
learn.load_encoder('ft_enc')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (8784 items)
x: TextList
xxbos americanair ok one help bag lost honeymoon months ago responsible professional,xxbos united xxwrep 5 lt shoddy customer service use xxunk xxunk,xxbos jetblue fleet fleek http co xxunk xxunk xxunk,xxbos southwestair xxunk,xxbos americanair tired sitting delayed computer
y: CategoryList
negative,negative,neutral,neutral,negative
Path: .;

Valid: LabelList (5856 items)
x: TextList
y: CategoryList
negative,negative,neutral,neutral,neutral
Path: .;

Test: None, model=SequentialRNN(
  (0): MultiBatchEncoder(
    (module): AWD_LSTM(
      (encoder): Embedding(4464, 400, padding_idx=1)
      (encoder_dp): EmbeddingDropout(
        (emb): Embedding(4464, 400, padding_idx=1)
      )
      (rnns): ModuleList(
        (0): WeightDropout(
          (module): LSTM(400, 1152, batch_first=True)
        )
        (1): WeightDropout(
          (module): LSTM(1152, 1152, batch_first=True)
        )
        (2): WeightDrop

In [16]:
data_clas.show_batch()

text,target
xxbos southwestair needs make whole used tkt back n phx due rude sna agent jacquie plitt flew usairways cabo http co xxunk p,negative
xxbos southwestair could maybe hook xxunk imagine dragon tickets tonight she s hug xxrep 5 e fan amp would really love go,neutral
xxbos americanair file loc xxunk bag airport since last nite scheduled get xxunk xxunk xxunk u shld b ashamed disgusted w u,negative
xxbos usairways customer service dead last xxunk flts delayed cancelled flighted bags lost days last nt flt delayed cancelled flighted meal voucher,negative
xxbos united ticket h biz trvl wifi missed xxunk flt next one h missed meeting food voucher hotel xxunk flt wifi hotel,negative


In [17]:
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.771959,0.687509,0.70987,01:41


In [18]:
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(5e-3/2., 5e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.731729,0.672766,0.712944,01:51


In [19]:
learn.unfreeze()
learn.fit_one_cycle(1, slice(2e-3/100, 2e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.6539,0.60769,0.748975,04:06


In [20]:
learn.predict("Horrible service, will not fly again.")

(Category tensor(0), tensor(0), tensor([0.8810, 0.0559, 0.0631]))

In [21]:
# get predictions
preds, targets = learn.get_preds()

predictions = np.argmax(preds, axis = 1)
pd.crosstab(predictions, targets)

col_0,0,1,2
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3509,792,334
1,79,328,62
2,83,120,549


## Comparative Metrics

In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
results = []
results.append(
    [accuracy_score(targets, predictions),
     precision_score(targets, predictions, average = 'macro'),
    recall_score(targets, predictions, average = 'macro'),
    f1_score(targets, predictions, average = 'macro')]
)
resultsinDataFrame = pd.DataFrame(results, columns = ['Accuracy', 'Precision', 'Recall', 'F1 Score'], index = ['ULMFiT'])
resultsinDataFrame

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
ULMFiT,0.748975,0.728826,0.600446,0.625269
