## ULMFiT Sentiment

### Apply a supervised or semi-supervised ULMFiT model to the Twitter US Airline Sentiment dataset

In [1]:
import pandas as pd
import fastai
import nltk
import sklearn

In [2]:
# Read the tweet export from the working directory.
data = pd.read_csv(filepath_or_buffer='tweets.csv')
# Wrap the loaded table in the frame the rest of the notebook works on.
df = pd.DataFrame(data=data)



In [3]:
# Shorten the three verbose column names; every other column keeps its name.
df = df.rename(
    columns={
        "airline_sentiment": "sentiment",
        "airline_sentiment_confidence": "confidence",
        "negativereason_confidence": "neg_confidence",
    }
)
df.columns

Index(['tweet_id', 'sentiment', 'confidence', 'negativereason',
       'neg_confidence', 'airline', 'airline_sentiment_gold', 'name',
       'negativereason_gold', 'retweet_count', 'text', 'tweet_coord',
       'tweet_created', 'tweet_location', 'user_timezone'],
      dtype='object')

In [4]:
# Keep only the label and the tweet body (in that order); drop everything else.
df = df[['sentiment', 'text']].copy()
df

Unnamed: 0,sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
...,...,...
14635,positive,@AmericanAir thank you we got on a different f...
14636,negative,@AmericanAir leaving over 20 minutes Late Flig...
14637,neutral,@AmericanAir Please bring American Airlines to...
14638,negative,"@AmericanAir you have my money, you change my ..."


In [5]:
# Number of tweets per sentiment label.
df.sentiment.value_counts()

negative    9178
neutral     3099
positive    2363
Name: sentiment, dtype: int64

### Preprocessing

#### Clean the text column by keeping only alphabetic characters and replacing everything else with a space

In [6]:
# Replace every character that is not an ASCII letter (digits, punctuation,
# "@", "#", emoji, ...) with a single space.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently leave the text unchanged.
df['text'] = df['text'].str.replace("[^a-zA-Z]", " ", regex=True)
from nltk.corpus import stopwords
# English stop-word list used by the stop-word removal cell.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# downloaded beforehand (nltk.download('stopwords')) — confirm on a fresh env.
stop_words = stopwords.words('english')
# Spot-check one cleaned tweet.
df['text'][54]

' VirginAmerica Will flights be leaving Dallas for LA on February   th '

In [7]:
# Stop-word removal: split each tweet on whitespace, drop stop words, re-join.

# A set gives constant-time membership tests instead of scanning the
# stop-word list once per token.
stop_word_set = set(stop_words)

# tokenization
tokenized_doc = df['text'].apply(lambda x: x.split())

# remove stop-words
# Matching is case-sensitive, so capitalised forms such as "Will" are kept
# (see the sample row displayed after this cell).
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_word_set])

# de-tokenization
# Iterate over the values in order rather than looking rows up by label
# 0..len(df)-1, so the cell also works when df does not carry the default
# integer index.
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

df['text'] = detokenized_doc


In [8]:
# Same sample tweet as before, now with stop words removed.
df.loc[54, 'text']


'VirginAmerica Will flights leaving Dallas LA February th'

In [9]:
from sklearn.model_selection import train_test_split

# Split the data into a training set (80%) and a held-out validation set (20%).
# stratify keeps the sentiment class proportions the same in both parts.
# random_state pins the shuffle so the split, and every metric computed from
# it, is reproducible across Restart & Run All.
df_trn, df_test = train_test_split(df, stratify = df['sentiment'], test_size = 0.2, random_state = 42)
df_trn.shape, df_test.shape

((11712, 2), (2928, 2))

In [10]:
# Language model data
from fastai.text import *
from fastai import *

# Build the language-model DataBunch from the two splits: df_trn is the
# training part and df_test the validation part. path = "" is shown as
# "Path: ." in the object's repr.
data_lm = TextLMDataBunch.from_df(path = "", train_df = df_trn, valid_df = df_test)

In [11]:
# Display the language-model DataBunch (train/valid sizes and sample tokenized texts).
data_lm

TextLMDataBunch;

Train: LabelList (11712 items)
x: LMTextList
xxbos united help delayed flight i m going miss connection,xxbos united xxmaj the xxunk designed door xxunk extend half foot plane seat a forced always sit,xxbos usairways xxunk xxmaj we like hear xxmaj we sorry taking long get xxmaj stop sending generic responses,xxbos southwestair trying go far away xxmaj king scollegelondon possible charity today xxmaj would help us jailbreak xxup rag,xxbos southwestair xxmaj apparently mechanics flown fix planes xxunk sign
y: LMLabelList
,,,,
Path: .;

Valid: LabelList (2928 items)
x: LMTextList
xxbos united i flew back w company since xxmaj united did nt earlier flight,xxbos jetblue xxmaj thank jetblue xxmaj credit xxmaj nice save,xxbos united customer service terrible,xxbos united thank,xxbos southwestair xxmaj well plane arrived pretty much time degrees ground xxmaj texas xxmaj not much complain
y: LMLabelList
,,,,
Path: .;

Test: None

In [12]:
# Classifier model data
# Built from the same splits as the language-model data, with batch size 32.
# The vocabulary is taken from data_lm's training set — presumably so that the
# classifier's token ids match the encoder fine-tuned on data_lm.
data_clas = TextClasDataBunch.from_df(
    path = "",
    train_df = df_trn,
    valid_df = df_test,
    vocab = data_lm.train_ds.vocab,
    bs = 32,
)

In [13]:
# Display the classifier DataBunch (sample texts together with their sentiment labels).
data_clas

TextClasDataBunch;

Train: LabelList (11712 items)
x: TextList
xxbos united help delayed flight i m going miss connection,xxbos united xxmaj the xxunk designed door xxunk extend half foot plane seat a forced always sit,xxbos usairways xxunk xxmaj we like hear xxmaj we sorry taking long get xxmaj stop sending generic responses,xxbos southwestair trying go far away xxmaj king scollegelondon possible charity today xxmaj would help us jailbreak xxup rag,xxbos southwestair xxmaj apparently mechanics flown fix planes xxunk sign
y: CategoryList
negative,negative,negative,neutral,negative
Path: .;

Valid: LabelList (2928 items)
x: TextList
xxbos united i flew back w company since xxmaj united did nt earlier flight,xxbos jetblue xxmaj thank jetblue xxmaj credit xxmaj nice save,xxbos united customer service terrible,xxbos united thank,xxbos southwestair xxmaj well plane arrived pretty much time degrees ground xxmaj texas xxmaj not much complain
y: CategoryList
negative,positive,negative,positive

## Fine-tuning a language model


In [14]:
# Language-model learner on the AWD_LSTM architecture, with drop_mult set to 0.5.
learn = language_model_learner(data = data_lm, arch = AWD_LSTM, drop_mult = 0.5)


In [15]:
# Train the language-model learner for one one-cycle epoch with max_lr = 1e-2.
learn.fit_one_cycle(cyc_len = 1, max_lr = 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,6.367753,5.352593,0.165268,00:07


In [16]:
# Unfreeze the learner, then train one more epoch at a lower rate (1e-3).
learn.unfreeze()
learn.fit_one_cycle(cyc_len = 1, max_lr = 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,5.132285,4.914657,0.197121,00:07


In [17]:
# Save the fine-tuned encoder under the name 'ft_enc'; the classifier cell
# loads it again by that name.
learn.save_encoder(name = 'ft_enc')

In [18]:
# Rebind `learn` to a text classifier on the same architecture and load the
# encoder that was saved as 'ft_enc'.
learn = text_classifier_learner(data = data_clas, arch = AWD_LSTM, drop_mult = 0.5)
learn.load_encoder(name = 'ft_enc')
# Show a sample batch of tokenized tweets with their target labels.
data_clas.show_batch()

text,target
xxbos usairways xxup fuk u xxup us xxup airways xxup with xxup yo xxup shitty xxup chicken xxup xxunk xxup sandwich xxup that xxup so xxup overpriced xxup and u xxup xxunk xxup make xxup me xxup wait xxup in a xxup hr xxup layover xxup fuk u xxup and,negative
xxbos united i xxup just xxup asked xxup my xxup boyfriend xxup to xxup prom xxup over xxup the xxup xxunk xxup on xxup flight xxup he xxup said xxup yes xxup best xxup day xxup ever xxup thank u xxup so xxup much,positive
xxbos united xxup where xxup is xxup my xxup fucking xxup bag xxmaj where fuck fucking bag xxup tell xxup me xxup now xxup or xxup give xxup me a xxup number xxup to xxup call a xxup human xxup san m,negative
xxbos united xxunk xxup weeks xxmaj late flightr xxup and i xxup still xxup have xxup not xxup received xxup my xxup miles xxup from xxup the mileageplus xxmaj gift xxmaj card xxup xxunk xxup card i xxup handed xxup over,negative
xxbos southwestair xxmaj thx xxmaj ops xxmaj agt xxmaj xxunk xxmaj xxunk n xxmaj flight xxmaj xxunk xxmaj xxunk xxup den xxmaj airport xxmaj held flight n even saved seat xxmaj bus xxmaj select xxunk,positive


In [19]:
# First classifier epoch: one one-cycle epoch with max_lr = 1e-2.
learn.fit_one_cycle(cyc_len = 1, max_lr = 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.734242,0.646072,0.728825,01:21


In [20]:
# NOTE(review): freeze_to(-2) presumably leaves only the last two layer groups
# trainable — confirm against the fastai docs.
learn.freeze_to(-2)
# One more epoch, with the learning rate given as a slice from 5e-3/2 to 5e-3.
learn.fit_one_cycle(cyc_len = 1, max_lr = slice(5e-3/2., 5e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.663252,0.607663,0.750342,01:21


In [21]:
# get predictions
# get_preds() is called with its defaults and returns the per-class scores
# together with the true labels.
# NOTE(review): presumably this scores the validation set — the confusion
# matrix built from it sums to 2928, the size of df_test; confirm.
preds, targets = learn.get_preds()

In [22]:
import numpy as np

In [23]:
# Predicted class = index of the highest score in each row of preds.
predictions = np.argmax(preds, axis = 1)
# Confusion matrix: rows are predicted classes, columns are true classes.
# Both axes are named so the table says which side is which instead of showing
# the anonymous row_0 / col_0 headers.
# NOTE(review): class indices 0/1/2 presumably map to negative/neutral/positive
# (the column totals match the stratified class sizes) — confirm via
# data_clas.classes.
pd.crosstab(predictions, targets, rownames = ['predicted'], colnames = ['actual'])

col_0,0,1,2
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1718,372,156
1,70,203,40
2,48,45,276


## Summary
The fine-tuned ULMFiT classifier achieved a validation accuracy of 0.750342 (about 75%) on the held-out 20% of the tweets.