In [1]:
import sqlite3 as lite
import pandas as pd
import numpy as np
from pandas.io.sql import read_sql


In [2]:
import os
from fastai.text import *
bs = 16  # batch size handed to load_data and TextClasDataBunch.from_df

In [3]:
print(os.getcwd())
# Directory that holds the saved DataBunches, encoder weights and exports.
# Kept as a plain str because later cells build file names with `path + '/...'`.
# Set the PITCHFORK_DATA_DIR environment variable to run on another machine;
# without it the original location is used.
path = os.environ.get(
    'PITCHFORK_DATA_DIR',
    '/media/jlealtru/data_files/github/Tutorials/TextAnalytics/pitchfork_data',
)

/media/jlealtru/data_files/github/Tutorials/TextAnalytics/pitchfork_data


In [4]:
# Restore the language-model DataBunch saved earlier; its vocab is reused
# when the classification DataBunch is built.
data_lm = load_data(
    path=path,
    file='data_lm_pitchfork.pkl',
    bs=bs,
)

In [5]:
# Load every review text together with its review metadata from the SQLite dump.
from contextlib import closing

# sqlite3's own `with con:` only scopes a transaction and leaves the handle
# open; closing() guarantees the connection is released once the frame is read.
with closing(lite.connect('database.sqlite')) as con:
    df = read_sql(""" SELECT a.content, b.* FROM content a LEFT JOIN reviews b """
                  """on a.reviewid = b.reviewid """, con, coerce_float=True, params=None)

# Keep only the columns the classifier needs. The explicit copy makes the
# subset an independent frame, so adding `label` below cannot raise
# SettingWithCopyWarning.
df = df[['content', 'score', 'reviewid']].copy()
print(len(df))

# Bucket the numeric score into three classes:
#   score >= 8 -> 'good', score < 6 -> 'low', everything else -> 'medium'.
# Conditions are evaluated in order; rows matching neither (including a
# missing score) fall through to 'medium', exactly as the nested np.where did.
df['label'] = np.select(
    [df['score'] >= 8, df['score'] < 6],
    ['good', 'low'],
    default='medium',
)
df.head(1)

18401


Unnamed: 0,content,score,reviewid,label
0,"“Trip-hop” eventually became a ’90s punchline,...",9.3,22703,good


In [6]:
# Rows per label, to see how unevenly the three classes are populated.
df.groupby(by='label').count()

Unnamed: 0_level_0,content,score,reviewid
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
good,3829,3829,3829
low,3004,3004,3004
medium,11568,11568,11568


In [7]:
# Hold out 20% of the reviews for validation. The split is stratified on the
# label so both parts keep the same class proportions, and seeded so it is
# reproducible.
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

In [8]:
# Build the classification DataBunch from the two frames, reusing the
# language model's vocab so token ids line up with the fine-tuned encoder.
data_classification = TextClasDataBunch.from_df(
    path=".",
    train_df=train_df,
    valid_df=val_df,
    vocab=data_lm.vocab,
    text_cols="content",
    label_cols="label",
    bs=bs,
)

# Persist the processed DataBunch so it can be reloaded later.
clas_bunch_file = path+'/data_clas_pitch.pkl'
data_classification.save(clas_bunch_file)

In [None]:
#data_classification.save(path+'/data_clas_pitch.pkl')

In [9]:
# calculate weights for imbalanded class
trn_labelcounts = train_df.groupby(["label"]).size()
val_labelcounts = val_df.groupby(["label"]).size()
trn_label_sum = len(train_df["label"])
val_label_sum = len(val_df["label"])
trn_weights = [count/trn_label_sum for count in trn_labelcounts]
val_weights = [count/val_label_sum for count in val_labelcounts]
trn_weights, val_weights
rtd_val_weights = [max(val_weights)/value for value in val_weights]
rtd_val_weights = torch.FloatTensor(rtd_val_weights).cuda()

In [10]:
# Show the class weights; entries follow the sorted label order produced by
# groupby (good, low, medium), the most frequent class having weight 1.
rtd_val_weights

tensor([3.0209, 3.8502, 1.0000], device='cuda:0')

In [11]:
# AWD-LSTM classifier created without downloading pretrained weights; the
# encoder is loaded from the `fine_tuned_enc` file under `path` instead.
learn_classifier = text_classifier_learner(
    data_classification,
    AWD_LSTM,
    drop_mult=0.4,
    pretrained=False,
)
learn_classifier.load_encoder(path+'/fine_tuned_enc')

In [12]:
# Use a class-weighted cross-entropy loss so the rarer labels count for more,
# and report accuracy, precision, recall and F-beta after every epoch.
learn_classifier.loss_func = nn.CrossEntropyLoss(weight=rtd_val_weights)
print('about to start')
# Precision/Recall default to average='binary', which is invalid for a
# three-class problem: fastai then warns and silently switches to 'macro'.
# Asking for 'macro' explicitly yields the same numbers without the warning.
learn_classifier.metrics = [accuracy,
                            Precision(average='macro'),
                            Recall(average='macro'),
                            FBeta(average='weighted')]

about to start


In [13]:
# First pass: a single one-cycle epoch at half of the base learning rate.
# `lr` is kept as a notebook-level variable because the following cells
# halve it before each further stage.
lr = 0.1
learn_classifier.fit_one_cycle(1, lr / 2, moms=(0.8, 0.7))

epoch,train_loss,valid_loss,accuracy,precision,recall,f_beta,time
0,0.867933,0.872609,0.574572,0.549372,0.617589,0.572864,16:17


  warn("average=`binary` was selected for a non binary case. Value for average has now been set to `macro` instead.")


In [14]:
# Second pass: freeze_to(-2), halve the learning rate, then train one epoch
# with a learning-rate range running from lr / 2.6**4 up to lr.
learn_classifier.freeze_to(-2)
lr = lr / 2
lr_range = slice(lr / (2.6 ** 4), lr)
learn_classifier.fit_one_cycle(1, lr_range, moms=(0.8, 0.7))

epoch,train_loss,valid_loss,accuracy,precision,recall,f_beta,time
0,0.854841,4.379245,0.623744,0.582229,0.661506,0.622261,17:12


In [15]:
# Third pass: freeze_to(-3), halve the learning rate again and train one more
# epoch over the range lr / 2.6**4 .. lr.
learn_classifier.freeze_to(-3)
lr = lr / 2
lr_range = slice(lr / (2.6 ** 4), lr)
learn_classifier.fit_one_cycle(1, lr_range, moms=(0.8, 0.7))

epoch,train_loss,valid_loss,accuracy,precision,recall,f_beta,time
0,0.961439,0.984511,0.472698,0.46755,0.557565,0.46351,20:26


In [16]:
# Final pass: unfreeze the whole model, halve the learning rate once more and
# train three epochs over the range lr / 2.6**4 .. lr.
learn_classifier.unfreeze()
lr = lr / 2
lr_range = slice(lr / (2.6 ** 4), lr)
learn_classifier.fit_one_cycle(3, lr_range, moms=(0.8, 0.7))

epoch,train_loss,valid_loss,accuracy,precision,recall,f_beta,time
0,0.845182,1.181797,0.372453,0.535196,0.513299,0.33048,23:33
1,0.773269,1.461936,0.630807,0.591452,0.596395,0.632457,23:34
2,0.537712,1.297095,0.60853,0.563133,0.626462,0.608507,23:36


In [None]:
# Persist the tuned classifier: save() stores the checkpoint, export() writes
# the exported learner file.
# Both now go under `path`. The checkpoint used to target a Colab Google Drive
# mount ('/content/gdrive/My Drive/nlp_models/'), which does not exist on the
# machine the rest of this notebook reads from and writes to.
learn_classifier.save(path + '/pitch_classifier_tuned')
learn_classifier.export(path+'/'+'export_classifier_pitch.pkl')

In [None]:
# Plot a confusion matrix from the learner's predictions, targets and
# per-item losses.
from fastai.vision import *

preds, y, losses = learn_classifier.get_preds(with_loss=True)
interp = ClassificationInterpretation(learn_classifier, preds, y, losses)
interp.plot_confusion_matrix()

# NOTE(review): this writes the classification DataBunch, not the confusion
# matrix, under the name 'confusion.pkl' - confirm that is what was intended.
confusion_target = path+'/confusion.pkl'
data_classification.save(confusion_target)