# Multilabel text classification with fastai v1

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai import *
from fastai.text import *

In [3]:
# Root data folder; the train/validation CSVs are read from here.
path = Path('data')

## Prepare dataframe

In [4]:
# Read the training and validation CSVs with identical parsing options
# (first row skipped, label column forced to str), then preview the
# training frame.
df_trn, df_val = (
    pd.read_csv(
        path/csv_name,
        skiprows=1,
        low_memory=False,
        dtype={'Contributing Factors / Situations': str},
    )
    for csv_name in ('train_Robinson.csv', 'valid.csv')
)
df_trn.head(5)

Unnamed: 0,ACN,Date,Local Time Of Day,Locale Reference,State Reference,Relative Position.Angle.Radial,Relative Position.Distance.Nautical Miles,Altitude.AGL.Single Value,Altitude.MSL.Single Value,Flight Conditions,...,When Detected,Result,Contributing Factors / Situations,Primary Problem,Narrative,Callback,Narrative.1,Callback.1,Synopsis,Unnamed: 96
0,925545,201101,0001-0600,ZNY.ARTCC,NY,,,,19000.0,,...,,Air Traffic Control Issued New Clearance,Aircraft; Equipment / Tooling; Human Factors,Human Factors,[I was] working air carrier 1 at FL190. Sector...,,Air carrier 1 [was] being worked by Sector 56 ...,,ZNY controllers and flight crew personnel desc...,
1,925560,201101,1201-1800,ZZZ.Tower,US,,,0.0,,,...,Taxi,Air Traffic Control Issued New Clearance; Air ...,Human Factors,Human Factors,Aircraft X; a CRJ2; landed Runway XXL and was ...,,,,Disabled aircraft causes airport authority to ...,
2,925563,201101,1201-1800,ZOB.ARTCC,OH,,,,15000.0,,...,In-flight,Air Traffic Control Issued New Clearance; Air ...,Human Factors; Procedure; Staffing,Human Factors,I was working combined sectors with no D-side....,,,,ZOB Controller reported that when an aircraft ...,
3,925810,201101,0001-0600,ZZZ.Airport,US,,,0.0,,,...,Aircraft In Service At Gate,Flight Crew Returned To Gate; General Maintena...,Procedure; Human Factors; Aircraft; Staffing; ...,Aircraft,[We] arrived to a cold airplane with ambient t...,,,,A CRJ flight crew suffered a hot start on a fi...,
4,925811,201101,0601-1200,ZZZ.Airport,US,,,,32500.0,IMC,...,In-flight,Air Traffic Control Provided Assistance; Aircr...,Aircraft; Weather,Aircraft,During the climbout we entered the ragged over...,,,,B717 Captain experienced simultaneous overspee...,


In [5]:
# Sanity check: (rows, columns) of the training and validation frames.
df_trn.shape, df_val.shape

((4500, 97), (2948, 97))

In [6]:
# Number of training rows whose label column is missing.
df_trn['Contributing Factors / Situations'].isna().sum()

15

In [7]:
# Number of validation rows whose label column is missing.
df_val['Contributing Factors / Situations'].isna().sum()

13

In [8]:
# Drop rows that have no label. Reassigning the result (instead of
# inplace=True) keeps each frame's lineage explicit and chain-friendly.
df_trn = df_trn.dropna(axis=0, subset=['Contributing Factors / Situations'])
df_val = df_val.dropna(axis=0, subset=['Contributing Factors / Situations'])

In [9]:
# Row counts after dropping the unlabelled rows.
len(df_trn), len(df_val)

(4485, 2935)

In [11]:
# Replace label delimiter '; ' by single char '|' to allow compatibility with fastai 1.0.28
# regex=False makes this a literal substring replacement regardless of the
# installed pandas version's default for `regex`.

df_trn['Contributing Factors / Situations'] = df_trn['Contributing Factors / Situations'].str.replace('; ', '|', regex=False).astype(str)

df_val['Contributing Factors / Situations'] = df_val['Contributing Factors / Situations'].str.replace('; ', '|', regex=False).astype(str)


## Language model

Fine tune the language model

In [None]:
# Pool the training and validation rows into one frame with a fresh index;
# this combined frame is the input to the language-model data below.
df = pd.concat([df_trn, df_val], ignore_index=True)

In [12]:
# Batch size passed to the language-model databunch.
bs = 48

In [None]:
# Language-model data built from the 'Narrative' column of the pooled frame:
# hold out a random 10% split, label for language modelling, batch with `bs`.
# NOTE(review): no seed is passed to the random split, so the held-out 10%
# may differ between runs — confirm whether a fixed seed is wanted.
data_lm = (TextList.from_df(df, path, cols='Narrative')
                .random_split_by_pct(0.1)
                .label_for_lm()
                .databunch(bs=bs))

In [None]:
# Cache the processed language-model data under the name 'tmp_lm'
# (it is reloaded with TextLMDataBunch.load in the classifier section).
data_lm.save('tmp_lm')

In [None]:
# Language-model learner initialised from the WT103 pretrained model,
# with dropout multiplier 0.3.
learn_lm = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.3)

In [None]:
# Run the learning-rate finder on the language-model learner and plot it.
learn_lm.lr_find()
# Plot from the learner that actually ran lr_find. `learn` (the classifier
# learner) is not defined until the classifier section, so the original
# `learn.recorder.plot` fails on a fresh kernel.
learn_lm.recorder.plot(skip_end=10)

In [None]:
# One epoch of one-cycle training at learning rate 1e-2 with momentum range
# (0.8, 0.7); the result is checkpointed as 'fit_head' in the next cell.
learn_lm.fit_one_cycle(1, 1e-2, moms=(0.8,0.7))

In [None]:
# Checkpoint the language model after the first training cycle.
learn_lm.save('fit_head')

In [None]:
# Reload the 'fit_head' checkpoint saved above.
learn_lm.load('fit_head')

In [None]:
# Unfreeze the learner and train 10 more epochs at learning rate 1e-3.
learn_lm.unfreeze()
learn_lm.fit_one_cycle(10, 1e-3, moms=(0.8,0.7))

In [None]:
# Plot the losses recorded by the language-model learner.
learn_lm.recorder.plot_losses()

In [None]:
# Checkpoint the fine-tuned language model as 'fine_tuned'.
learn_lm.save('fine_tuned')

In [None]:
# Save the encoder on its own as 'fine_tuned_enc'; the classifier learner
# loads it later via load_encoder.
learn_lm.save_encoder('fine_tuned_enc')

The fine-tuned models *fit_head*, *fine_tuned* and *fine_tuned_enc* are saved in the default `path/models` folder.

The numericalized text and the integer-to-string dictionary are saved in the user-defined `path/tmp_lm` folder.

## Classifier

In [12]:
# Batch size for the classifier section (same value as in the language-model
# section; presumably restated so this section can be run on its own).
bs = 48

In [13]:
# Reload the cached language-model data ('tmp_lm'); its vocab is reused when
# building the classifier data.
data_lm = TextLMDataBunch.load(path, 'tmp_lm', bs=bs)

In [14]:
def precision(log_preds, targs, thresh=0.5, epsilon=1e-8, sigmoid=False):
    """Precision for multi-label predictions, pooled over every element.

    A score strictly above `thresh` counts as a predicted positive. The result
    is (true positives) / (predicted positives + epsilon), where the sums run
    over the whole tensor rather than per label.

    Args:
        log_preds: tensor of per-label scores, compared element-wise to `targs`.
        targs: tensor of targets; a true positive is an element where the
            thresholded prediction equals the target and the target is non-zero.
        thresh: cut-off applied to the scores.
        epsilon: small constant added to the denominator so the division stays
            defined when nothing is predicted positive.
        sigmoid: when True, apply a sigmoid to `log_preds` before thresholding.
            The default (False) thresholds the scores exactly as passed in.
            NOTE(review): if the learner hands raw logits to its metrics, pass
            sigmoid=True so `thresh` is a probability cut-off — confirm against
            how `accuracy_thresh` treats its input in the installed fastai.

    Returns:
        A 0-dim float tensor holding the precision.
    """
    scores = torch.sigmoid(log_preds) if sigmoid else log_preds
    pred_pos = (scores > thresh).float()
    # Keep only the agreeing elements whose target is positive.
    tpos = torch.mul((targs == pred_pos).float(), targs.float())
    return tpos.sum() / (pred_pos.sum() + epsilon)

In [15]:
def recall(log_preds, targs, thresh=0.5, epsilon=1e-8, sigmoid=False):
    """Recall for multi-label predictions, pooled over every element.

    A score strictly above `thresh` counts as a predicted positive. The result
    is (true positives) / (sum of targets + epsilon), where the sums run over
    the whole tensor rather than per label.

    Args:
        log_preds: tensor of per-label scores, compared element-wise to `targs`.
        targs: tensor of targets; a true positive is an element where the
            thresholded prediction equals the target and the target is non-zero.
        thresh: cut-off applied to the scores.
        epsilon: small constant added to the denominator so the division stays
            defined when there are no positive targets.
        sigmoid: when True, apply a sigmoid to `log_preds` before thresholding.
            The default (False) thresholds the scores exactly as passed in.
            NOTE(review): if the learner hands raw logits to its metrics, pass
            sigmoid=True so `thresh` is a probability cut-off — confirm against
            how `accuracy_thresh` treats its input in the installed fastai.

    Returns:
        A 0-dim float tensor holding the recall.
    """
    scores = torch.sigmoid(log_preds) if sigmoid else log_preds
    pred_pos = (scores > thresh).float()
    # Keep only the agreeing elements whose target is positive.
    tpos = torch.mul((targs == pred_pos).float(), targs.float())
    return tpos.sum() / (targs.sum() + epsilon)

In [None]:
# Classifier data built from the train/validation frames. Text comes from the
# 'Narrative' column; the label column holds several labels per row separated
# by '|' (hence label_delim). The language-model vocab is passed in via `vocab=`.
data_clas = TextClasDataBunch.from_df(path, train_df=df_trn, valid_df=df_val, 
                                  vocab=data_lm.vocab, 
                                  text_cols='Narrative', 
                                  label_cols='Contributing Factors / Situations',
                                  label_delim='|',
                                  bs=bs)

In [None]:
# Cache the processed classifier data under the name 'tmp_clas'.
data_clas.save('tmp_clas')

In [16]:
# Classifier learner with dropout multiplier 0.5. It reports thresholded
# accuracy plus the custom precision/recall metrics, and loads the encoder
# that the language-model section saved as 'fine_tuned_enc'.
learn = text_classifier_learner(data_clas, drop_mult=0.5)
learn.metrics = [accuracy_thresh, precision, recall]
learn.load_encoder('fine_tuned_enc')

In [17]:
# Stage 1: call freeze(), then train one epoch at learning rate 3e-2.
learn.freeze()
learn.fit_one_cycle(1, 3e-2, moms=(0.8,0.7))

Total time: 00:37
epoch  train_loss  valid_loss  accuracy_thresh  precision  recall  
1      0.237107    0.216778    0.911840         0.795298   0.435772  (00:37)



In [None]:
# Checkpoint the classifier after stage 1.
learn.save('first_factors')

In [None]:
# Reload the stage-1 checkpoint saved above.
learn.load('first_factors')

In [18]:
# Stage 2: freeze_to(-2), then two epochs with a learning-rate slice from
# 1e-2/(2.6**4) up to 1e-2 and weight decay 0.1.
learn.freeze_to(-2)
learn.fit_one_cycle(2, slice(1e-2/(2.6**4),1e-2), moms=(0.8,0.7), wd=0.1)

Total time: 01:26
epoch  train_loss  valid_loss  accuracy_thresh  precision  recall  
1      0.225615    0.214058    0.911989         0.784269   0.507307  (00:41)
2      0.213732    0.210868    0.911520         0.750227   0.537914  (00:45)



In [None]:
# Checkpoint the classifier after stage 2.
learn.save('second_factors')

In [None]:
# Reload the stage-2 checkpoint saved above.
learn.load('second_factors')

In [None]:
# Stage 3: unfreeze, then two epochs with the learning-rate slice lowered to
# 1e-3/(2.6**4) .. 1e-3 and weight decay 0.1.
learn.unfreeze()
learn.fit_one_cycle(2, slice(1e-3/(2.6**4),1e-3), moms=(0.8,0.7), wd=0.1)

In [17]:
# Predict on the validation dataset
# get_preds() returns a (predictions, targets) pair.
# NOTE(review): the cells below threshold y_pred at 0.35 directly — confirm
# whether get_preds returns probabilities or raw scores in this fastai version.
y_pred, y_true =  learn.get_preds()



The classifier models *first_factors* and *second_factors* are saved in the default `path/models` folder.

The numericalized text, the labels and the integer-to-string dictionary are saved in the user-defined `path/tmp_clas` folder.

## Evaluate results

In [18]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

In [39]:
from sklearn.metrics import classification_report

In [33]:
# Micro-averaged F1 on the validation predictions, counting a label as
# predicted when its score exceeds 0.35.
# NOTE(review): the 0.35 cut-off is repeated in the classification_report
# cell — keep the two in sync.
f1_score(y_true, y_pred>0.35, average='micro')

0.6634428004330567

In [20]:
# Convert targets and predictions to NumPy arrays for the scikit-learn report.
# The guard makes the cell safe to re-run: once `y_true` has been converted it
# no longer has a `.numpy()` method, and the unguarded call would raise.
if hasattr(y_true, 'numpy'):
    y_true = y_true.numpy()
scores = y_pred.numpy()

In [35]:
# Confirm the score and target arrays have matching shapes.
scores.shape, y_true.shape

((2935, 16), (2935, 16))

In [None]:
# Per-label precision/recall/F1 returned as a dict (output_dict=True), using
# the same 0.35 score cut-off as the micro-F1 cell and the validation set's
# class names as label names.
metrics = classification_report(y_true, scores>0.35, 
                                target_names=data_clas.valid_ds.classes,
                                digits=3, output_dict=True)

In [61]:
# Transpose the report so each label/average is a row and each metric a column.
results = pd.DataFrame(metrics).T

In [63]:
# Order rows by support, largest first.
results = results.sort_values('support', ascending=False)

In [78]:
# Cast support to int and express the three score columns as percentages.
# Note: this rescales whatever is currently in `results`, so re-running this
# cell on its own multiplies the scores by 100 again.
results = results.assign(
    support=results['support'].astype(int),
    **{score_col: results[score_col]*100
       for score_col in ('f1-score', 'precision', 'recall')}
)

In [79]:
# Set pandas' global float display format to one decimal place with a
# thousands separator, then show the results table.
pd.options.display.float_format = '{:,.1f}'.format
results

Unnamed: 0,f1-score,precision,recall,support
samples avg,69.1,68.0,78.9,6301
micro avg,66.3,60.8,72.9,6301
weighted avg,64.3,61.3,72.9,6301
macro avg,36.0,45.4,38.1,6301
Human Factors,79.5,71.3,89.7,1672
Aircraft,84.7,77.1,94.1,1635
Procedure,52.5,37.2,89.1,754
Company Policy,57.7,65.6,51.4,624
Weather,62.5,64.0,61.1,270
Chart Or Publication,51.1,46.9,56.0,259
