# Language Modeling & Sentiment Analysis of IMDB movie reviews

In [3]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai import *
from fastai.text import *

#### Get the IMDb database

In [6]:
path = untar_data(URLs.IMDB)

## 1. IMDb Language model

#### batch size of 48 works for the language model

In [20]:
bs=48

In [5]:
%%time

# throws `BrokenProcessPool` Error sometimes. Keep trying `till it works!
count = 0
error = True
while error:
    try: 
        # Preprocessing steps
        data_lm = (TextList.from_folder(path)
           #Inputs: all the text files in path
            .filter_by_folder(include=['train', 'test', 'unsup']) 
           # notebook 3-logreg-nb-imbd used .split_by_folder instead of .filter_by_folder
            # and this took less time to run. Can we do the same here?
           #We may have other temp folders that contain text files so we only keep what's in train and test
            .split_by_rand_pct(0.1, seed=42))
           #We randomly split and keep 10% (10,000 reviews) for validation
            #.label_for_lm()           
           #We want to make a language model so we label accordingly
            #.databunch(bs=bs, num_workers=1))
        error = False
        print(f'failure count is {count}\n')    
    except: # catch *all* exceptions
        # accumulate failure count
        count = count + 1
        print(f'failure count is {count}')

failure count is 0

Wall time: 2.04 s


In [6]:
%%time

# throws `BrokenProcessPool' Error sometimes. Keep trying `till it works!
count = 0
error = True
while error:
    try: 
        # Preprocessing steps
        #     the next step is the bottleneck
        data_lm = (data_lm.label_for_lm()           
           #We want to make a language model so we label accordingly
            .databunch(bs=bs, num_workers=1))
        error = False
        print(f'failure count is {count}\n')    
    except: # catch *all* exceptions
        # accumulate failure count
        count = count + 1
        print(f'failure count is {count}')

failure count is 1

Wall time: 5min 3s


In [7]:
data_lm.save('lm_databunch')

In [8]:
data_lm = load_data(path, 'lm_databunch', bs=bs)

In [9]:
learn_lm = language_model_learner(data_lm, AWD_LSTM, drop_mult=1.).to_fp16()

In [10]:
lr = 1e-2
lr *= bs/48

In [11]:
learn_lm.fit_one_cycle(1, lr, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,4.479124,4.137096,0.284542,16:27


In [12]:
# ASUS ROG STRIX SCAR II with NVIDIA RTX-2070
# Why does the processing time tend to increase with epoch number?
#      19 minutes per epoch for first few, then 1:18, then 1:44. 
learn_lm.unfreeze()
learn_lm.fit_one_cycle(10, lr/10, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,4.103189,3.933051,0.30643,18:43
1,4.02814,3.87275,0.315896,18:53
2,3.993248,3.838678,0.320412,1:18:54
3,3.935021,3.795683,0.32528,1:43:51
4,3.896384,3.763514,0.32902,1:43:55
5,3.838582,3.729117,0.332492,1:43:52
6,3.777858,3.702898,0.335487,1:44:10
7,3.761096,3.684962,0.337716,1:43:45
8,3.717375,3.674535,0.338906,1:43:45
9,3.721038,3.672052,0.339102,1:43:58


In [13]:
learn_lm.save('fine_tuned_10')
learn_lm.save_encoder('fine_tuned_enc_10')

## 2. IMDb Movie Review Classifier

#### Decrease batch size to 32 for the classifier

In [4]:
bs=32

#### Preprocess data into a databunch and save it

In [16]:
%%time

# throws `BrokenProcessPool' Error sometimes. Keep trying `till it works!
count = 0
error = True
while error:
    try: 
        # Preprocessing steps
        data_clas = (TextList.from_folder(path, vocab=data_lm.vocab)
             .split_by_folder(valid='test')
             .label_from_folder(classes=['neg', 'pos'])
             .databunch(bs=bs, num_workers=1))        
        error = False
        print(f'failure count is {count}\n')    
    except: # catch *all* exceptions
        # accumulate failure count
        count = count + 1
        print(f'failure count is {count}')

failure count is 26

Wall time: 26min 25s


In [17]:
data_clas.save('imdb_textlist_class')

In [7]:
data_clas = load_data(path, 'imdb_textlist_class', bs=bs, num_workers=1)

#### Step 1: Train with pretrained weights

In [None]:
learn_c = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5).to_fp16()
learn_c.load_encoder('fine_tuned_enc_10')
learn_c.freeze()

In [9]:
lr=2e-2
lr *= bs/48

In [10]:
learn_c.fit_one_cycle(1, lr, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.23196,0.182936,0.93004,02:09


In [11]:
learn_c.save('1')

#### Step 2: Train last layer

In [12]:
# Unfreeze the last layer
learn_c.freeze_to(-2)
learn_c.fit_one_cycle(1, slice(lr/(2.6**4),lr), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.225318,0.162057,0.93948,02:26


In [13]:
learn_c.save('2nd')

#### Step 3: Train last two layers

In [14]:
# Unfreeze the last two layers
learn_c.freeze_to(-3)
learn_c.fit_one_cycle(1, slice(lr/2/(2.6**4),lr/2), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.180939,0.140976,0.94868,03:28


In [15]:
learn_c.save('3rd')

In [17]:
learn_c.load('3rd')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (25000 items)
x: TextList
xxbos xxmaj story of a man who has unnatural feelings for a pig . xxmaj starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is turned into an insane , violent mob by the crazy xxunk of it 's singers . xxmaj unfortunately it stays absurd the xxup whole time with no general narrative eventually making it just too off putting . xxmaj even those from the era should be turned off . xxmaj the cryptic dialogue would make xxmaj shakespeare seem easy to a third grader . xxmaj on a technical level it 's better than you might think with some good cinematography by future great xxmaj vilmos xxmaj zsigmond . xxmaj future stars xxmaj sally xxmaj kirkland and xxmaj frederic xxmaj forrest can be seen briefly .,xxbos xxmaj airport ' 77 starts as a brand new luxury 747 plane is loaded up with valuable paintings & such belonging to rich businessman xxmaj philip xxmaj steven

#### Step 4: Unfreeze all  layers and train, save the resulting classifier model

In [18]:
# unfreeze all the weights
# throws CUDA out of memory error with batch size 48; reducing batch size to 32 size works
learn_c.unfreeze()
learn_c.fit_one_cycle(2, slice(lr/10/(2.6**4),lr/10), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.140903,0.140118,0.94844,04:23
1,0.123696,0.14263,0.94928,04:26


In [19]:
learn_c.save('clas')