# Language Modeling & Sentiment Analysis of IMDB movie reviews

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai import *
from fastai.text import *

#### Get the IMDb database

In [2]:
# Fetch the IMDb dataset with fastai's `untar_data`. `path` is the dataset root
# that later cells pass to TextList.from_folder(...) and load_data(...).
path = untar_data(URLs.IMDB)

## 1. IMDb Language model

#### Add the 50,000 unlabelled reviews from the `unsup` folder, because labels are not needed to train a language model!
#### batch size of 48 works for the language model

In [3]:
# Batch size for the language model. It is passed to .databunch()/load_data()
# below and also scales the learning rate (lr is multiplied by bs/48).
bs=48

In [5]:
%%time

# Build the language-model corpus: list the review files and split off a
# validation set. This step sometimes throws `BrokenProcessPool`, so it is
# retried until it works -- but only up to `max_failures` times and only for
# ordinary exceptions, so the cell can still be interrupted (Ctrl-C / kernel
# interrupt) and a permanent error is surfaced instead of looping forever.
max_failures = 100   # give up after this many failed attempts
count = 0            # number of failed attempts so far
error = True         # stays True until one attempt succeeds
while error:
    try:
        # Preprocessing steps
        data_lm = (TextList.from_folder(path)
            # Inputs: all the text files in path
            .filter_by_folder(include=['train', 'test', 'unsup'])
            # Keep only train, test and unsup: path may hold other (temp)
            # folders that also contain text files.
            # TODO: notebook 3-logreg-nb-imbd used .split_by_folder instead of
            # .filter_by_folder and took less time to run. Can we do the same here?
            # Randomly split and keep 10% (10,000 reviews) for validation
            .split_by_rand_pct(0.1, seed=42))
        # Labelling (.label_for_lm()) and .databunch(bs=bs, num_workers=1)
        # are done in the next cell.
        error = False
        print(f'failure count is {count}\n')
    except Exception as exc:
        # `Exception`, not a bare `except:` -- KeyboardInterrupt/SystemExit
        # must not be swallowed by the retry loop.
        count += 1
        print(f'failure count is {count} ({type(exc).__name__}: {exc})')
        if count >= max_failures:
            raise  # out of retries: show the real traceback

failure count is 0

Wall time: 2.04 s


In [6]:
%%time

# Label the split corpus for language modelling and build the DataBunch.
# This step sometimes throws `BrokenProcessPool`, so it is retried until it
# works -- but only up to `max_failures` times and only for ordinary
# exceptions, so the cell can still be interrupted and a permanent error is
# surfaced instead of looping forever.
# NOTE: this cell rebinds `data_lm` (split lists -> DataBunch). Re-running it
# on its own would call label_for_lm() on the DataBunch; re-run the previous
# cell first.
max_failures = 100   # give up after this many failed attempts
count = 0            # number of failed attempts so far
error = True         # stays True until one attempt succeeds
while error:
    try:
        # Preprocessing steps
        #     the next step is the bottleneck
        # We want to make a language model so we label accordingly
        data_lm = (data_lm.label_for_lm()
            .databunch(bs=bs, num_workers=1))
        error = False
        print(f'failure count is {count}\n')
    except Exception as exc:
        # `Exception`, not a bare `except:` -- KeyboardInterrupt/SystemExit
        # must not be swallowed by the retry loop.
        count += 1
        print(f'failure count is {count} ({type(exc).__name__}: {exc})')
        if count >= max_failures:
            raise  # out of retries: show the real traceback

failure count is 1

Wall time: 5min 3s


In [7]:
# Cache the processed language-model DataBunch under the name 'lm_databunch'
# so the slow preprocessing above (about 5 minutes in the recorded run) does
# not have to be repeated; the next cell reloads it by that name.
data_lm.save('lm_databunch')

In [4]:
# Reload the cached DataBunch saved as 'lm_databunch', using the current
# batch size. Requires `path` and `bs` from the cells above.
data_lm = load_data(path, 'lm_databunch', bs=bs)

In [5]:
# Create the language-model learner (AWD_LSTM architecture, drop_mult=1.).
# use mixed-precision training (.to_fp16()) to speed up processing and reduce memory footprint
# need NVIDIA GPU with 'tensor cores' such as RTX-2070
learn_lm = language_model_learner(data_lm, AWD_LSTM, drop_mult=1.).to_fp16()

In [10]:
# Learning rate for the language model: a base rate of 1e-2 scaled in
# proportion to the batch size, relative to a reference batch size of 48
# (heuristic that Jeremy found).
lr = 1e-2 * (bs/48)

In [11]:
# One epoch of one-cycle training at the full `lr`, run before unfreeze() is
# called in the next training cell.
learn_lm.fit_one_cycle(1, lr, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,4.479124,4.137096,0.284542,16:27


In [12]:
# Hardware for the recorded run: ASUS ROG STRIX SCAR II with NVIDIA RTX-2070
# TODO (open question): why does the processing time tend to increase with epoch number?
#      about 19 minutes per epoch for epochs 0-1, then 1:18:54 for epoch 2,
#      then about 1:44 per epoch from epoch 3 on (see the table below).
# Unfreeze the learner and train 10 more epochs at a tenth of the base lr.
learn_lm.unfreeze()
learn_lm.fit_one_cycle(10, lr/10, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,4.103189,3.933051,0.30643,18:43
1,4.02814,3.87275,0.315896,18:53
2,3.993248,3.838678,0.320412,1:18:54
3,3.935021,3.795683,0.32528,1:43:51
4,3.896384,3.763514,0.32902,1:43:55
5,3.838582,3.729117,0.332492,1:43:52
6,3.777858,3.702898,0.335487,1:44:10
7,3.761096,3.684962,0.337716,1:43:45
8,3.717375,3.674535,0.338906,1:43:45
9,3.721038,3.672052,0.339102,1:43:58


In [13]:
# Save the fine-tuned language model, and separately its encoder. The encoder
# is what the classifier section loads later via load_encoder('fine_tuned_enc_10').
learn_lm.save('fine_tuned_10')
learn_lm.save_encoder('fine_tuned_enc_10')

In [6]:
# Restore the weights saved as 'fine_tuned_10'. The call is the last
# expression in the cell, so the notebook echoes the learner's repr below.
learn_lm.load('fine_tuned_10')

LanguageLearner(data=TextLMDataBunch;

Train: LabelList (90000 items)
x: LMTextList
xxbos xxbos xxmaj once again xxmaj mr. xxmaj costner has dragged out a movie for far longer than necessary . xxmaj aside from the terrific sea rescue sequences , of which there are very few i just did not care about any of the characters . xxmaj most of us have ghosts in the closet , and xxmaj costner 's character are realized early on , and then forgotten until much later , by which time i did not care . xxmaj the character we should really care about is a very cocky , overconfident xxmaj ashton xxmaj kutcher . xxmaj the problem is he comes off as kid who thinks he 's better than anyone else around him and shows no signs of a cluttered closet . xxmaj his only obstacle appears to be winning over xxmaj costner . xxmaj finally when we are well past the half way point of this stinker , xxmaj costner tells us all about xxmaj kutcher 's ghosts . xxmaj we are told why xxmaj kutcher is driven to be the best wi

In [7]:
# Display the model architecture: a SequentialRNN made of an AWD_LSTM encoder
# followed by a LinearDecoder (see the printout below).
learn_lm.model

SequentialRNN(
  (0): AWD_LSTM(
    (encoder): Embedding(60000, 400, padding_idx=1)
    (encoder_dp): EmbeddingDropout(
      (emb): Embedding(60000, 400, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDropout(
        (module): LSTM(400, 1152, batch_first=True)
      )
      (1): WeightDropout(
        (module): LSTM(1152, 1152, batch_first=True)
      )
      (2): WeightDropout(
        (module): LSTM(1152, 400, batch_first=True)
      )
    )
    (input_dp): RNNDropout()
    (hidden_dps): ModuleList(
      (0): RNNDropout()
      (1): RNNDropout()
      (2): RNNDropout()
    )
  )
  (1): LinearDecoder(
    (decoder): Linear(in_features=400, out_features=60000, bias=True)
    (output_dp): RNNDropout()
  )
)

In [9]:
# Number of top-level modules in the model (2: the encoder and the decoder).
len(learn_lm.model)

2

In [10]:
# First module: the AWD_LSTM encoder (embedding layer plus three stacked LSTMs).
learn_lm.model[0]

AWD_LSTM(
  (encoder): Embedding(60000, 400, padding_idx=1)
  (encoder_dp): EmbeddingDropout(
    (emb): Embedding(60000, 400, padding_idx=1)
  )
  (rnns): ModuleList(
    (0): WeightDropout(
      (module): LSTM(400, 1152, batch_first=True)
    )
    (1): WeightDropout(
      (module): LSTM(1152, 1152, batch_first=True)
    )
    (2): WeightDropout(
      (module): LSTM(1152, 400, batch_first=True)
    )
  )
  (input_dp): RNNDropout()
  (hidden_dps): ModuleList(
    (0): RNNDropout()
    (1): RNNDropout()
    (2): RNNDropout()
  )
)

In [17]:
# Exploration only: list the attributes available on the encoder module.
# NOTE(review): produces a very long output; consider removing from the final notebook.
dir(learn_lm.model[0])

['__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__post_init__',
 '__pre_init__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_backend',
 '_backward_hooks',
 '_buffers',
 '_construct',
 '_forward_hooks',
 '_forward_pre_hooks',
 '_get_name',
 '_load_from_state_dict',
 '_load_state_dict_pre_hooks',
 '_modules',
 '_named_members',
 '_one_hidden',
 '_parameters',
 '_register_load_state_dict_pre_hook',
 '_register_state_dict_hook',
 '_save_to_state_dict',
 '_slow_forward',
 '_state_dict_hooks',
 '_tracing_name',
 '_version',
 'add_module',
 'apply',
 'bs',
 'buffers',
 'children',
 'cpu',
 'cuda',
 'double',
 'dump_patches',
 'emb_sz',
 'encoder',
 'encoder_d

In [21]:
# Same display as the earlier `learn_lm.model[0]` cell.
# NOTE(review): duplicate cell; it could be removed.
learn_lm.model[0]

AWD_LSTM(
  (encoder): Embedding(60000, 400, padding_idx=1)
  (encoder_dp): EmbeddingDropout(
    (emb): Embedding(60000, 400, padding_idx=1)
  )
  (rnns): ModuleList(
    (0): WeightDropout(
      (module): LSTM(400, 1152, batch_first=True)
    )
    (1): WeightDropout(
      (module): LSTM(1152, 1152, batch_first=True)
    )
    (2): WeightDropout(
      (module): LSTM(1152, 400, batch_first=True)
    )
  )
  (input_dp): RNNDropout()
  (hidden_dps): ModuleList(
    (0): RNNDropout()
    (1): RNNDropout()
    (2): RNNDropout()
  )
)

In [22]:
# Grab the encoder's token-embedding layer (shown as Embedding(60000, 400,
# padding_idx=1) in the model printout) for inspection in the next cells.
ee = learn_lm.model[0].encoder

In [24]:
# Exploration only: list the attributes available on the embedding layer.
# NOTE(review): produces a very long output; consider removing from the final notebook.
dir(ee)

['__call__',
 '__class__',
 '__constants__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_backend',
 '_backward_hooks',
 '_buffers',
 '_construct',
 '_forward_hooks',
 '_forward_pre_hooks',
 '_get_name',
 '_load_from_state_dict',
 '_load_state_dict_pre_hooks',
 '_modules',
 '_named_members',
 '_parameters',
 '_register_load_state_dict_pre_hook',
 '_register_state_dict_hook',
 '_save_to_state_dict',
 '_slow_forward',
 '_state_dict_hooks',
 '_tracing_name',
 '_version',
 'add_module',
 'apply',
 'buffers',
 'children',
 'cpu',
 'cuda',
 'double',
 'dump_patches',
 'embedding_dim',
 'eval',
 'extra_repr',
 'float',
 'forward',
 'from_pre

In [36]:
# Shape of the embedding matrix: 60000 rows (one per token index), 400 columns.
ee.weight.shape

torch.Size([60000, 400])

In [38]:
# Embedding vector (row) for token index 360; the vocab lookup in the next
# cell shows that index 360 is the token 'less'.
ee.weight[360,:]

tensor([-7.9590e-02,  5.3711e-01, -1.8945e-01, -8.2959e-01,  5.8154e-01,
        -5.7178e-01, -2.4612e-02, -4.5508e-01,  2.0691e-01, -3.1470e-01,
         5.6244e-02,  5.6738e-01,  2.9272e-01, -3.9917e-01,  9.7607e-01,
         3.0835e-01,  5.2881e-01,  9.0698e-02, -5.2521e-02,  8.4290e-02,
         4.0186e-01, -4.4580e-01,  4.2578e-01,  1.9238e-01,  5.9619e-01,
         6.9763e-02,  1.5015e-01,  1.9922e-01, -1.8994e-01, -4.4287e-01,
        -9.2896e-02, -3.6713e-02, -8.1116e-02, -7.0190e-02,  9.8022e-02,
        -2.3999e-01, -4.2871e-01, -1.5381e-01,  3.1055e-01,  1.9617e-01,
        -4.6326e-02,  1.2451e-01, -5.5573e-02,  6.6699e-01, -1.3733e-01,
        -2.5391e-01,  3.1592e-01, -1.2537e-01, -1.1487e-01, -3.9819e-01,
         4.5700e-03,  6.2109e-01, -5.6488e-02,  1.2610e-01,  1.1627e-01,
         2.5928e-01,  1.2262e-01,  4.7412e-01, -4.9927e-01,  4.1229e-02,
         1.6077e-01, -1.8213e-01, -3.7933e-02, -2.1362e-01, -1.2192e-02,
        -4.9219e-01,  2.0178e-01, -4.3164e-01,  2.4

In [30]:
# Look up which token string sits at index 360 of the vocabulary.
learn_lm.data.vocab.itos[360]

'less'

In [11]:
# Second module: the LinearDecoder, a Linear layer from 400 features to
# 60000 outputs (matching the 60000-row embedding) plus an output dropout.
learn_lm.model[1]

LinearDecoder(
  (decoder): Linear(in_features=400, out_features=60000, bias=True)
  (output_dp): RNNDropout()
)

## 2. IMDb Movie Review Classifier

#### Decrease batch size to 32 for the classifier

In [4]:
# Batch size for the classifier. This rebinds the `bs` used for the language
# model (48), so every later use of `bs` (databunch, load_data, lr scaling)
# sees 32. The final training cell notes that bs=48 runs out of GPU memory.
bs=32

#### Preprocess data into a databunch and save it

In [16]:
%%time

# Build the classifier DataBunch: reuse the language model's vocabulary,
# use the `test` folder as the validation set, and label by neg/pos folder.
# This step sometimes throws `BrokenProcessPool`, so it is retried until it
# works -- but only up to `max_failures` times and only for ordinary
# exceptions, so the cell can still be interrupted and a permanent error is
# surfaced instead of looping forever. Requires `data_lm`, `path` and `bs`.
max_failures = 100   # give up after this many failed attempts
count = 0            # number of failed attempts so far
error = True         # stays True until one attempt succeeds
while error:
    try:
        # Preprocessing steps
        data_clas = (TextList.from_folder(path, vocab=data_lm.vocab)
             .split_by_folder(valid='test')
             .label_from_folder(classes=['neg', 'pos'])
             .databunch(bs=bs, num_workers=1))
        error = False
        print(f'failure count is {count}\n')
    except Exception as exc:
        # `Exception`, not a bare `except:` -- KeyboardInterrupt/SystemExit
        # must not be swallowed by the retry loop.
        count += 1
        print(f'failure count is {count} ({type(exc).__name__}: {exc})')
        if count >= max_failures:
            raise  # out of retries: show the real traceback

failure count is 26

Wall time: 26min 25s


In [17]:
# Cache the classifier DataBunch under the name 'imdb_textlist_class' so the
# slow preprocessing above (about 26 minutes in the recorded run) does not
# have to be repeated; the next cell reloads it by that name.
data_clas.save('imdb_textlist_class')

In [12]:
# Reload the cached classifier DataBunch with the current batch size.
# Make sure `bs` holds the classifier value (32) before running this cell:
# the DataBunch is created with whatever `bs` is at this point.
data_clas = load_data(path, 'imdb_textlist_class', bs=bs, num_workers=1)

#### Step 1: Train with pretrained weights

In [14]:
# Build the classifier learner on the AWD_LSTM architecture, in fp16.
# note that drop_mult controls the amount of 5 different kinds of dropout
learn_c = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=1.).to_fp16()
# Reuse the encoder saved from the fine-tuned language model above.
learn_c.load_encoder('fine_tuned_enc_10')
# Freeze before the first training step; in fastai v1 `freeze()` leaves only
# the last layer group trainable.
learn_c.freeze()

In [9]:
# Learning rate for the classifier: a base rate of 2e-2 scaled in proportion
# to the batch size, relative to a reference batch size of 48.
lr = 2e-2 * (bs/48)

In [10]:
# Step 1: one epoch of one-cycle training at the full `lr`, with the learner
# frozen by the `learn_c.freeze()` call above.
learn_c.fit_one_cycle(1, lr, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.313081,0.193992,0.92672,01:43


In [11]:
# Checkpoint after step 1.
learn_c.save('1')

#### Step 2: Train the last two layer groups

In [12]:
# freeze_to(-2) freezes every layer group except the last two, so this step
# trains the last two layer groups (not just the last one).
learn_c.freeze_to(-2)
# slice(lo, hi): learning rates spread across the layer groups, from
# lr/(2.6**4) for the earliest group up to lr for the last.
learn_c.fit_one_cycle(1, slice(lr/(2.6**4),lr), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.262704,0.161128,0.94052,02:08


In [13]:
# Checkpoint after step 2.
learn_c.save('2nd')

#### Step 3: Train the last three layer groups

In [14]:
# freeze_to(-3) freezes every layer group except the last three, so this step
# trains the last three layer groups.
learn_c.freeze_to(-3)
# Same discriminative learning-rate range as the previous step, with both
# ends halved (lr/2/(2.6**4) up to lr/2).
learn_c.fit_one_cycle(1, slice(lr/2/(2.6**4),lr/2), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.223798,0.150961,0.94436,03:05


In [15]:
# Checkpoint after step 3; the next cell reloads it by this name.
learn_c.save('3rd')

In [16]:
# Restore the step-3 checkpoint. The call is the last expression in the cell,
# so the notebook echoes the learner's repr below.
learn_c.load('3rd')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (25000 items)
x: TextList
xxbos xxmaj story of a man who has unnatural feelings for a pig . xxmaj starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is turned into an insane , violent mob by the crazy xxunk of it 's singers . xxmaj unfortunately it stays absurd the xxup whole time with no general narrative eventually making it just too off putting . xxmaj even those from the era should be turned off . xxmaj the cryptic dialogue would make xxmaj shakespeare seem easy to a third grader . xxmaj on a technical level it 's better than you might think with some good cinematography by future great xxmaj vilmos xxmaj zsigmond . xxmaj future stars xxmaj sally xxmaj kirkland and xxmaj frederic xxmaj forrest can be seen briefly .,xxbos xxmaj airport ' 77 starts as a brand new luxury 747 plane is loaded up with valuable paintings & such belonging to rich businessman xxmaj philip xxmaj steven

#### Step 4: Unfreeze all layers and train, save the resulting classifier model

In [16]:
# Sanity check of the batch size currently in the kernel. The recorded output
# is 48, i.e. this run did NOT have the classifier value (32) in effect.
bs

48

In [17]:
# Unfreeze all the weights and train for 2 epochs with learning rates from
# lr/10/(2.6**4) up to lr/10.
# Throws a CUDA out-of-memory error with batch size 48; reducing the batch size to 32 works.
# NOTE: the recorded run below hit that error because `bs` was still 48 in the
# kernel (see the previous cell's output). Set bs=32 and reload `data_clas`
# and `learn_c` before running this cell.
learn_c.unfreeze()
learn_c.fit_one_cycle(2, slice(lr/10/(2.6**4),lr/10), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time


RuntimeError: CUDA out of memory. Tried to allocate 102.00 MiB (GPU 0; 8.00 GiB total capacity; 5.58 GiB already allocated; 74.97 MiB free; 313.38 MiB cached)

In [None]:
# Save the final classifier. Not executed in the recorded run (the training
# cell above failed), so no 'clas' checkpoint exists from that session.
learn_c.save('clas')