# Experiment4: Trump Tweet Generator

The goal of this notebook is to use the embeddings generated in Lesson 4 to build a Trump Tweet generator.

Plan:

1. Prepare dataset.
2. Load and test embeddings.
3. Finetune language model on Trump tweets.
4. Test.

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import json
import re
import html
import random

from pathlib import Path

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.fastai.rnn_reg import *
from fastai.fastai.rnn_train import *
from fastai.fastai.nlp import *
from fastai.fastai.lm_rnn import *

import dill

## 1. Prepare dataset

In [3]:
PATH = Path('data/trump')

In [3]:
PATH.mkdir(exist_ok=True)

In [4]:
!git clone git@github.com:bpb27/trump_tweet_data_archive.git {PATH}

Cloning into 'data/trump'...
remote: Counting objects: 36441, done.[K
remote: Compressing objects: 100% (83/83), done.[K
remote: Total 36441 (delta 28), reused 57 (delta 1), pack-reused 36357[K
Receiving objects: 100% (36441/36441), 3.18 GiB | 71.81 MiB/s, done.
Resolving deltas: 100% (10162/10162), done.
Checking connectivity... done.


In [4]:
for file in PATH.iterdir():
    if not file.name.endswith('zip'):
        continue

    !unzip -q -d{PATH} -o {file}

In [5]:
trump_2018 = json.load(open(PATH / 'master_2018.json'))

In [6]:
type(trump_2018)

list

In [7]:
trump_2018[3]['full_text']

'HAPPY NEW YEAR! We are MAKING AMERICA GREAT AGAIN, and much faster than anyone thought possible!'

Iterate through each master file and save text into train and validation sets.

In [8]:
TRN = 'train'
VAL = 'val'

TRN_PATH = PATH / TRN
VAL_PATH = PATH / VAL

TRN_PATH.mkdir(exist_ok=True)
VAL_PATH.mkdir(exist_ok=True)

In [11]:
random.seed(42)

for file in PATH.iterdir():
    if not (
        file.name.endswith('json') and
        file.name.startswith('master')
        
    ):
        continue
        
    tweets = json.load(open(file))
    
    for t in tweets:
        # Ignore replies
        if t.get('in_reply_to_status_id'):
            continue
            
        # Ignore retweets
        if t['retweeted']:
            continue
    
        if t.get('full_text'):
            text = t['full_text']
        else:
            text = t['text']
            
        text = text.strip()
            
        if text.startswith('RT'):
            continue
            
        # Strip urls
        text = ' '.join(
            [w for w in text.split(' ') if not w.startswith('http')])
            
        # Unescape html entites
        text = html.unescape(text)
        
        set_name = 'val' if random.random() > 0.8 else 'train'
        
        with open(PATH / set_name / f"{t['id']}.txt", 'w') as fh:
            fh.write(text)

In [12]:
!cat {PATH}/train/10027087487.txt

From Donald Trump: Andrea Bocelli @ Mar-a-Lago - Many say best night of entertainment in long history of Palm Beach

## 2. Create word embeddings on IMDB data

In [7]:
PATH='data/aclImdb/'

TRN_PATH = 'train/all/'
VAL_PATH = 'test/all/'
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

%ls {PATH}

imdbEr.txt  imdb.vocab  [0m[01;34mmodels[0m/  README  [01;34mtest[0m/  [01;34mtmp[0m/  [01;34mtrain[0m/


In [10]:
TEXT = data.Field(lower=True, tokenize="spacy")

In [11]:
bs=64; bptt=70

In [12]:
%%time
imdb_model_data = LanguageModelData.from_text_files(
    PATH, TEXT,
    train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH,
    bs=bs, bptt=bptt, min_freq=10)

CPU times: user 3min 56s, sys: 3.5 s, total: 3min 59s
Wall time: 3min 59s


In [13]:
dill.dump(TEXT, open(f'{PATH}models/TEXT.pkl','wb'))

In [14]:
em_sz = 200  # size of each embedding vector
nh = 500     # number of hidden activations per layer
nl = 3       # number of layers

opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

In [15]:
learner = imdb_model_data.get_model(opt_fn, em_sz, nh, nl,
               dropouti=0.05, dropout=0.05, wdrop=0.1, dropoute=0.02, dropouth=0.05)
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
learner.clip=0.3

In [16]:
learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=2)

HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      4.823443   4.711968  
    1      4.642202   4.519935                                
    2      4.530305   4.438553                                
    3      4.588088   4.467611                                
    4      4.502331   4.395613                                
    5      4.417197   4.338107                                
    6      4.393524   4.321291                                
    7      4.522829   4.414174                                
    8      4.50086    4.389934                                
    9      4.458413   4.356636                                
    10     4.422713   4.326128                                
    11     4.38166    4.296248                                
    12     4.350391   4.272144                                
    13     4.310256   4.258536                                
    14     4.337864   4.255822                                



[array([4.25582])]

In [17]:
learner.save_encoder('adam1_enc')

In [18]:
learner.load_encoder('adam1_enc')

In [20]:
learner.save('adam3_model')

In [21]:
learner.fit(3e-3, 1, wds=1e-6, cycle_len=10, cycle_save_name='adam3_20')

HBox(children=(IntProgress(value=0, description='Epoch', max=10), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      4.472974   4.37178   
    1      4.462487   4.359999                                
    2      4.441079   4.339152                                
 92%|█████████▏| 4223/4583 [19:42<01:40,  3.57it/s, loss=4.42]    3      4.411868   4.319829  
    4      4.382664   4.294869                                
    5      4.346344   4.270796                                
    6      4.313401   4.250776                                
    7      4.324756   4.237291                                
    8      4.263578   4.227346                                
    9      4.270439   4.225821                                



[array([4.22582])]

In [22]:
learner.save_encoder('adam3_20_enc')

In [24]:
learner.fit(0.0015, 1, wds=1e-6, cycle_len=10, cycle_save_name='adam4_10')

HBox(children=(IntProgress(value=0, description='Epoch', max=10), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      4.380983   4.284288  
    1      4.354751   4.273869                                
    2      4.356179   4.266137                                
    3      4.329464   4.255518                                
    4      4.315512   4.244637                                
    5      4.297495   4.234239                                
    6      4.270093   4.222948                                
    7      4.255681   4.216224                                
    8      4.250174   4.213073                                
    9      4.236307   4.212431                                



[array([4.21243])]

In [25]:
learner.save_encoder('adam4_10_enc')

In [26]:
learner.fit(0.0015, 4, wds=1e-6, cycle_len=1, cycle_mult=2)

HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      4.310396   4.225647  
    1      4.347734   4.244958                                
    2      4.275885   4.217382                                
    3      4.363409   4.264354                                
    4      4.316585   4.239199                                
    5      4.287024   4.216901                                
    6      4.248019   4.210321                                
    7      4.363109   4.266071                                
    8      4.357974   4.260314                                
    9      4.323174   4.245379                                
    10     4.29578    4.232848                                
 94%|█████████▍| 4310/4583 [20:25<01:17,  3.52it/s, loss=4.29]    11     4.280968   4.218835  
    12     4.260347   4.209569                                
    13     4.232872   4.203747                                
    14     4.232934   4.202884                       

[array([4.20288])]

In [27]:
learner.save_encoder('adam5_enc')

In [28]:
dill.dump(TEXT, open(f'{PATH}models/TEXT-adam5.pkl','wb'))

## 3. Fine-tune language model on Trump set

In [2]:
PATH = Path('data/trump')

In [3]:
TEXT = dill.load(open(f'./data/aclImdb/models/TEXT-adam5.pkl', 'rb'))     

In [4]:
TRN = 'train'
VAL = 'val'
bs=64
bptt=70
em_sz = 200  # size of each embedding vector
nh = 500     # number of hidden activations per layer
nl = 3       # number of layers

In [5]:
trump_model_data = LanguageModelData.from_text_files(
    PATH, TEXT,
    train=TRN, validation=VAL, test=VAL,
    bs=bs, bptt=bptt, min_freq=10)

In [6]:
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

In [8]:
m3 = trump_model_data.get_model(
    opt_fn, em_sz, nh, nl,
    dropouti=0.05, dropout=0.05, wdrop=0.1, dropoute=0.02, dropouth=0.05)

In [11]:
m3.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
m3.load_encoder(f'adam5_enc')

In [12]:
lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])

In [13]:
m3.freeze_to(-1)

In [15]:
m3.fit(3e-3, 1, wds=1e-6, cycle_len=10)

HBox(children=(IntProgress(value=0, description='Epoch', max=10), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      4.684523   4.197061  
    1      4.047135   3.914536                              
    2      3.748342   3.812412                              
    3      3.560045   3.747139                              
    4      3.401466   3.721526                              
    5      3.258891   3.705311                              
    6      3.164892   3.702058                              
    7      3.109769   3.702141                              
    8      3.052933   3.698228                              
    9      3.040893   3.700763                              



[array([3.70076])]

In [16]:
model = m3.model

In [27]:
ss = "am"

In [28]:
s = [TEXT.tokenize(ss)]
t = TEXT.numericalize(s)
' '.join(s[0])

'am'

In [29]:
model[0].bs = 1
model.eval()
model.reset()
res, *_ = model(t)
model[0].bs = bs

In [30]:
next_words = torch.topk(res[-1], 10)[1]
[TEXT.vocab.itos[o] for o in to_np(next_words)]

[',', '.', 'in', 'at', 'to', '<unk>', 'i', '!', 'a', 'and']

In [62]:
print(ss,"\n")
for i in range(100):
    n=res[-1].topk(10)[1]
    
    n = n[random.randint(0, 9)] if n.data[0]==0 else n[0]

    print(TEXT.vocab.itos[n.data[0]], end=' ')
    res,*_ = model(n[0].unsqueeze(0))
print('...')

am 

champion <eos> " a good night for the american people . " # obama <eos> " i 'm a big fan of all of the other candidates and i have a lot of common sense . " donald trump " <eos> " you have to love what you do . " -- think like a champion <eos> " <unk> : # new york city , trump national hotel & tower , new york , is the best hotel in the world ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ...


In [36]:
res[-1].topk(2)[0].data[0]

10.741398811340332

In [40]:
n.data[0]

46

In [53]:
n.data[0]

38

In [59]:
res[-1].topk(10)[0]

Variable containing:
 10.1666
  8.7362
  8.1153
  7.8387
  7.3282
  7.0713
  6.8716
  6.7939
  6.5435
  6.4551
[torch.cuda.FloatTensor of size 10 (GPU 0)]