In [None]:
import os
import random
import time

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

import torch
from torch import nn
from torch.nn import functional as F
from torch.nn import CrossEntropyLoss
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, random_split
torch.manual_seed(42)

from transformers import pipelines
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup

from pymongo import MongoClient

from tqdm.notebook import tqdm_notebook as tqdm

import nltk
nltk.download('punkt')

import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

In [None]:
# client = MongoClient()

In [None]:
# db = client['reviews']

In [None]:
# collection = db['reviews']

# BOOKMARK: get my own dataset for training

- Must make my own `reviews` df

In [None]:
def download_trump_tweets(fpath='trump_tweets.csv',append_date=True,
                          verbose=True,return_data=True):
    """Download the Trump Twitter Archive v2 export and optionally load it.

    The file is fetched from a fixed Google Drive link:
    https://drive.google.com/uc?export=download&id=1JZnhB0Nq_2RKtDb-IOnd0XxnD5c7x-nQ

    Args:
        fpath (str): Destination path for the downloaded file (normally ends
            with .csv).
        append_date (bool): If True, insert today's date (``_MM-DD-YY``)
            between the file name and its extension (Default=True).
        verbose (bool): Whether to print the saved file name (Default=True).
        return_data (bool): Whether to read the saved file back and return it
            as a DataFrame (Default=True).

    Returns:
        pandas.DataFrame or None: The saved CSV parsed with the first column
        as index and the ``date`` column parsed as datetimes when
        ``return_data`` is True, otherwise None.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never written to disk as if it were tweet data.
    """
    # Imports are kept local so the function can be copied around on its own.
    import datetime as dt
    import os
    import requests
    import pandas as pd

    url = "https://drive.google.com/uc?export=download&id=1JZnhB0Nq_2RKtDb-IOnd0XxnD5c7x-nQ"

    # A timeout prevents the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    if append_date:
        suffix = "_" + dt.date.today().strftime('%m-%d-%y')
        # splitext only splits off the final extension, so dots in directory
        # names or in the file stem (e.g. "./data/v1.2/tweets.csv") are kept.
        root, ext = os.path.splitext(fpath)
        filepath = f"{root}{suffix}{ext}"
    else:
        filepath = fpath

    ## Save the raw response bytes to disk
    with open(filepath, 'wb') as file:
        file.write(response.content)

    if verbose:
        print('[i] Tweet data successfully downloaded and saved as:')
        print('- ',filepath)

    if return_data:
        return pd.read_csv(filepath, index_col=0, parse_dates=['date'])
    return None


df = download_trump_tweets()

In [None]:
df

In [None]:
# collection = df['text']

In [None]:
# test = []
# ind = []

# reviews = collection.find()

# for index, review in enumerate(reviews[50000:75000]):
#     try:
#         test.append('{0}'.format(review['reviewText']))
#     except KeyError:
#         ind.append(index)
#         pass

In [None]:
# review_data = pd.DataFrame(data=test, columns=(['reviews']))
# review_data = df.copy()

In [None]:
# review_data

In [None]:
# review_data.isna().sum()

In [None]:
# reviews = review_data.copy()

In [None]:
## JUST TRUMP
# Keep only the rows whose `isRetweet` column equals the string 'f'.
# NOTE(review): presumably the archive encodes booleans as 't'/'f', so this
# drops retweets and keeps original tweets — confirm against the CSV.
df = df[df['isRetweet'] == 'f']

In [None]:
# reviews = reviews['reviews']
reviews = df['text']


In [None]:
# The NLTK tokenizer gets its own name so it cannot be confused with (or
# silently replaced by) the GPT-2 `tokenizer` that is created further down.
# `nltk` itself is already imported in the notebook's import cell.
tweet_tokenizer = nltk.TweetTokenizer()

# Number of TweetTokenizer tokens in each tweet; only used for the
# descriptive length statistics in the following cells.
reviewlen = np.array([len(tweet_tokenizer.tokenize(review)) for review in tqdm(reviews)])

# Distribution of tweet lengths (in tokens).
sns.distplot(reviewlen)

In [None]:
# Percentage of tweets whose token count in `reviewlen` exceeds 768.
# NOTE(review): `reviewlen` counts NLTK tokens, not GPT-2 BPE tokens, and 768
# looks like GPT-2's hidden size rather than its context length — confirm
# which limit was intended.
len(reviewlen[reviewlen > 768])/len(reviewlen)*100

In [None]:
print('Average review length: {} words.'.format(round(np.average(reviewlen), 3)))

In [None]:
print('Max review length: {} words.'.format(np.max(reviewlen)))

In [None]:
# Load the pretrained 'gpt2' tokenizer and register custom start-of-text,
# end-of-text and padding special tokens.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
                                          bos_token='<|sot|>', eos_token='<|eot|>', pad_token='<|pad|>')

In [None]:
# Report the tokenizer's maximum sequence length and, for each special token,
# its string form together with its vocabulary id.
print(f"Max model length is {tokenizer.model_max_length} for this model")
print(f"Beginning of sentence token {tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id)} token has the id {tokenizer.bos_token_id}")
print(f"End of sentence token {tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id)} has the id {tokenizer.eos_token_id}")
print(f"Padding token {tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id)} has the id {tokenizer.pad_token_id}")

In [None]:
class GPT_Finetune_Dataset(Dataset):
    """Pre-tokenised text dataset for GPT-2 fine-tuning.

    Every text is wrapped in the literal markers '<|sot|>' and '<|eot|>',
    passed through ``tokenizer`` with truncation and padding to
    ``max_length``, and stored as two tensors: token ids and attention mask.

    Args:
        txt_list: Iterable of strings to encode.
        tokenizer: Callable returning a mapping with 'input_ids' and
            'attention_mask' entries for a single text.
        gpt2_type: Accepted for interface compatibility; not used here.
        max_length: Length every encoded sequence is truncated/padded to.
    """

    def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=200):
        self.tokenizer = tokenizer
        self.input = []   # one tensor of token ids per text
        self.attn = []    # matching attention-mask tensors

        for raw_text in tqdm(txt_list):
            wrapped_text = '<|sot|>' + raw_text + '<|eot|>'
            encoded = tokenizer(
                wrapped_text,
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )

            token_ids = torch.tensor(encoded['input_ids'])
            self.input.append(token_ids)

            attention_mask = torch.tensor(encoded['attention_mask'])
            self.attn.append(attention_mask)

    def __len__(self):
        """Number of encoded texts."""
        return len(self.input)

    def __getitem__(self, idx):
        """Return the (token ids, attention mask) pair stored at ``idx``."""
        return self.input[idx], self.attn[idx]

In [None]:
data = GPT_Finetune_Dataset(reviews, tokenizer)

In [None]:
# 70% of the samples go to training; the test set takes the remainder so the
# two sizes always add up to len(data).
train_size = int(len(data) * .7)
test_size = len(data) - train_size

# Randomly partition the dataset into the two subsets.
train_set, test_set = random_split(data, [train_size, test_size])

In [None]:
print('{} training samples'.format(train_size))
print('{} test samples'.format(test_size))

In [None]:
# Small batches keep memory usage low.
batch_size = 3

# Training batches are drawn in random order.
train_dataloader = DataLoader(
    train_set,
    batch_size=batch_size,
    sampler=RandomSampler(train_set),
)

# Evaluation batches are read in dataset order.
test_dataloader = DataLoader(
    test_set,
    batch_size=batch_size,
    sampler=SequentialSampler(test_set),
)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
# Seed every RNG before the model is built so that the randomly initialised
# embedding rows added for the custom tokens are reproducible as well.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Get config
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# Model instantiation
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# Necessary because of the custom tokens: the embedding matrix must have one
# row per tokenizer entry, including the added special tokens.
model.resize_token_embeddings(len(tokenizer))

# Pick the device once and move the model to that same device. Previously
# `device` was forced to CPU while the model was moved with `.cuda()`, which
# crashes on machines without a GPU and otherwise leaves `device` and the
# model disagreeing about where tensors live.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [None]:
# Setting Parameters
# epochs: number of full passes over the training dataloader.
epochs = 5
# NOTE(review): 1e-3 is unusually high for fine-tuning a pretrained model; the
# commented-out 5e-5 is a more typical value — confirm this is intentional.
learning_rate = .001#.00005
# warmup_steps: number of scheduler steps spent warming up the learning rate.
warmup_steps = 50

# this produces sample output every 100 steps
sample_every = 100

In [None]:
# AdamW optimizer over all model parameters, using `learning_rate` as the base
# learning rate. (The learning-rate schedule is handled by a separate scheduler,
# not by the optimizer.)
# NOTE(review): this `AdamW` is the one imported from `transformers`, which is
# deprecated in favour of `torch.optim.AdamW` — confirm against the installed version.
optimizer = AdamW(model.parameters(),
                  lr = learning_rate
                )

In [None]:
# One scheduler step is taken per training batch, in every epoch.
total_steps = epochs * len(train_dataloader)

# Learning-rate schedule with `warmup_steps` warm-up steps out of
# `total_steps` training steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

In [None]:
# Per-epoch metrics, one dict per epoch (turned into a DataFrame afterwards).
stats = []

# Make sure the model lives on the same device the batches are sent to.
model = model.to(device)

for epoch_i in range(epochs):

    # ---------------------------- training loop ----------------------------
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))

    timestat = time.time()   # start of this epoch's training phase

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        # Language-model labels are the inputs themselves, but padding
        # positions are set to -100 so the loss ignores them; otherwise most
        # of the loss is spent on predicting '<|pad|>'.
        b_labels = b_input_ids.masked_fill(b_masks == 0, -100)

        model.zero_grad()

        outputs = model(b_input_ids,
                        labels=b_labels,
                        attention_mask=b_masks,
                        token_type_ids=None)

        loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss

        loss.backward()

        optimizer.step()

        scheduler.step()

        # Print progress and one generated sample every `sample_every` batches.
        # Done after the optimizer step so the forward graph is already freed.
        if step % sample_every == 0 and not step == 0:

            elapsed = time.time() - timestat
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

            model.eval()

            # No prompt is given: generation starts from a randomly chosen
            # token id so that successive samples begin differently. The
            # tokenizer's custom eos/pad ids are passed explicitly because the
            # model config does not know about the added special tokens.
            sample_outputs = model.generate(
                                    bos_token_id=random.randint(1,30000),
                                    eos_token_id=tokenizer.eos_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    do_sample=True,
                                    top_k=50,
                                    max_length = 200,
                                    top_p=0.95,
                                    num_return_sequences=1
                                )
            for i, sample_output in enumerate(sample_outputs):
                print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

            model.train()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch's training phase took.
    training_time = time.time() - timestat

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ---------------------------- testing loop -----------------------------
    print("Running Validation...")

    t0 = time.time()   # start of the validation phase

    model.eval()

    total_eval_loss = 0

    # Evaluate data for one epoch
    for batch in test_dataloader:

        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        # Same padding-aware labels as in training so both losses are comparable.
        b_labels = b_input_ids.masked_fill(b_masks == 0, -100)

        with torch.no_grad():

            outputs = model(b_input_ids,
                            attention_mask=b_masks,
                            labels=b_labels)

            loss = outputs[0]

        total_eval_loss += loss.item()

    avg_val_loss = total_eval_loss / len(test_dataloader)

    # Measured from `t0`, not from the epoch start, so the training time is
    # not counted a second time.
    validation_time = time.time() - t0

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

In [None]:
# Per-epoch metrics table built from `stats`, indexed by epoch number.
training_df = pd.DataFrame(stats).set_index('epoch')

training_df

In [None]:
# Training vs. testing loss per epoch.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)
plt.plot(training_df['Training Loss'], 'r-o', label="Training")
plt.plot(training_df['Valid. Loss'], 'b-o', label="Testing")
plt.title("Training & Testing Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
# One tick per recorded epoch; a hard-coded list goes stale whenever the
# number of epochs changes.
plt.xticks(list(training_df.index))
plt.show()

In [None]:
params = list(model.named_parameters())

In [None]:
# for param in params:
#     print(param[0])

In [None]:
# Directory the fine-tuned model and tokenizer are written to.
output_dir = './model_save/'

# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of testing for the directory first.
os.makedirs(output_dir, exist_ok=True)

In [None]:
print("Saving model to {}".format(output_dir))

In [None]:
# If the model is wrapped in a container that exposes the real model as
# `.module`, save the wrapped model; otherwise save the model itself.
# NOTE(review): presumably this guards against nn.DataParallel-style wrappers — confirm.
model_to_save = model.module if hasattr(model, 'module') else model
# Write the model and the tokenizer (including the added special tokens) to
# the same directory.
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

## BOOKMARK: MAKING PROMPTS 

In [None]:
# Build generation prompts from the first 15 tweets: the start-of-text marker
# followed by the tweet with all whitespace runs collapsed to single spaces.
prompts = []
for review in reviews[:15]:
    # Skip missing / non-text entries explicitly instead of hiding every
    # possible error (including KeyboardInterrupt) behind a bare `except`.
    if not isinstance(review, str):
        continue
    prompts.append("<|sot|> " + ' '.join(review.split()))

In [None]:
prompts

In [None]:
# Switch to inference mode before generating.
model.eval()

# Encode each prompt into a tensor of token ids with a leading batch
# dimension, placed on `device`.
gen_prompt = [
    torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(device)
    for prompt in prompts
]

In [None]:
# Generate five sampled continuations for every encoded prompt.
for prompt_text, promp in zip(prompts, gen_prompt):
    print('---------------------------------')
    print('''The prompt is "{}"'''.format(prompt_text))
    print('---------------------------------')

    # `min_length` / `max_length` count the prompt tokens too, so they are
    # expressed relative to the prompt length; a fixed max_length of 40 leaves
    # no room to generate once a prompt is 40 tokens or longer.
    prompt_len = promp.shape[-1]

    sample_outputs = model.generate(
                                    promp,
                                    # The prompt has no padding, so every position is attended to.
                                    attention_mask=torch.ones_like(promp),
                                    # The model config does not know the custom special tokens, so
                                    # pass the tokenizer's ids for generation to stop at '<|eot|>'.
                                    # (A random bos_token_id is not passed: it is meaningless when a
                                    # prompt is supplied and could exceed the vocabulary size.)
                                    eos_token_id=tokenizer.eos_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    do_sample=True,
                                    top_k=30,
                                    min_length=prompt_len + 20,
                                    max_length=prompt_len + 40,
                                    top_p=0.95,
                                    num_return_sequences=5
                                    )

    for i, sample_output in enumerate(sample_outputs):
        print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))