In [3]:
import re
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ! pip install transformers==2.11.0
! pip install transformers
from transformers import pipeline
# from transformers import AutoTokenizer, AutoModelForCausalLM

!pip install wandb -qqq
import wandb



In [2]:
! python -m pip install --upgrade pip

Collecting pip
  Downloading pip-22.3.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.3
    Uninstalling pip-22.3:
      Successfully uninstalled pip-22.3
[0mSuccessfully installed pip-22.3.1


## Get Quotes data from Github and load as Pandas Df

In [None]:
! curl https://raw.githubusercontent.com/krohak/QuoteGen/master/data/quotes_all.csv > quotes_all.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 11.9M  100 11.9M    0     0  21.9M      0 --:--:-- --:--:-- --:--:-- 21.9M


In [None]:
data = pd.read_csv('quotes_all.csv',delimiter=';')
data.head()

Unnamed: 0,Quote,Author,Topic
0,Age is an issue of mind over matter. If you do...,Mark Twain,age
1,"Anyone who stops learning is old, whether at t...",Henry Ford,age
2,Wrinkles should merely indicate where smiles h...,Mark Twain,age
3,True terror is to wake up one morning and disc...,Kurt Vonnegut,age
4,A diplomat is a man who always remembers a wom...,Robert Frost,age


In [None]:
data.shape

(75966, 3)

In [None]:
strip_special_chars = re.compile("[^A-Za-z0-9., ]+")

def cleanSentences(string):
    string = string.replace("\n", " ")
    sentence = re.sub(strip_special_chars, "", string)
    return sentence #re.sub(r"(\w)([.,])", r"\1 \2", sentence)

sentence = '''I will never be an old man-$. 
To me, old age is always 15 years older than I am.'''
print('Original Sentence: %s'%sentence)
print('Cleaned Sentence: %s'%cleanSentences(sentence))

Original Sentence: I will never be an old man-$. 
To me, old age is always 15 years older than I am.
Cleaned Sentence: I will never be an old man.  To me, old age is always 15 years older than I am.


In [None]:
data['CleanedQuotes'] = data['Quote'].apply(cleanSentences)

In [None]:
data.head()

Unnamed: 0,Quote,Author,Topic,CleanedQuotes
0,Age is an issue of mind over matter. If you do...,Mark Twain,age,Age is an issue of mind over matter. If you do...
1,"Anyone who stops learning is old, whether at t...",Henry Ford,age,"Anyone who stops learning is old, whether at t..."
2,Wrinkles should merely indicate where smiles h...,Mark Twain,age,Wrinkles should merely indicate where smiles h...
3,True terror is to wake up one morning and disc...,Kurt Vonnegut,age,True terror is to wake up one morning and disc...
4,A diplomat is a man who always remembers a wom...,Robert Frost,age,A diplomat is a man who always remembers a wom...


In [None]:
data.shape

(75966, 4)

In [None]:
data['CleanedQuotes'][ data['CleanedQuotes'].str.len() == data['CleanedQuotes'].str.len().max() ].iloc[0]

'Furthermore, we believe that health care reform, again I said at the beginning of my remarks, that we sent the three pillars that the Presidents economic stabilization and job creation initiatives were education and innovation  innovation begins in the classroom  clean energy and climate, addressing the climate issues in an innovative way to keep us number one and competitive in the world with the new technology, and the third, first among equals I may say, is health care, health insurance reform.'

In [None]:
cleanData = data['CleanedQuotes'].drop_duplicates()

In [None]:
cleanData.shape

(61067,)

In [None]:
from sklearn.model_selection import train_test_split

train_test_ratio = 0.9
train_valid_ratio = 7/9

df_full_train, df_test = train_test_split(cleanData, train_size = train_test_ratio, random_state = 1)
df_train, df_valid = train_test_split(df_full_train, train_size = train_valid_ratio, random_state = 1)

In [None]:
df_train

13321    Some people are so afraid do die that they nev...
36243    Peace Train is a song I wrote, the message of ...
33810    To get rich never risk your health. For it is ...
29569             Active Evil is better than Passive Good.
19674    Our faith is stronger than death, our philosop...
                               ...                        
71312    For truth has such a face and such a mien, as ...
33848            A pint of sweat, saves a gallon of blood.
7407     Family farms and small businesses are the back...
24325    Freedom means you are unobstructed in living y...
56591      Nothing external to you has any power over you.
Name: CleanedQuotes, Length: 42746, dtype: object

In [None]:
def generate_text_file(contents, filename):
  with open(filename, 'w', encoding='utf-8') as fhead:
    fhead.write("\n".join(contents.tolist()))

generate_text_file(df_train, 'train.txt')
generate_text_file(df_valid, 'val.txt')
generate_text_file(df_test, 'test.txt')

## Load Hugging Face pretrained model and fine tune

In [None]:
tokenizer = AutoTokenizer.from_pretrained("nandinib1999/quote-generator")
model = AutoModelForCausalLM.from_pretrained("nandinib1999/quote-generator")

In [None]:
tokenizer = AutoTokenizer.from_pretrained("ml6team/gpt-2-medium-conditional-quote-generator")
model = AutoModelForCausalLM.from_pretrained("ml6team/gpt-2-medium-conditional-quote-generator")

Downloading:   0%|          | 0.00/527 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/828 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/379 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

In [None]:
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

In [None]:
generator(
    "Given Topics: anger. Related Quote: This is not okay"
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Given Topics: anger. Related Quote: This is not okay to be angry about. but this is not okay to not be angry about.'}]

In [None]:
wandb.login()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
! curl https://raw.githubusercontent.com/nandinib1999/gpt2_quotes_generation/main/run_language_modeling.py > run_language_modeling.py

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 10563  100 10563    0     0  74387      0 --:--:-- --:--:-- --:--:-- 74387


In [None]:
! rm -rf "/content/model/checkpoint-21000"

In [None]:
! git lfs install
! git clone https://huggingface.co/nandinib1999/quote-generator

Error: Failed to call git rev-parse --git-dir --show-toplevel: "fatal: not a git repository (or any of the parent directories): .git\n"
Git LFS initialized.
Cloning into 'quote-generator'...
remote: Enumerating objects: 29, done.[K
remote: Counting objects: 100% (29/29), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 29 (delta 8), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (29/29), done.
Filtering content: 100% (3/3), 961.49 MiB | 66.88 MiB/s, done.


In [None]:
!python run_language_modeling.py \
--output_dir='/content/model' \
--overwrite_output_dir \
--model_type=gpt2 \
--model_name_or_path='quote-generator/' \
--tokenizer_name=gpt2 \
--do_train \
--train_data_file='train.txt' \
--do_eval \
--eval_data_file='val.txt' \
--per_device_train_batch_size=2 \
--per_device_eval_batch_size=2 \
--line_by_line \
--evaluate_during_training \
--learning_rate 5e-5 \
--num_train_epochs 1 \
--save_total_limit 1

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Evaluation:  23% 1384/6107 [00:19<01:05, 72.19it/s][A[A

Evaluation:  23% 1392/6107 [00:19<01:06, 71.31it/s][A[A

Evaluation:  23% 1400/6107 [00:19<01:05, 72.03it/s][A[A

Evaluation:  23% 1408/6107 [00:20<01:04, 72.31it/s][A[A

Evaluation:  23% 1416/6107 [00:20<01:06, 71.07it/s][A[A

Evaluation:  23% 1424/6107 [00:20<01:07, 69.49it/s][A[A

Evaluation:  23% 1432/6107 [00:20<01:06, 70.47it/s][A[A

Evaluation:  24% 1440/6107 [00:20<01:09, 67.41it/s][A[A

Evaluation:  24% 1448/6107 [00:20<01:07, 69.08it/s][A[A

Evaluation:  24% 1456/6107 [00:20<01:05, 70.61it/s][A[A

Evaluation:  24% 1464/6107 [00:20<01:08, 67.93it/s][A[A

Evaluation:  24% 1472/6107 [00:21<01:06, 69.90it/s][A[A

Evaluation:  24% 1480/6107 [00:21<01:06, 69.14it/s][A[A

Evaluation:  24% 1488/6107 [00:21<01:05, 70.02it/s][A[A

Evaluation:  24% 1496/6107 [00:21<01:05, 69.98it/s][A[A

Evaluation:  25% 1504/6107 [00:21<01:06, 68.86it/

In [None]:
! pip install huggingface_hub
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('/content/model')
model = GPT2LMHeadModel.from_pretrained('/content/model')

In [None]:
model.push_to_hub("QuoteGen")

CommitInfo(commit_url='https://huggingface.co/krohak/QuoteGen/commit/fd3cdb56cf426a9459ed278747c927b26d6f4c8d', commit_message='Upload model', commit_description='', oid='fd3cdb56cf426a9459ed278747c927b26d6f4c8d', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
tokenizer.push_to_hub("QuoteGen")

CommitInfo(commit_url='https://huggingface.co/krohak/QuoteGen/commit/0d3a2acd289a41bcfcd32c8c3cf1f9a6dc6ee57a', commit_message='Upload tokenizer', commit_description='', oid='0d3a2acd289a41bcfcd32c8c3cf1f9a6dc6ee57a', pr_url=None, pr_revision=None, pr_num=None)

In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('krohak/QuoteGen')
model = GPT2LMHeadModel.from_pretrained('krohak/QuoteGen')

Downloading:   0%|          | 0.00/999k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/107 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/794 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/917 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/510M [00:00<?, ?B/s]

In [4]:
from transformers import pipeline

In [5]:
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
generator(
    "Once upon a time"
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Once upon a time the government of the people of America has said that the time for national defense lies in the next two years as well as today. The political and economic consequences of that have already become too great for any of us to deal with.'}]

Bad pipe message: %s [b'Z\x11\x9f\xe79\xee\x18\x19\x0fi\x9dM\xfeT]\xf8\x18= \x1d\xdf\x85m\x01\x95\xdc\x1b\x0e\x80z,\xbc-\x7f\x1ay\xb8\xc3\xe8\xdaHs}\xea9\x17\xc9\xd6\xa9\xe5\xf2\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x00+\x00\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 gm\xaa\xd9\x02']
Bad pipe message: %s [b'\x11\x813\xdf\xbczg\xed\xeeU\xd5\x04z\xa2\xba,\x83\x18\x85\xb8\xda']
Bad pipe message: %s [b'\xdaG\xac\x95\xa2\x0b\xfcB\xf3\x8a\r\x01$\xe2\\\xb8z\t \x88\x0e\xe6\xf3\x02$\x08\x1d\xfbT\x8eE\xc4\x8fb\x15_&\xfe,\xd0>\x98\xea^H$\xe4\x87\xf6\xc4z\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x0