In [1]:
import boto3
import pandas as pd

In [2]:
# Credentials are intentionally NOT passed here, so nothing secret can end up in the
# notebook or the repo. boto3 resolves them through its default credential chain
# (e.g. the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables or the
# shared ~/.aws/credentials file). Configure one of those before running this cell.
s3 = boto3.client(
    service_name='s3',
    region_name='us-west-2',
)

In [3]:
# Run an S3 Select query against the training CSV and keep the streaming response.
# FileHeaderInfo is "NONE", so columns can only be referenced by position in the
# SQL expression.
input_serialization = {
    'CSV': {"FileHeaderInfo": "NONE", 'AllowQuotedRecordDelimiter': True},
    'CompressionType': 'NONE',
}
output_serialization = {'CSV': {}}

resp = s3.select_object_content(
    Bucket='reddit-title-generation',
    Key='dataset-train.csv',
    ExpressionType='SQL',
    Expression="SELECT * FROM s3object s LIMIT 1000",
    InputSerialization=input_serialization,
    OutputSerialization=output_serialization,
)

# Collect the raw byte payload of every 'Records' event in the response stream;
# events without a 'Records' key are skipped.
records = [
    event['Records']['Payload']
    for event in resp['Payload']
    if 'Records' in event
]

# Join the byte chunks BEFORE decoding: a multi-byte UTF-8 character can be split
# across two chunks, and decoding each chunk on its own would then raise
# UnicodeDecodeError.
file_str = b''.join(records).decode('utf-8')

In [4]:
from io import StringIO

# Wrap the query result text in an in-memory buffer and parse it as CSV,
# using the first returned row as the column header.
csv_buffer = StringIO(file_str)
train = pd.read_csv(csv_buffer, header=0)
train.head()

Unnamed: 0.1,Unnamed: 0,author,body,normalizedBody,subreddit,subreddit_id,id,content,summary
0,2099088,UghImRegistered,&gt; â€œI have friends with the same degree as m...,"> â€œI have friends with the same degree as me, ...",politics,t5_2cneq,c1v1gu3,"I have friends with the same degree as me, fro...","co-op, get some."
1,2893884,PossibleLesbian,Just a bit of background: I grew up Catholic. ...,Just a bit of background: I grew up Catholic. ...,actuallesbians,t5_2rch0,t3_185lqh,Just a bit of background: I grew up Catholic. ...,Former Catholic confused about sexuality. Has ...
2,2237635,[deleted],I myself enjoy approaching an attractive young...,I myself enjoy approaching an attractive young...,AskReddit,t5_2qh1i,t3_g0rk6,I myself enjoy approaching an attractive young...,I've noticed a lot of stuff on Reddit concerni...
3,498777,Naztash,You do realize that the contract probably has ...,You do realize that the contract probably has ...,TopGear,t5_2r9n6,cpcdljw,You do realize that the contract probably has ...,"He is not their child, but he is acting like a..."
4,1337130,BurChaBow,[](/dashiewilliamisboredofnamingemotes)\n\nI g...,ï¿¿ I got a teacher that used the most ridiculou...,MLPLounge,t5_2t403,cgej4pd,I got a teacher that used the most ridiculous ...,"Teacher likes papers, and said ""Pdf isn't the..."


In [5]:
from fastai.text.all import *
from transformers import *
from blurr.text.data.all import *
from blurr.text.modeling.all import *

# Keep only the source-text and target-summary columns
text_columns = ['content', 'summary']
train_texts = train[text_columns]

#Clean text

  squad_metric = load_metric("squad")


In [6]:
# Fetch the architecture name, config, tokenizer and model for the pretrained BART checkpoint
pretrained_model_name = "facebook/bart-large-cnn"
hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(
    pretrained_model_name,
    model_cls=BartForConditionalGeneration,
)

# Batch-time tokenization transform, configured for the summarization task
hf_batch_tfm = Seq2SeqBatchTokenizeTransform(
    hf_arch, hf_config, hf_tokenizer, hf_model, task='summarization'
)

# Preprocess the (content, summary) pairs using the token / character length limits below
preprocessor_kwargs = dict(
    text_attr='content',
    target_text_attr='summary',
    max_input_tok_length=256,
    max_target_tok_length=130,
    min_summary_char_length=30,
)
preprocessor = SummarizationPreprocessor(hf_tokenizer, **preprocessor_kwargs)
preprocessed_train = preprocessor.process_df(train_texts)

# Build the DataBlock: inputs read from 'content', targets from 'summary',
# with a random train/validation split
blocks = (Seq2SeqTextBlock(batch_tokenize_tfm=hf_batch_tfm), noop)
dblock = DataBlock(
    blocks=blocks,
    get_x=ColReader('content'),
    get_y=ColReader('summary'),
    splitter=RandomSplitter(),
)

# Batch size can be changed here
batch_size = 8
dls = dblock.dataloaders(preprocessed_train, bs=batch_size)

  final_df = final_df.append(self._process_df_batch(batch_df))
  final_df = final_df.append(self._process_df_batch(batch_df))


Due to IPython and Windows limitation, python multiprocessing isn't available now.
So `number_workers` is changed to 0 to avoid getting stuck




In [7]:
# Metric configuration handed to Seq2SeqMetricsCallback: ROUGE-1/2/L (with stemming)
# and BERTScore precision/recall/F1.
seq2seq_metrics = {
    'rouge': {
        'compute_kwargs': {'rouge_types': ["rouge1", "rouge2", "rougeL"], 'use_stemmer': True},
        'returns': ["rouge1", "rouge2", "rougeL"],
    },
    'bertscore': {
        # The Reddit posts and summaries are English, so BERTScore must be told 'en'
        # (this was 'fr', which does not match the data).
        'compute_kwargs': {'lang': 'en'},
        'returns': ["precision", "recall", "f1"],
    },
}

# Wrap the Hugging Face model for use with the Learner
model = BaseModelWrapper(hf_model)
learn_cbs = [BaseModelCallback]
# Metrics callback list is defined here but not attached to the Learner below
fit_cbs = [Seq2SeqMetricsCallback(custom_metrics=seq2seq_metrics)]

# Build the Learner (ranger optimizer, flattened cross-entropy loss) and switch it to fp16
learn = Learner(
    dls,
    model,
    opt_func=ranger,
    loss_func=CrossEntropyLossFlat(),
    cbs=learn_cbs,
    splitter=partial(blurr_seq2seq_splitter, arch=hf_arch),
).to_fp16()

In [8]:
# Load the saved 'bart_reddit_summary' checkpoint into `learn`; the call's return
# value is kept as `learner` (presumably the same Learner object; confirm if relied on).
checkpoint_name = 'bart_reddit_summary'
learner = learn.load(checkpoint_name)

In [9]:
# Generate one summary (30-50 tokens, no early stopping) for the first training example
sample_text = train_texts.iloc[0].content
outputs = learn.blurr_generate(
    sample_text,
    early_stopping=False,
    num_return_sequences=1,
    min_length=30,
    max_length=50,
)

# Print each returned prediction under a numbered banner
for idx, o in enumerate(outputs):
    print(f'=== Prediction {idx+1} ===\n{o}\n')

=== Prediction 1 ===
{'generated_texts': ' Kyle Bishop, 23, has spent the last two years waiting tables, delivering beer, working at a bookstore and entering data. â€œItâ€™s more about luck than anything else. I have friends with the same degree'}

