# Dataset Creation
1) Download the CNN/Dailymail dataset into the folder `datasets/`. The folder should be named `cnn_dailymail` already, and the train `.csv` should be in `datasets/cnn_dailymail/train.csv` (or change the directory below as needed)
2) create the directory `datasets/cnn_parsed`
3) Run this section of the notebook. This should remove all the dailymail and duplicate articles and save the new train test val split into the folder above.

In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

dataset_dir_in = 'datasets/cnn_dailymail/'
dataset_dir_out = 'datasets/cnn_parsed/'

In [22]:
# load train dataset
def load_and_parse(dataset = 'train'):
    df = pd.read_csv(dataset_dir_in + dataset + '.csv')
    df = df[df.article.str.contains('CNN')]
    df = df.drop_duplicates('article')
    return df

df_train = load_and_parse('train')
df_test = load_and_parse('test')
df_val = load_and_parse('validation')

In [23]:
df = pd.concat([df_train, df_val])
df_train, df_val = train_test_split(df, test_size=0.3)


In [35]:
num_articles_per_split = 128

def create_splits(df, num_articles_per_split):
    splits = np.array(range(df.shape[0]))
    splits = splits // num_articles_per_split
    df['split'] = splits
    return df

df_train, df_val = create_splits(df_train, num_articles_per_split), create_splits(df_val, num_articles_per_split)

In [36]:
df_train.to_csv(dataset_dir_out + 'train.csv')
df_val.to_csv(dataset_dir_out + 'val.csv')
df_test.to_csv(dataset_dir_out + 'test.csv')

# Synthetic Data Test

In [63]:
import data
import pandas as pd
import matplotlib.pyplot as plt
# test the summary generator

df_train = pd.read_csv('datasets/cnn_parsed/train.csv')

task = 'summary' # summary or qna
synthetic_data_dir = f'datasets/synthetic/{task}/'

if task == 'summary':
    generator = data.SummaryGenerator()
else:
    generator = data.QnAGenerator()

total_splits = df_train.split.max() + 1

total_splits

451

In [45]:
import time

cur_split = 0
df_parse = df_train[df_train.split == cur_split] # start with the first split
print('----- Number of articles to parse -----')
print(df_parse.shape[0])

start_time = time.time()

results = []
for i, (idx, row) in enumerate(df_parse.iterrows()):
    if (i+1) % 16 == 0:
        print(f'Sample {i+1}') 
    id = row.id
    article = row.article
    human_summary = row.highlights
    
    try:
        out = generator(article)
        out['id'] = id
        out['article'] = article
        out['gpt_summary'] = out.pop('summary')
        out['gpt_keywords'] = out.pop('keywords')
        out['human_summary'] = human_summary
        results.append(out)
    except Exception as e:
        print(e)

end_time = time.time()

print('----- ELAPSED TIME -----')
print(f'{end_time - start_time:0.1f} seconds')

results[0]

----- Number of articles to parse -----
128
Sample 16
Sample 32
Sample 48
Sample 64
Sample 80
Sample 96
Sample 112
Sample 128
----- ELAPSED TIME -----
208.3 seconds


{'id': '98e9ebb587845753ab2f26af3e278e28c9311892',
 'gpt_summary': 'The tragic case of John Crawford III, who was shot by police while holding a toy BB gun in Walmart, highlights the ongoing issues of racism and implicit bias in America. Despite the presence of video evidence that could clarify the events leading to his death, Walmart has yet to release the tape, which is crucial for confronting the realities of racial bias. This incident underscores the need for society to openly acknowledge and address the lingering effects of racism, as well as the disparities in how individuals are treated based on their race.',
 'gpt_keywords': ['John Crawford III',
  'police',
  'racism',
  'implicit bias',
  'Walmart'],
 'human_summary': 'John Crawford III, who was shopping, was shot by a white police officer in Walmart .\nVan Jones: There is video evidence and Walmart should release the tape to the public .\nHe says racism is not a thing of the past; implicit bias lurks in our minds .\nJones: W

In [None]:
time_per_article = (end_time-start_time) / df_parse.shape[0]
total_time = time_per_article * (df_train.shape[0] + df_val.shape[0])
print(f'Estimated total time: {(total_time/3600):0.2f}hrs')

num_tokens = 423348-273000

Estimated total time: 37.25hrs


0.779368344050902

In [None]:
# save results to dataframe
df_out = pd.DataFrame(columns = results[0].keys())
for result in results:
    for key in result.keys():
        result[key] = [result[key]]
    df_row = pd.DataFrame.from_dict(result)
    df_out = pd.concat([df_out, df_row])
df_out.head()

Unnamed: 0,id,article,gpt_summary,gpt_keywords,human_summary
0,98e9ebb587845753ab2f26af3e278e28c9311892,(CNN) -- When a black man dies at the hands of...,"The tragic case of John Crawford III, who was ...","[John Crawford III, police, racism, implicit b...","John Crawford III, who was shopping, was shot ..."
0,d698437338b33652a8260ec96df5a8b1cfadbf39,(CNN) -- Concerns about an impending terrorist...,A recent CNN poll indicates that concerns abou...,"[terrorism, poll, Americans, Iraq, president]",Poll: About 1 in 10 say terrorism is the most ...
0,38cc920f6a5326b056ad0dc08a8f5e63ee6f25ec,"Havana, Cuba (CNN) -- A member of the ""Cuban F...","Rene Gonzalez, a member of the Cuban Five spy ...","[Cuban Five, citizenship, return, Gonzalez, re...",'Cuban Five' renounces U.S. citizenship as con...
0,320f5735028777e4c4356c8292a704831240b9e8,(CNN) -- Two ships broke free Tuesday from the...,"Two ships, the Russian research vessel Akademi...","[Antarctic, ships, ice, rescue, researchers]",NEW: Cracks in the ice allowed a Russian resea...
0,a94802555098c2d71e68cedfbead6621d728b19f,"(CNN) -- Suppose that shortly after 9/11, when...",The text critiques President Bush's decision t...,"[military commissions, federal court, justice,...",Obama administration now reconsidering trying ...


In [64]:
df_out.to_csv(synthetic_data_dir + f'train_{cur_split}.csv')