# Instructions
* Run the code from 'Dataset Creation' up to (but not including) 'Synthetic Data Generation'. You only need to do this once.
* Then, the 'Synthetic Data Generation' section will contain everything you need for generating synthetic data from the dataset.

# Dataset Creation
1) Download the CNN/DailyMail dataset into the folder `datasets/`. The extracted folder should already be named `cnn_dailymail`, with the CSV files at `datasets/cnn_dailymail/train.csv`, `datasets/cnn_dailymail/validation.csv` and `datasets/cnn_dailymail/test.csv` (or change the directories below as needed).
2) Create the directory `datasets/cnn_parsed`.
3) Run this section of the notebook. It removes the Daily Mail and duplicate articles, merges the train/validation/test files, and saves the result as `all.csv` (with a `split` column) in the folder above.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

# Folder holding the raw input CSV files, and folder that receives the filtered output.
# Both end with '/' because the cells below build file paths by plain string concatenation.
dataset_dir_in = 'datasets/cnn_dailymail/'
dataset_dir_out = 'datasets/cnn_parsed/'

In [3]:
# load train dataset
def load_and_parse(dataset = 'train'):
    """Load one CSV of the raw dataset and keep only unique CNN articles.

    Parameters
    ----------
    dataset : str
        Base name of the file to read; ``dataset_dir_in + dataset + '.csv'``
        is opened.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``article`` text contains the literal substring ``'CNN'``,
        with repeated ``article`` values dropped (first occurrence kept).
    """
    raw = pd.read_csv(dataset_dir_in + dataset + '.csv')
    # na=False: a missing article would otherwise put NaN into the mask and make
    # the boolean indexing below raise. regex=False: 'CNN' is a plain substring.
    is_cnn = raw['article'].str.contains('CNN', regex=False, na=False)
    return raw[is_cnn].drop_duplicates('article')

# Filter each of the three raw files; every call reads `<name>.csv` from dataset_dir_in.
df_train = load_and_parse('train')
df_test = load_and_parse('test')
df_val = load_and_parse('validation')

In [4]:
# Stack the three filtered frames into one table. ignore_index is not set, so each
# frame keeps its own row labels and the combined index can contain repeated values.
df = pd.concat([df_train, df_val, df_test])
# df_train, df_val = train_test_split(df, test_size=0.3)


In [5]:
num_articles_per_split = 128

def create_splits(df, num_articles_per_split):
    """Order the rows by ``id`` and number them in fixed-size consecutive groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an ``id`` column. It is not modified; a sorted copy is returned.
    num_articles_per_split : int
        Number of consecutive rows (after sorting) that share one split number.

    Returns
    -------
    pandas.DataFrame
        The rows sorted by ``id`` with a ``split`` column: the first
        ``num_articles_per_split`` rows get 0, the next group 1, and so on.
    """
    ordered = df.sort_values('id')
    # Position of every row in the sorted order: 0, 1, 2, ...
    row_positions = np.array(range(len(ordered)))
    # Integer division turns positions into group numbers.
    ordered['split'] = row_positions // num_articles_per_split
    return ordered

df = create_splits(df, num_articles_per_split)

In [None]:
# Write the combined table to <dataset_dir_out>/all.csv. index=False is not passed,
# so the row index is stored too and comes back as an extra unnamed column on read.
df.to_csv(dataset_dir_out + 'all.csv')

# Synthetic Data Generation

* Below, make sure the directories match with the ones you created
* Then specify 'splits' to choose which splits you want to generate data for. Kerem: 0-150, Lillian: 151-300, Emma: 301-450

In [12]:
import data
import matplotlib.pyplot as plt
import pandas as pd
from os import listdir
from os.path import isfile, join

def create_lookup_table(dir = 'datasets/synthetic/summary/'):
    """Collect the ids found in every CSV file of a folder into a lookup dict.

    Every regular file directly inside ``dir`` is read with ``pd.read_csv`` and
    the values of its ``id`` column are gathered.

    Parameters
    ----------
    dir : str
        Folder to scan. A trailing path separator is optional.
        (The name shadows the builtin ``dir`` but is kept so that existing
        keyword callers keep working.)

    Returns
    -------
    dict
        Maps each distinct id to ``1``. Empty when the folder contains no files.
    """
    # join() works with or without a trailing separator on `dir`.
    frames = [
        pd.read_csv(join(dir, name))
        for name in listdir(dir)
        if isfile(join(dir, name))
    ]

    # Nothing in the folder yet: report an empty table instead of failing.
    if not frames:
        return {}

    # Concatenate once rather than growing a frame inside the loop.
    unique_ids = pd.concat(frames)['id'].unique()
    return dict.fromkeys(unique_ids, 1)

id_lookup = create_lookup_table()

def is_processed(row):
    """Tell whether a row's id is absent from ``id_lookup``.

    NOTE: despite the name, the result is True for rows that are NOT in
    ``id_lookup`` and False for rows that are. The notebook uses it as a
    "keep this row" filter, so that behaviour is preserved.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide an ``'id'`` entry.

    Returns
    -------
    bool
        True when ``row['id']`` is not a key of ``id_lookup``.
    """
    # A plain membership test replaces the former bare `except:`, which also hid
    # unrelated failures such as a missing 'id' column or a keyboard interrupt.
    return row['id'] not in id_lookup

In [None]:
# Load the article table from datasets/cnn_parsed/all.csv.
df_all = pd.read_csv('datasets/cnn_parsed/all.csv')
print('ALL DATA')
print(df_all.shape[0])

# Keep only rows whose id is absent from id_lookup. Despite its name,
# is_processed returns True for exactly those rows.
df_all = df_all[df_all.apply(is_processed, axis = 1)]
print('REMAINING')
print(df_all.shape)

# NOTE(review): id_lookup was built by create_lookup_table() with its default folder
# ('datasets/synthetic/summary/'); confirm that is still the intended folder when task is 'qna'.
task = 'summary' # summary or qna
synthetic_data_dir = f'datasets/synthetic/{task}/'

# Pick the generator from the `data` module that matches the chosen task.
if task == 'summary':
    generator = data.SummaryGenerator()
else:
    generator = data.QnAGenerator()

# One more than the highest split number among the remaining rows.
total_splits = df_all.split.max() + 1

total_splits

ALL DATA
83558
REMAINING
(60355, 5)


653

In [16]:
import time

# Split numbers to handle in this run; range(0, 1) selects split 0 only.
splits = list(range(0, 1))
# NOTE(review): the meaning of `mode` is defined inside data.process_splits, which is not visible here.
data.process_splits(df_all, generator, splits, synthetic_data_dir = synthetic_data_dir, mode = 'all')

----- Parsing split 0 -----
----- Number of articles to parse: 34 -----
----- ELAPSED TIME -----
75.1 seconds


# Remove duplicates

{'02268ba08216bf41f800a591340fa3ca516d821b': 1,
 '02271084074dc719dfa84594eba2eae7bc5ac51d': 1,
 '02271c690c2e7876fbd1167750e91d360205b8f0': 1,
 '0228c2331812dc41f4dbac83845761d7e632d400': 1,
 '022a2cd3f444f0a362aa7d66267cb74ba5bebd94': 1,
 '022baf8154b902fb58eef255bafca90372137eca': 1,
 '022bc468b129b910cf4a664ce0c6b23cce710280': 1,
 '022e6889857ee92fc0d27fd8977d55054e78a112': 1,
 '022e6cdf6e27f7ff388003b11cda41cfcc8e7905': 1,
 '023056f02b6a6323ba1ce70e7ed269a6155d5647': 1,
 '0230dee900d5ef4d8c4228e5a9591406189736f3': 1,
 '023283c87f2cf858311d2e08990277b6ee912d03': 1,
 '023299c641b14f0acf3ebda5ecf5e137b0e507fb': 1,
 '023848d6c598e80f4ed9585cb6a40d33b2b4632d': 1,
 '02390fafd882a0c90dd2d4f810fec81a84de015f': 1,
 '02394f7713b7f8fffc20e46cb5f220c330446d78': 1,
 '02397d573fdba783fa039043e9a4d9df5755dfaa': 1,
 '023b63c15d39122daae59b4f46b4b19f72a32d05': 1,
 '023bda0457516013b519b36cdcd225c0b953cf91': 1,
 '023d900e7ec5f82ab85139be235185117c8f560f': 1,
 '023efd2b40b245ffca3a19cebc20f7a6da3114