# Instructions
* Run the code from 'Dataset Creation' up to (but not including) 'Synthetic Data Generation'. You only need to do this once.
* Then, the 'Synthetic Data Generation' section will contain everything you need for generating synthetic data from the dataset.

# Dataset Creation
1) Download the CNN/Dailymail dataset into the folder `datasets/`. The folder should be named `cnn_dailymail` already, and the train `.csv` should be in `datasets/cnn_dailymail/train.csv` (or change the directory below as needed)
2) Create the directory `datasets/cnn_parsed`.
3) Run this section of the notebook. It removes the Daily Mail articles and the duplicate articles, pools the train, validation and test files into one table, tags every article with a `split` number, and saves the result as `all.csv` in the folder above.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

# Folder holding the raw CNN/DailyMail CSV files. The trailing slash matters:
# load_and_parse builds file paths by plain string concatenation.
dataset_dir_in = 'datasets/cnn_dailymail/'
# Folder the filtered dataset is written to (also used via string concatenation).
dataset_dir_out = 'datasets/cnn_parsed/'

In [3]:
# Load one raw dataset file and keep only the unique CNN articles.
def load_and_parse(dataset = 'train', dataset_dir = None):
    """Read `<dataset_dir>/<dataset>.csv` and keep de-duplicated CNN articles.

    Args:
        dataset: Base name of the CSV file (without extension), e.g. 'train',
            'test' or 'validation'.
        dataset_dir: Folder containing the CSV. Defaults to the notebook-level
            `dataset_dir_in`. A trailing path separator is optional.

    Returns:
        DataFrame with the rows whose `article` text contains the literal
        string 'CNN', with repeated `article` values removed (first kept).
    """
    import os

    if dataset_dir is None:
        dataset_dir = dataset_dir_in

    df = pd.read_csv(os.path.join(dataset_dir, dataset + '.csv'))

    # na=False: a missing article would otherwise put NaN into the mask and
    # make the boolean indexing below raise. Such rows are simply dropped.
    # regex=False: 'CNN' is matched as a plain substring.
    is_cnn = df['article'].str.contains('CNN', na = False, regex = False)

    return df[is_cnn].drop_duplicates('article')

# Filter each of the three raw files (train / test / validation) separately.
df_train = load_and_parse('train')
df_test = load_and_parse('test')
df_val = load_and_parse('validation')

In [4]:
# Pool the three filtered frames into a single table; the original
# train/validation/test boundaries are not kept from here on.
df = pd.concat([df_train, df_val, df_test])
# Disabled alternative (random re-split of the pooled data):
# df_train, df_val = train_test_split(df, test_size=0.3)


In [5]:
# Number of consecutive (id-sorted) articles that share one split number.
num_articles_per_split = 128

def create_splits(df, num_articles_per_split):
    """Order the articles by `id` and number them in fixed-size chunks.

    Args:
        df: DataFrame with an `id` column. It is not modified.
        num_articles_per_split: How many consecutive rows share one split number.

    Returns:
        A new DataFrame sorted by `id` with a `split` column holding
        `row_position // num_articles_per_split` (0 for the first chunk,
        1 for the next, and so on).
    """
    ordered = df.sort_values('id')
    # Position of each row in the sorted order: 0, 1, 2, ...
    positions = np.array(range(len(ordered)))
    return ordered.assign(split = positions // num_articles_per_split)

# Replace the pooled frame with its id-sorted version carrying the `split` column.
df = create_splits(df, num_articles_per_split)

In [None]:
# Persist the pooled dataset. to_csv also writes the index as an unnamed first
# column, which comes back as 'Unnamed: 0' when the file is read again.
df.to_csv(dataset_dir_out + 'all.csv')

# Synthetic Data Generation

* Below, make sure the directories match with the ones you created
* Then specify 'splits' to choose which splits you want to generate data for. Kerem: 0-150, Lillian: 151-300, Emma: 301-450

In [5]:
import data
import matplotlib.pyplot as plt
import pandas as pd
from os import listdir
from os.path import isfile, join

def create_lookup_table(dir = 'datasets/synthetic/summary/'):
    """Collect the ids of all articles that already have generated data.

    Every regular file directly inside `dir` is read as a CSV and its `id`
    column is collected. Each file name is printed as it is read.

    Args:
        dir: Folder with the generated CSV files. A trailing path separator
            is optional. (The name shadows the builtin `dir`; it is kept so
            existing keyword callers continue to work.)

    Returns:
        dict mapping every distinct id found to 1. Empty when the folder
        contains no files.

    Raises:
        ValueError: if a file has no `id` column (raised by `pd.read_csv`).
    """
    id_columns = []
    for f in listdir(dir):
        loc = join(dir, f)
        if not isfile(loc):
            continue
        print(f)
        # Only the ids are needed, so skip loading the article text.
        id_columns.append(pd.read_csv(loc, usecols = ['id'])['id'])

    # Nothing generated yet: no ids are known.
    if not id_columns:
        return {}

    # Concatenate once (instead of once per file) and de-duplicate.
    return dict.fromkeys(pd.concat(id_columns).unique(), 1)

# Notebook-level lookup of ids found in the default folder; read by is_processed.
id_lookup = create_lookup_table()

def is_processed(row):
    """Row filter that selects articles which still need to be processed.

    NOTE: the return value is the opposite of what the name suggests. It is
    kept that way because the notebook uses this function as a "keep" mask to
    obtain the REMAINING rows.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with an `id` entry.

    Returns:
        False when the row's id is present in the notebook-level `id_lookup`
        (data already exists for it); True when the id is absent, or when the
        row has no `id` entry.
    """
    try:
        id_lookup[row['id']]
    except KeyError:
        # Only a missing key means "not seen yet"; other errors (and
        # KeyboardInterrupt) are no longer swallowed.
        return True
    return False

train_4.csv
train_15.csv
train_29.csv
train_409.csv
train_353.csv
train_347.csv
train_390.csv
train_384.csv
train_385.csv
train_391.csv
train_346.csv
train_352.csv
train_408.csv
train_28.csv
train_14.csv
train_5.csv
train_7.csv
train_16.csv
train_378.csv
train_344.csv
train_350.csv
train_387.csv
train_393.csv
train_392.csv
train_386.csv
train_351.csv
train_345.csv
train_379.csv
train_17.csv
train_6.csv
train_2.csv
train_13.csv
train_341.csv
train_355.csv
train_369.csv
train_382.csv
train_396.csv
train_397.csv
train_383.csv
train_368.csv
train_354.csv
train_340.csv
train_12.csv
train_3.csv
train_1.csv
train_38.csv
train_10.csv
train_356.csv
train_342.csv
train_395.csv
train_381.csv
train_380.csv
train_394.csv
train_343.csv
train_357.csv
train_11.csv
train_39.csv
train_0.csv
train_62.csv
train_318.csv
train_330.csv
train_324.csv
train_325.csv
train_331.csv
train_319.csv
train_63.csv
train_61.csv
train_49.csv
train_327.csv
train_333.csv
train_332.csv
train_326.csv
train_48.csv
train_60.cs

In [30]:
# Load the pooled dataset written by the 'Dataset Creation' section.
df_all = pd.read_csv('datasets/cnn_parsed/all.csv')
print('ALL DATA')
print(df_all.shape[0])

# Keep only the rows for which is_processed returns True, i.e. the rows whose
# id is NOT yet in id_lookup.
df_all = df_all[df_all.apply(is_processed, axis = 1)]
print('REMAINING')
print(df_all.shape)

task = 'summary' # summary or qna
# Output folder depends on the task name.
synthetic_data_dir = f'datasets/synthetic/{task}/'

# Pick the generator class from the project's `data` module that matches the task.
if task == 'summary':
    generator = data.SummaryGenerator()
else:
    generator = data.QnAGenerator()

# One more than the largest split number among the remaining rows.
total_splits = df_all.split.max() + 1

total_splits

ALL DATA
83558
REMAINING
(50047, 5)


653

In [33]:
import time

# Split numbers to generate data for in this run (200 through 209).
splits = list(range(200, 210))
# Delegates the work to the project's `data` module.
# NOTE(review): the meaning of mode='all' is defined in data.process_splits — confirm there.
data.process_splits(df_all, generator, splits, synthetic_data_dir = synthetic_data_dir, mode = 'all')

----- Parsing split 200 -----
----- Number of articles to parse: 128 -----
----- ELAPSED TIME -----
256.5 seconds
----- Parsing split 201 -----
----- Number of articles to parse: 128 -----
----- ELAPSED TIME -----
263.0 seconds
----- Parsing split 202 -----
----- Number of articles to parse: 128 -----
----- ELAPSED TIME -----
264.3 seconds
----- Parsing split 203 -----
----- Number of articles to parse: 128 -----
['[keywords] "Senate"', ' "repeal"', ' "don\'t ask', ' don\'t tell"', ' "filibuster"', ' "bipartisan"\n']
----- GOT BAD RESPONSE FOR KEYWORDS ------
----- ELAPSED TIME -----
272.7 seconds
----- Parsing split 204 -----
----- Number of articles to parse: 128 -----
['[keywords] "don\'t ask', ' don\'t tell"', ' "Gates"', ' "Pentagon"', ' "repeal"', ' "military"\n']
----- GOT BAD RESPONSE FOR KEYWORDS ------
----- ELAPSED TIME -----
282.7 seconds
----- Parsing split 205 -----
----- Number of articles to parse: 128 -----
----- ELAPSED TIME -----
267.4 seconds
----- Parsing split 206

# Create batch api

In [29]:
import json

# Write one JSON object per line (JSONL), one chat-completion request per article.
with open("batch_requests.jsonl", "w") as f:
    for i , (_, row) in enumerate(df_all.iterrows()):
        # Stop after the first 10000 rows of df_all.
        if i == 10000:
            break

        text = row['article']
        idx = row['id']

        # Record the id in the shared lookup right away, so is_processed will
        # exclude this article on later filtering — even though the request
        # has only been written to a file at this point.
        id_lookup[idx] = 1

        # The article id is embedded in custom_id so the response can be
        # matched back to its article.
        # NOTE(review): 'summary' is hard-coded here regardless of `task` — confirm.
        custom_id = f'request-summary-{idx}'

        system_prompt = generator.system_prompt
        
        messages = [
            {'role' : 'system', 'content' : system_prompt},
            {'role' : 'user', 'content' : text}
        ]

        body = {'model' : 'gpt-4o-mini', 'messages' : messages, 'max_tokens' : 2048, 'temperature' : 0.1}
        
        line = {'custom_id' : custom_id, 'method' : 'POST', 'url' : '/v1/chat/completions', 'body' : body}
        json.dump(line, f)
        f.write('\n')


In [26]:
from openai import OpenAI
client = OpenAI()

# Upload the request file for batch processing. The context manager closes
# the file handle once the upload call has returned.
with open("batch_requests.jsonl", "rb") as batch_file:
    batch_input_file = client.files.create(
        file=batch_file,
        purpose="batch"
    )

In [27]:
from openai import OpenAI
client = OpenAI()

# Start a batch job over the uploaded request file. The returned Batch object
# is the cell's displayed value; it is not stored in a variable.
client.batches.create(
  input_file_id=batch_input_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h",
  metadata={
    "description": "Summarization"
  }
)

Batch(id='batch_675b4c357e688190b7147f69fc698aab', completion_window='24h', created_at=1734036533, endpoint='/v1/chat/completions', input_file_id='file-C1tn1nDmPATA7B5gd2TQU8', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1734122933, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Summarization'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [28]:
# Show the models this API key can access.
client.models.list()

SyncPage[Model](data=[Model(id='gpt-4o', created=1715367049, object='model', owned_by='system'), Model(id='gpt-4-turbo-preview', created=1706037777, object='model', owned_by='system'), Model(id='gpt-4o-mini', created=1721172741, object='model', owned_by='system'), Model(id='gpt-4o-mini-2024-07-18', created=1721172717, object='model', owned_by='system')], object='list')

# Parse batch responses

In [None]:
import data
import json
import pandas as pd

filename = 'datasets/batch_675b4c357e688190b7147f69fc698aab_output.jsonl'

generator = data.SummaryGenerator()

# The batch output is JSONL: one JSON-encoded response per line.
with open(filename, 'r') as json_file:
    json_list = list(json_file)


results = []
for json_str in json_list:
    # Tolerate blank lines (e.g. a trailing newline at the end of the file).
    if not json_str.strip():
        continue

    try:
        result = json.loads(json_str)

        # custom_id was written as 'request-summary-<article id>'; maxsplit=2
        # keeps the id intact even if it contains '-' itself.
        article_id = result['custom_id'].split('-', 2)[2]
        text = result['response']['body']['choices'][0]['message']['content']

        out = generator.parse_response(text)
        out['id'] = article_id
        out['gpt_summary'] = out.pop('summary')
        out['gpt_keywords'] = out.pop('keywords')
    except Exception as e:
        # Report the problem, then skip this response. (Previously the print
        # came after `continue` and could never run.)
        print(e)
        continue

    results.append(out)

# Build the frame in one go from the list of records. List-valued fields such
# as the keywords stay as a single cell per row. The explicit columns keep the
# merge below working when no response could be parsed.
df_out = pd.DataFrame(results, columns = None if results else ['id', 'gpt_summary', 'gpt_keywords'])

df_all = pd.read_csv('datasets/cnn_parsed/all.csv')
df_all = df_all.rename(columns={'highlights' : 'human_summary'})

# Inner join: keep only the articles that have a parsed response.
df_merged = pd.merge(df_all, df_out, on='id', how='inner')
# Drop the index columns left over from earlier to_csv / read_csv round trips.
df_merged = df_merged.drop(columns = df_merged.columns[df_merged.columns.str.contains('^Unnamed')])
print(df_merged.shape)
df_merged.head()


['[keywords] "Lust', ' Caution"', ' "Ang Lee"', ' "Wong Chia Chi"', ' "Tony Leung"', ' "World War II"\n']
['[keywords] "CNN"', ' "Student News"', ' "September 23', ' 2011"', ' "transcript"', ' "maps"\n']
['[keywords] "Navy"', ' "don\'t ask', ' don\'t tell"', ' "gay"', ' "Lynne"', ' "military"\n']
['[keywords] "don\'t ask', ' don\'t tell"', ' "Obama administration"', ' "repeal"', ' "military"', ' "Log Cabin Republicans"\n']
['[keywords] "Lady Gaga"', ' "rally"', ' "defense bill"', ' "don\'t ask', ' don\'t tell"', ' "Senate"\n']
['[keywords] "Darren Manzella"', ' "don\'t ask', ' don\'t tell"', ' "military"', ' "discharge"', ' "sexual identity"\n']
['[keywords] "don\'t ask', ' don\'t tell"', ' "Pentagon"', ' "military"', ' "repeal"', ' "cohesion"\n']
['[keywords] "don\'t ask', ' don\'t tell"', ' "Patrick Murphy"', ' "repeal"', ' "military"', ' "discrimination"\n']
['[keywords] "don\'t ask', ' don\'t tell"', ' "repeal"', ' "military"', ' "Gates"', ' "certification"\n']
['[keywords] "Ron Pa

Unnamed: 0.1,Unnamed: 0,id,article,human_summary,split,gpt_summary,gpt_keywords
0,7037,02fbe00a8ca19dbd1ddc989bde8f387e1ea34848,Washington (CNN)Nearly 6 in 10 Americans say t...,Most Americans say businesses should not discr...,8,A recent CNN/ORC poll reveals that nearly 60% ...,"[same-sex couples, businesses, religious objec..."
1,1058,02fd53c1bbd8bed0380fdabd56558a6f64ef95be,"(CNN Student News) -- August 22, 2014 . It's a...",This page includes the show Transcript .\nUse ...,8,The CNN Student News program highlights a mira...,"[Ebola, Ferguson, statues, doctor, news]"
2,92538,0303ab6df437e8548306512852c9ec65d2c716ef,"(CNN) -- Denise Borino-Quinn, who played the w...",Denise Borino-Quinn played the wife of mob bos...,8,"Denise Borino-Quinn, known for her role as Gin...","[Denise Borino-Quinn, The Sopranos, cancer, Gi..."
3,92540,0303f5a5689817bad8af4701708b075fc9d69470,(CNN) -- House Speaker John Boehner and the Pr...,"Robert Mnookin: Boehner, Obama in dangerous ga...",8,House Speaker John Boehner and President Barac...,"[negotiation, budget, debt ceiling, Boehner, O..."
4,92541,030435cfe4a74e665130955677fdb29a177066c5,(CNN) -- A school bus wreck killed a young chi...,"Ten other children are injured, two critically...",8,A tragic school bus wreck in Indianapolis resu...,"[school bus, wreck, child, injured, investigat..."


In [44]:
# Save the merged batch results (the index is written as an extra first column).
# NOTE(review): this file goes to 'datasets/synthetic/', not 'datasets/synthetic/summary/',
# so the folder scans elsewhere in this notebook will not pick it up — confirm this is intended.
df_merged.to_csv('datasets/synthetic/batch_0.csv')

# Create Final Dataset

In [52]:
from os import listdir
from os.path import isfile, join
import data
import pandas as pd
import json

# Folder with the generated per-split CSV files. Named `summary_dir` rather
# than `dir` so the builtin `dir()` stays usable in the kernel.
summary_dir = 'datasets/synthetic/summary/'

files = [f for f in listdir(summary_dir) if isfile(join(summary_dir, f))]

# Read every file first and concatenate once, instead of re-copying the
# growing frame for each file.
frames = []
for f in files:
    print(f)
    frames.append(pd.read_csv(join(summary_dir, f)))

df = pd.concat(frames)

# Remove the bookkeeping columns: the split number and the index columns
# left over from earlier to_csv / read_csv round trips.
df = df.drop(columns = df.columns[df.columns.str.contains('^split')])
df = df.drop(columns = df.columns[df.columns.str.contains('^Unnamed')])

df.head()

train_4.csv
train_15.csv
train_29.csv
train_409.csv
all_200.csv
train_353.csv
train_347.csv
train_390.csv
train_384.csv
train_385.csv
train_391.csv
train_346.csv
train_352.csv
all_201.csv
train_408.csv
train_28.csv
train_14.csv
train_5.csv
train_7.csv
train_16.csv
all_203.csv
train_378.csv
train_344.csv
train_350.csv
train_387.csv
train_393.csv
train_392.csv
train_386.csv
train_351.csv
train_345.csv
all_202.csv
train_379.csv
train_17.csv
train_6.csv
train_2.csv
train_13.csv
train_341.csv
train_355.csv
all_206.csv
train_369.csv
train_382.csv
train_396.csv
train_397.csv
train_383.csv
train_368.csv
all_207.csv
train_354.csv
train_340.csv
train_12.csv
train_3.csv
train_1.csv
train_38.csv
train_10.csv
train_356.csv
train_342.csv
all_205.csv
train_395.csv
train_381.csv
train_380.csv
train_394.csv
all_204.csv
train_343.csv
train_357.csv
train_11.csv
train_39.csv
train_0.csv
train_62.csv
train_318.csv
train_330.csv
train_324.csv
train_325.csv
train_331.csv
train_319.csv
train_63.csv
train_61.c

Unnamed: 0,id,article,gpt_summary,gpt_keywords,human_summary
0,02268ba08216bf41f800a591340fa3ca516d821b,(CNN) -- A British tourist in the United State...,"A British tourist, John Stephen Busby, was kil...","['crash', 'vintage', 'tourist', 'Galveston', '...",Man was in the United States for his 41st wedd...
1,02271084074dc719dfa84594eba2eae7bc5ac51d,(CNN) -- The race is on to locate the latest B...,Banksy is currently showcasing his latest art ...,"['Banksy', 'New York', 'art', 'residency', 'so...",Banksy started out as a graffiti artist in wes...
2,02271c690c2e7876fbd1167750e91d360205b8f0,(CNN) -- Late one afternoon in the summer of 2...,The author reflects on her unconventional rela...,"['marriage', 'divorce', 'love', 'commitment', ...","Elizabeth Gilbert is the author of ""Eat, Pray,..."
3,0228c2331812dc41f4dbac83845761d7e632d400,(CNN) -- Rafael Nadal made light work of Marco...,Rafael Nadal showcased his prowess at the Aego...,"['Rafael Nadal', 'Aegon Championships', 'grass...",Rafael Nadal wins his first game on grass in t...
4,022a2cd3f444f0a362aa7d66267cb74ba5bebd94,"Jakarta, Indonesia (CNN) -- International flig...",International flights to Bali have been cancel...,"['Bali', 'flights', 'volcanic', 'ash', 'Mount ...",An ash cloud has affected flights to the Indon...


In [53]:
# Print, for every column, the shape of its array of distinct values.
# Comparing the counts across columns shows which columns contain repeated values.
for col in df.columns:
    print(df[col].unique().shape)

df.head()

(34768,)
(34768,)
(34768,)
(34727,)
(34530,)


Unnamed: 0,id,article,gpt_summary,gpt_keywords,human_summary
0,02268ba08216bf41f800a591340fa3ca516d821b,(CNN) -- A British tourist in the United State...,"A British tourist, John Stephen Busby, was kil...","['crash', 'vintage', 'tourist', 'Galveston', '...",Man was in the United States for his 41st wedd...
1,02271084074dc719dfa84594eba2eae7bc5ac51d,(CNN) -- The race is on to locate the latest B...,Banksy is currently showcasing his latest art ...,"['Banksy', 'New York', 'art', 'residency', 'so...",Banksy started out as a graffiti artist in wes...
2,02271c690c2e7876fbd1167750e91d360205b8f0,(CNN) -- Late one afternoon in the summer of 2...,The author reflects on her unconventional rela...,"['marriage', 'divorce', 'love', 'commitment', ...","Elizabeth Gilbert is the author of ""Eat, Pray,..."
3,0228c2331812dc41f4dbac83845761d7e632d400,(CNN) -- Rafael Nadal made light work of Marco...,Rafael Nadal showcased his prowess at the Aego...,"['Rafael Nadal', 'Aegon Championships', 'grass...",Rafael Nadal wins his first game on grass in t...
4,022a2cd3f444f0a362aa7d66267cb74ba5bebd94,"Jakarta, Indonesia (CNN) -- International flig...",International flights to Bali have been cancel...,"['Bali', 'flights', 'volcanic', 'ash', 'Mount ...",An ash cloud has affected flights to the Indon...


In [54]:
# Write the final cleaned dataset. The index is written as an unnamed first
# column, which comes back as 'Unnamed: 0' when the file is read again.
df.to_csv('datasets/synthetic/summary_clean.csv')