# Instructions
* Run the code starting from 'Dataset Creation' up to Synthetic Data Generation. You only need to do this once
* Then, the 'Synthetic Data Generation' section will contain everything you need for generating synthetic data from the dataset.

# Dataset Creation
1) Download the CNN/Dailymail dataset into the folder `datasets/`. The folder should be named `cnn_dailymail` already, and the train `.csv` should be in `datasets/cnn_dailymail/train.csv` (or change the directory below as needed)
2) create the directory `datasets/cnn_parsed`
3) Run this section of the notebook. This should remove all the dailymail and duplicate articles and save the new train test val split into the folder above.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

dataset_dir_in = 'datasets/cnn_dailymail/'
dataset_dir_out = 'datasets/cnn_parsed/'

# load train dataset
def load_and_parse(dataset = 'train'):
    df = pd.read_csv(dataset_dir_in + dataset + '.csv')
    df = df[df.article.str.contains('CNN')]
    df = df.drop_duplicates('article')
    return df

df_train = load_and_parse('train')
df_test = load_and_parse('test')
df_val = load_and_parse('validation')

df = pd.concat([df_train, df_val, df_test])

num_articles_per_split = 128 # 128

def create_splits(df, num_articles_per_split):
    df = df.sort_values('id')
    splits = np.array(range(df.shape[0]))
    splits = splits // num_articles_per_split
    df['split'] = splits
    return df

df = create_splits(df, num_articles_per_split)

df.to_csv(dataset_dir_out + 'all.csv')

# Synthetic Data Generation

* Below, make sure the directories match with the ones you created
* Then specify 'splits' to choose which splits you want to generate data for. Kerem: 0-150, Lillian: 151-300, Emma: 301-450

In [1]:
import data
import matplotlib.pyplot as plt
import pandas as pd
from os import listdir
from os.path import isfile, join

task = 'qna' # qna or summary

out_dir = f'datasets/synthetic/{task}/'

# creates lookup table for data already processed
def create_lookup_table(dir = out_dir):
    files = [f for f in listdir(dir) if isfile(join(dir, f))]

    lookup_df = None
    for f in files:
        print(f)
        loc = dir + f
        if lookup_df is None:
            lookup_df = pd.read_csv(loc)
        else:
            df2 = pd.read_csv(loc)
            lookup_df = pd.concat([lookup_df, df2])

    if lookup_df is None:
        return {}

    unique_ids = lookup_df.id.unique()
    id_lookup = {}
    for idx in unique_ids:
        id_lookup[idx] = 1

    return id_lookup

id_lookup = create_lookup_table(dir = out_dir)

def is_processed(row):
    try:
        idx = id_lookup[row['id']]
        return False
    except:
        return True

In [2]:
# read data and remove all ids that are already processed
df_all = pd.read_csv('datasets/cnn_parsed/all.csv')

print('ALL DATA')
print(df_all.shape[0])

df_all = df_all[df_all.apply(is_processed, axis = 1)]
print('REMAINING')
print(df_all.shape)

synthetic_data_dir = f'datasets/synthetic/{task}/'

if task == 'summary':
    generator = data.SummaryGenerator()
else:
    generator = data.QnAGenerator()

total_splits = df_all.split.max() + 1

total_splits

ALL DATA
83558
REMAINING
(83558, 5)


5223

Below, we make the API requests and parse the output in real time. For batch processing, see the next sections.

In [3]:
import time

# ! This is for real time processing ! 
# ! See below for batches !
#
splits = list(range(0, 1))
data.process_splits(df_all, generator, splits, synthetic_data_dir = synthetic_data_dir, mode = 'all')

----- Parsing split 0 -----
----- Number of articles to parse: 16 -----
----- ELAPSED TIME -----
77.4 seconds


# Create batch api (instead of the above)

Initially create the `.jsonl` file which will be uploaded to OpenAI files. 

In [None]:
import json

with open(f"batch_requests_{task}.jsonl", "w") as f:
    for i , (_, row) in enumerate(df_all.iterrows()):
        if i == 10000:
            break

        text = row['article']
        idx = row['id']

        id_lookup[idx] = 1

        custom_id = f'request-{task}-{idx}'

        system_prompt = generator.system_prompt
        
        messages = [
            {'role' : 'system', 'content' : system_prompt},
            {'role' : 'user', 'content' : text}
        ]

        body = {'model' : 'gpt-4o-mini', 'messages' : messages, 'max_tokens' : 2048, 'temperature' : 0.1}
        
        line = {'custom_id' : custom_id, 'method' : 'POST', 'url' : '/v1/chat/completions', 'body' : body}
        json.dump(line, f)
        f.write('\n')


Then, upload the file created above with all the requests to OpenAI.

In [None]:
from openai import OpenAI
client = OpenAI()

batch_input_file = client.files.create(
file=open(f"batch_requests_{task}.jsonl", "rb"),
purpose="batch"
)

Start the batch processing. Make sure the following outputs `gpt-4o-mini` and `gpt-4o-mini-2024-07-18` 

In [None]:
client.models.list()

In [None]:
from openai import OpenAI
client = OpenAI()

client.batches.create(
  input_file_id=batch_input_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h",
  metadata={
    "description": f"{task}"
  }
)

Batch(id='batch_675b4c357e688190b7147f69fc698aab', completion_window='24h', created_at=1734036533, endpoint='/v1/chat/completions', input_file_id='file-C1tn1nDmPATA7B5gd2TQU8', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1734122933, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Summarization'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

# Parse batch responses
Once the batch processing is done (check through the OpenAI platform dashboard), download the output file to a proper location and parse the output using the generator.

In [None]:
import data
import json
import pandas as pd

filename = 'datasets/batch_675b4c357e688190b7147f69fc698aab_output.jsonl'

generator = data.SummaryGenerator() if task == 'summary' else data.QnAGenerator()

with open(filename, 'r') as json_file:
    json_list = list(json_file)


results = []
for json_str in json_list:
    result = json.loads(json_str)

    id = result['custom_id'].split('-')[2]
    text = result['response']['body']['choices'][0]['message']['content']
    
    try:
        out = generator.parse_response(text)
       
        out['id'] = id
        out['gpt_summary'] = out.pop('summary')

        if task == 'summary':
            out['gpt_keywords'] = out.pop('keywords')
        else:
            out['qna'] = out.pop('qna')
        results.append(out)
    except Exception as e:
        continue
        print(e)

df_out = pd.DataFrame(columns = results[0].keys())
for result in results:
    for key in result.keys():
        result[key] = [result[key]] # otherwise the lists are ignored
    df_row = pd.DataFrame.from_dict(result)
    df_out = pd.concat([df_out, df_row])
    
df_all = pd.read_csv('datasets/cnn_parsed/all.csv')
df_all = df_all.rename(columns={'highlights' : 'human_summary'})

df_merged = pd.merge(df_all, df_out, on='id', how='inner')
df_merged.drop(df_merged.columns[df_merged.columns.str.contains('^Unnamed')], axis=1, inplace=True)
print(df_merged.shape)
df_merged.head()


IndexError: list index out of range

In [None]:
# save the outputs
# make sure you change the filename below every time you run batch to avoid overriding
df_merged.to_csv(f'datasets/synthetic/{task}/batch_0.csv')

# Create Final Dataset

In [4]:
from os import listdir
from os.path import isfile, join
import data
import pandas as pd
import json

dir = f'datasets/synthetic/{task}/'

files = [f for f in listdir(dir) if isfile(join(dir, f))]

df = None
for f in files:
    print(f)
    loc = dir + f
    if df is None:
        df = pd.read_csv(loc)
    else:
        df2 = pd.read_csv(loc)
        df = pd.concat([df, df2])

df.drop(df.columns[df.columns.str.contains('^split')], axis = 1, inplace=True)
df.drop(df.columns[df.columns.str.contains('^Unnamed')], axis=1, inplace=True)

df.head()

all_0.csv


Unnamed: 0,id,article,human_summary,gpt_summary,qna
0,000128cbd36642ced67ac90bd7d4d1dd5e8cf554,"December 19, 2014 . CNN Student News is wrappi...",This page includes the show Transcript .\nUse ...,CNN Student News is concluding 2014 with a re...,"[{""Q"": "" Did CNN Student News cover ten intern..."
1,0001f1fcec4ca8bc7e278607ba0e31e5cc046e66,Democratic Republic of Congo (CNN) -- Our frie...,Ashley Judd tells story of girl who was victi...,"The text discusses the plight of Kika, a surv...","[{""Q"": "" Is Kika a survivor of gender violence..."
2,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...,"Ralph Mata, a lieutenant in the Miami-Dade Po...","[{""Q"": "" Was Ralph Mata a police officer?\n"", ..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...,President Vladimir Putin has committed Crimea...,"[{""Q"": "" Did President Putin commit Crimea to ..."
4,000424152bce9d9f36cb43884dacf16b43052463,(CNN) -- The powerful but compact Hurricane Ra...,NEW: Raymond weakens slightly as it moves towa...,"Hurricane Raymond, a Category 3 storm with wi...","[{""Q"": "" Is Hurricane Raymond currently a Cate..."


In [5]:
# sanity check
for col in df.columns:
    print(df[col].unique().shape)

df.head()

(16,)
(16,)
(16,)
(16,)
(16,)


Unnamed: 0,id,article,human_summary,gpt_summary,qna
0,000128cbd36642ced67ac90bd7d4d1dd5e8cf554,"December 19, 2014 . CNN Student News is wrappi...",This page includes the show Transcript .\nUse ...,CNN Student News is concluding 2014 with a re...,"[{""Q"": "" Did CNN Student News cover ten intern..."
1,0001f1fcec4ca8bc7e278607ba0e31e5cc046e66,Democratic Republic of Congo (CNN) -- Our frie...,Ashley Judd tells story of girl who was victi...,"The text discusses the plight of Kika, a surv...","[{""Q"": "" Is Kika a survivor of gender violence..."
2,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...,"Ralph Mata, a lieutenant in the Miami-Dade Po...","[{""Q"": "" Was Ralph Mata a police officer?\n"", ..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...,President Vladimir Putin has committed Crimea...,"[{""Q"": "" Did President Putin commit Crimea to ..."
4,000424152bce9d9f36cb43884dacf16b43052463,(CNN) -- The powerful but compact Hurricane Ra...,NEW: Raymond weakens slightly as it moves towa...,"Hurricane Raymond, a Category 3 storm with wi...","[{""Q"": "" Is Hurricane Raymond currently a Cate..."


In [8]:
df.to_csv(f'datasets/synthetic/{task}_clean.csv')

In [None]:
# json.loads(df['qna'].iloc[0])

[{'Q': ' Did CNN Student News cover ten international stories in their report?  \n',
  'A': True},
 {'Q': ' Will CNN Student News resume its program on January 5, 2015?  \n',
  'A': True},
 {'Q': ' Is there a Weekly Newsquiz available on the page?  \n', 'A': True},
 {'Q': ' Can students younger than 13 request to be mentioned in the CNN Student News Roll Call?  \n',
  'A': False},
 {'Q': ' Did the staff at CNN Student News thank their audience for the past year?  \n',
  'A': True}]