In [1]:
import pandas as pd
import json

# --- Load inputs -------------------------------------------------------------
# The statistics table comes from CSV, the interview records from JSON.
stats_df = pd.read_csv('data.csv')
with open('processed_interviews.json', 'r', encoding='utf-8') as f:
    interviews = json.load(f)

# --- Build the join key ------------------------------------------------------
# Reduce the stats 'Date' column to plain calendar dates so it can be matched
# against the interview dates constructed below.
stats_df['date'] = pd.to_datetime(stats_df['Date']).dt.date

# --- Interviews as a DataFrame -----------------------------------------------
# One record per interview, keyed by its calendar date.
interview_data = [
    {
        'date': pd.to_datetime(entry['date']).date(),
        'title': entry['title'],
        'url': entry['url'],
        'qa': entry['qa'],
    }
    for entry in interviews
]
interviews_df = pd.DataFrame(interview_data)

# --- Left-join and export ----------------------------------------------------
# how='left' keeps every stats row, including those with no interview that day.
merged_df = stats_df.merge(interviews_df, how='left', on='date')
merged_df.to_csv('merged_data.csv', index=False)

print("Merged dataset saved as merged_data.csv")


Merged dataset saved as merged_data.csv


In [6]:
import pandas as pd
import json

# 1. Load the data
stats_df = pd.read_csv('data.csv')

with open('processed_interviews.json', 'r', encoding='utf-8') as f:
    interviews = json.load(f)

# 2. Preprocess dates
# errors='coerce' turns unparseable dates into NaT; such rows are skipped in step 4.
stats_df['date'] = pd.to_datetime(stats_df['Date'], errors='coerce').dt.date

# Only keep the necessary game stats columns. The list is defined once so the
# column selection here and the JSON export in step 4 cannot drift apart.
STAT_COLUMNS = ['Result', 'MP', 'PTS', 'FG%', '3P%', 'FT%', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF']
stats_df = stats_df[['date'] + STAT_COLUMNS]

# Extract interviews into a DataFrame
interview_data = []
for entry in interviews:
    interview_data.append({
        'date': pd.to_datetime(entry['date'], errors='coerce').date(),
        'questions': list(entry['qa']['questions'].values()),
        'answers': list(entry['qa']['answers'].values())
    })
# Naming the columns explicitly keeps the 'date' merge key present even when
# the interview list is empty (otherwise the merge below raises KeyError).
interviews_df = pd.DataFrame(interview_data, columns=['date', 'questions', 'answers'])

# 3. Merge stats and interviews based on date
merged_df = stats_df.merge(interviews_df, how='left', on='date')


def _json_safe(value):
    """Convert one scalar cell value into something ``json.dump`` emits as valid JSON.

    Missing values (NaN/NaT/None) become ``None`` so they are written as
    ``null`` rather than the non-standard ``NaN`` token; numpy scalars are
    unwrapped to their plain Python equivalents. Anything else is returned
    unchanged.
    """
    if pd.isna(value):
        return None
    if hasattr(value, 'item'):
        return value.item()
    return value


# 4. Format and save as JSON
merged_data = []
for _, row in merged_df.iterrows():
    # Rows whose date could not be parsed have no usable key; drop them.
    if pd.isna(row['date']):
        continue
    # Stats rows without a matching interview carry NaN here, not a list.
    questions = row['questions'] if isinstance(row['questions'], list) else []
    answers = row['answers'] if isinstance(row['answers'], list) else []
    merged_entry = {
        'date': row['date'].isoformat(),
        'game_stats': {column: _json_safe(row[column]) for column in STAT_COLUMNS},
        'questions': questions,
        'answers': answers
    }
    merged_data.append(merged_entry)

with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(merged_data, f, ensure_ascii=False, indent=2)

print("Merged dataset saved as data.json")

Merged dataset saved as data.json


In [3]:
import json

# Both paths name the same file, so the JSON is read from and written back to data.json.
INPUT_FILE = "data.json"
OUTPUT_FILE = "data.json"

def remove_percentage_fields(input_path, output_path, fields=('3P%', 'FT%')):
    """Delete selected keys from every entry's ``game_stats`` and re-save the JSON.

    Args:
        input_path: Path of a UTF-8 JSON file containing a list of entry dicts.
        output_path: Path the cleaned list is written to. It may be the same
            as ``input_path``; the input is fully loaded and closed before
            the output is opened for writing.
        fields: Key or iterable of keys to remove from each ``game_stats``
            mapping. Defaults to ``('3P%', 'FT%')``, the original behaviour.

    Entries whose ``game_stats`` is missing or is not a dict (for example
    JSON ``null``) are written back unchanged instead of raising.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(fields, str):
        fields = (fields,)

    # Load the data
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Remove the requested fields from each entry's game_stats
    for entry in data:
        stats = entry.get('game_stats')
        if not isinstance(stats, dict):
            continue
        for field in fields:
            stats.pop(field, None)

    # Write the cleaned data back out
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

# Run the clean-up when this cell/script is executed directly.
if __name__ == "__main__":
    remove_percentage_fields(input_path=INPUT_FILE, output_path=OUTPUT_FILE)
    print(f"Written cleaned data (without 3P% and FT%) to {OUTPUT_FILE}")


Written cleaned data (without 3P% and FT%) to data.json


In [4]:
import json

# Source of the merged entries.
INPUT_FILE = "data.json"
# Destination for the filtered entries; set to 'data.json' to overwrite the source.
OUTPUT_FILE = "cleaned_data.json"

def filter_entries(input_path, output_path):
    """Copy only the entries that carry interview content.

    Reads a JSON list from ``input_path`` and writes to ``output_path`` the
    entries whose ``questions`` and ``answers`` values are both lists with at
    least one element. Order is preserved.
    """
    def _is_nonempty_list(value):
        # True only for real lists holding one or more items.
        return isinstance(value, list) and len(value) > 0

    with open(input_path, 'r', encoding='utf-8') as src:
        records = json.load(src)

    kept = []
    for record in records:
        has_questions = _is_nonempty_list(record.get('questions'))
        has_answers = _is_nonempty_list(record.get('answers'))
        if has_questions and has_answers:
            kept.append(record)

    with open(output_path, 'w', encoding='utf-8') as dst:
        json.dump(kept, dst, indent=2, ensure_ascii=False)

# Run the filter when this cell/script is executed directly.
if __name__ == "__main__":
    filter_entries(input_path=INPUT_FILE, output_path=OUTPUT_FILE)
    print(f"Filtered entries written to {OUTPUT_FILE}")


Filtered entries written to cleaned_data.json


In [5]:
import json

# Entries with interview content go in; one record per question/answer pair comes out.
INPUT_FILE = "cleaned_data.json"
OUTPUT_FILE = "dataset.json"

def transform_dataset(input_path, output_path):
    """Flatten per-game entries into one record per question/answer pair.

    Args:
        input_path: Path of a UTF-8 JSON file containing a list of entries,
            each with ``date``, ``game_stats``, ``questions`` and ``answers``.
        output_path: Path the flattened list is written to as UTF-8 JSON.

    For every entry, ``game_stats['Result']`` is reduced to its first
    whitespace-separated token (e.g. ``"W (+12)"`` -> ``"W"``). A ``Result``
    that is missing or is not a string (JSON ``null`` / ``NaN``) becomes
    ``""`` instead of raising. Questions and answers are paired positionally;
    if the two lists differ in length the surplus items are dropped.
    """
    # Load original data
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    new_entries = []
    for entry in data:
        date = entry.get("date")
        # `or {}` also covers an explicit null, not just a missing key.
        stats = entry.get("game_stats") or {}

        # Extract just 'W' or 'L' from the full result string.
        # Only strings can be stripped; anything else counts as "no result".
        raw_result = stats.get("Result")
        full_result = raw_result.strip() if isinstance(raw_result, str) else ""
        wl = full_result.split()[0] if full_result else ""

        # Rebuild game_stats with normalized Result (placed last, as before)
        cleaned_stats = {k: v for k, v in stats.items() if k != "Result"}
        cleaned_stats["Result"] = wl

        # Split out each question/answer pair
        questions = entry.get("questions") or []
        answers = entry.get("answers") or []
        for q, a in zip(questions, answers):
            new_entries.append({
                "date":       date,
                "game_stats": cleaned_stats,
                "question":   q,
                "answer":     a
            })

    # Write the new flat dataset
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(new_entries, f, indent=2, ensure_ascii=False)

if __name__ == '__main__':
    transform_dataset(INPUT_FILE, OUTPUT_FILE)
    # Count the lines of the file just written. Read it explicitly as UTF-8
    # (the encoding transform_dataset writes with) rather than the locale
    # default, and use a context manager so the handle is always closed.
    with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
        line_count = len(f.read().splitlines())
    print(f"Saved transformed dataset ({line_count} lines) to {OUTPUT_FILE}")


Saved transformed dataset (20283 lines) to dataset.json


In [6]:
import json
import csv

# JSON dataset to read and the CSV file to produce from it.
INPUT_JSON = "dataset.json"
OUTPUT_CSV = "dataset.csv"

def json_to_csv(input_path, output_path):
    """Flatten the per-question JSON dataset into a CSV file.

    Args:
        input_path: Path of a UTF-8 JSON file containing a list of records,
            each with ``date``, ``game_stats``, ``question`` and ``answer``.
        output_path: Path of the CSV file to write (UTF-8).

    The CSV columns are ``date``, then every key found in any record's
    ``game_stats`` (in first-seen order), then ``question`` and ``answer``.
    Stats a record does not have are written as empty cells. Newlines inside
    questions and answers are replaced by spaces and the text is stripped;
    a ``null`` question or answer becomes an empty cell.

    If the input list is empty, a message is printed and no file is written.
    """
    def _single_line(value):
        # None becomes an empty cell; everything else is rendered as text
        # with newlines turned into spaces and outer whitespace removed.
        if value is None:
            return ''
        return str(value).replace('\n', ' ').strip()

    # Load the JSON data
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if not data:
        print("No data found in", input_path)
        return

    # Collect the stat columns from ALL records, not just the first one, so a
    # key that first appears later is not silently dropped. dict.fromkeys
    # de-duplicates while keeping first-seen order.
    game_stats_keys = list(dict.fromkeys(
        key
        for entry in data
        for key in (entry.get('game_stats') or {})
    ))
    fieldnames = ['date'] + game_stats_keys + ['question', 'answer']

    # Write CSV
    with open(output_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()

        for entry in data:
            # `or {}` also covers an explicit null game_stats.
            stats = entry.get('game_stats') or {}

            row = {'date': entry.get('date', '')}
            # flatten game_stats
            for key in game_stats_keys:
                row[key] = stats.get(key, '')

            # Q/A
            row['question'] = _single_line(entry.get('question', ''))
            row['answer'] = _single_line(entry.get('answer', ''))

            writer.writerow(row)

    print(f"Wrote {len(data)} rows to {output_path}")

# Run the conversion when this cell/script is executed directly.
if __name__ == "__main__":
    json_to_csv(input_path=INPUT_JSON, output_path=OUTPUT_CSV)


Wrote 1193 rows to dataset.csv
