# Master Format Converter & Imputation Audit

This notebook implements the canonical conversion pipeline that makes the full master
training dataset match the 10k sample format (uniform `created_at`, imputation
of missing timestamps recorded via the `created_at_imputed` and
`created_at_imputed_method` audit flags). Use it to run the conversion interactively
and to validate the produced master file before writing the full dataset.


In [None]:
# Section 1: Import Required Libraries
import pandas as pd
import numpy as np
from pathlib import Path
import logging
import argparse
import matplotlib.pyplot as plt
import seaborn as sns

logging.basicConfig(level=logging.INFO)


In [None]:
# Section 2: Load and Inspect Master Training Sample
# Prefer the imputed 10k sample; when it is absent, fall back to the
# non-imputed sample (no existence check is made on the fallback).
SAMPLE_PATH = Path('../master_training_data/master_training_sample_10kv3_imputed.csv')
SAMPLE_PATH = SAMPLE_PATH if SAMPLE_PATH.exists() else Path('../master_training_data/master_training_sample_10kv3.csv')

print('Loading sample from', SAMPLE_PATH)
df_sample = pd.read_csv(SAMPLE_PATH, low_memory=False)

# Basic shape / schema overview.
print('Rows:', df_sample.shape[0])
print('Columns:', df_sample.columns.tolist())

# Count values that are missing or cannot be parsed as timestamps.
print('\ncreated_at NA:', pd.to_datetime(df_sample['created_at'], errors='coerce').isnull().sum())

# Header line and the first three rows, each on its own line.
print('\nSample rows (head 3):', df_sample.head(3).to_string(), sep='\n')

In [None]:
# Section 3: Define Expected Output Schema
# Column order below is the order expected in the converted master file.
EXPECTED_COLUMNS = [
    # free text
    'text',
    # per-emotion label columns
    'emotion_fear',
    'emotion_sadness',
    'emotion_anger',
    'emotion_nervousness',
    'emotion_disgust',
    'emotion_surprise',
    'emotion_confusion',
    'emotion_caring',
    'emotion_grief',
    'emotion_disappointment',
    'emotion_joy',
    'emotion_relief',
    'emotion_neutral',
    # record-level labels and provenance
    'event_type',
    'informativeness',
    'crisis_label',
    'source_dataset',
    # timestamp plus imputation audit flags
    'created_at',
    'created_at_imputed',
    'created_at_imputed_method',
]

print('Expected columns count:', len(EXPECTED_COLUMNS))


def validate_schema(df, expected_columns=None):
    """Compare a DataFrame's columns against an expected schema.

    Args:
        df: DataFrame whose column labels are checked.
        expected_columns: Optional iterable of column names to validate
            against. When omitted, the module-level ``EXPECTED_COLUMNS``
            is used, so existing single-argument calls behave as before.

    Returns:
        A ``(missing, extra)`` tuple of lists: ``missing`` holds expected
        columns absent from ``df`` (in schema order) and ``extra`` holds
        columns of ``df`` that are not in the schema (in frame order).
    """
    expected = list(EXPECTED_COLUMNS if expected_columns is None else expected_columns)
    # Sets keep each membership test O(1) instead of rescanning a list/Index.
    present = set(df.columns)
    allowed = set(expected)
    missing = [col for col in expected if col not in present]
    extra = [col for col in df.columns if col not in allowed]
    return missing, extra

# Show which expected columns the loaded sample lacks and which it adds.
sample_schema_diff = validate_schema(df_sample)
print('Missing/extra in sample:', sample_schema_diff)

In [None]:
# Section 4: Implement Field-wise Transformation Functions

def normalize_text(s: str) -> str:
    """Trim the input and collapse every whitespace run to a single space.

    Missing values (anything ``pd.isna`` reports as NA) yield an empty
    string; any other value is converted with ``str()`` first.
    """
    # An empty token list joins to '', which covers the missing-value case.
    tokens = [] if pd.isna(s) else str(s).split()
    return ' '.join(tokens)


def parse_created_at(s: str):
    """Normalize a timestamp value to a ``'%Y-%m-%d %H:%M:%S'`` string.

    Returns ``None`` when the input is missing, cannot be parsed, or
    parsing/formatting raises for any reason.
    """
    if pd.isna(s):
        return None
    try:
        parsed = pd.to_datetime(s, errors='coerce')
        # NOTE(review): the format string carries no offset, so a timezone-aware
        # value is written with its own wall-clock time and the offset is dropped
        # without converting to UTC - confirm this is the intended master format.
        return None if pd.isna(parsed) else parsed.strftime('%Y-%m-%d %H:%M:%S')
    except Exception:
        # Best-effort by design: an unparseable value becomes None, not an error.
        return None

# Quick smoke tests: fail at cell execution time if either helper regresses.
# Case 1 pins trimming plus whitespace collapsing (including the trailing newline).
# Case 2 pins the output format and that the trailing 'Z' does not appear in it.
assert normalize_text('  Hello   world\n') == 'Hello world'
assert parse_created_at('2018-07-01T12:00:00Z') == '2018-07-01 12:00:00'
print('Field functions OK')

In [None]:
# Section 5: Normalize and Validate Records

def _as_int_flag(value) -> int:
    """Coerce one label cell to ``int``, mapping None/NaN/blank to 0.

    Strings go through ``float`` so CSV round-trip values such as ``'1.0'``
    still parse; a string that is not numeric raises ``ValueError``.
    """
    if isinstance(value, str):
        value = value.strip()
        if not value:
            return 0
        value = float(value)
    # NaN is truthy, so an `x or 0` guard would pass it straight to int().
    if pd.isna(value):
        return 0
    return int(value)


def _as_bool_flag(value) -> bool:
    """Coerce an audit-flag cell to ``bool``, treating None/NaN as False.

    Strings are matched against common true spellings, because ``bool('False')``
    is True for any non-empty string.
    """
    if isinstance(value, str):
        return value.strip().lower() in ('true', 't', 'yes', 'y', '1', '1.0')
    if pd.isna(value):
        return False
    return bool(value)


def convert_record(row, emotion_columns=None):
    """Convert one raw record (a mapping) into the master output format.

    Args:
        row: Mapping with a ``.get`` method holding the raw field values;
            absent keys are treated as missing values.
        emotion_columns: Optional iterable of emotion label column names.
            Defaults to every ``emotion_*`` entry of ``EXPECTED_COLUMNS``.

    Returns:
        dict with keys in output-schema order: ``text``, the emotion columns,
        ``event_type``, ``informativeness``, ``crisis_label``,
        ``source_dataset``, ``created_at``, ``created_at_imputed`` and
        ``created_at_imputed_method``.

    Raises:
        ValueError: if an emotion cell holds a non-numeric string.
    """
    if emotion_columns is None:
        # Derive the emotion columns from the schema itself so the converter
        # and validate_schema can never disagree about which ones exist.
        emotion_columns = [c for c in EXPECTED_COLUMNS if c.startswith('emotion_')]

    out = {'text': normalize_text(row.get('text'))}
    for col in emotion_columns:
        out[col] = _as_int_flag(row.get(col))
    # These fields are carried over unchanged (a missing value stays missing).
    for col in ('event_type', 'informativeness', 'crisis_label', 'source_dataset'):
        out[col] = row.get(col)
    out['created_at'] = parse_created_at(row.get('created_at'))
    out['created_at_imputed'] = _as_bool_flag(row.get('created_at_imputed'))
    out['created_at_imputed_method'] = row.get('created_at_imputed_method')
    return out

# Convert the first three sample rows (by position) and show the first result.
test_out = list(map(lambda pos: convert_record(dict(df_sample.iloc[pos])), range(3)))
print('Converted sample records:', test_out[0], sep='\n')

In [None]:
# Section 6: Batch Processing Pipeline for Master Data

# Input (v3) and output (v4) master files, relative to the notebook's working directory.
MASTER_IN, MASTER_OUT = (
    Path('../master_training_data/master_training_data_v3.csv'),
    Path('../master_training_data/master_training_data_v4.csv'),
)

# Report up front whether the master input is available in this environment.
print('Master input exists:', MASTER_IN.exists())


def process_master_chunk(df_chunk):
    """Convert every row of a chunk and validate the resulting schema.

    Each row is turned into a dict and passed through ``convert_record``;
    the converted records are assembled into a new DataFrame.

    Returns:
        ``(out_df, missing, extra)`` where ``missing`` / ``extra`` come from
        ``validate_schema(out_df)``.
    """
    converted = []
    for _, row in df_chunk.iterrows():
        converted.append(convert_record(dict(row)))
    out_df = pd.DataFrame(converted)
    # Validate schema
    missing, extra = validate_schema(out_df)
    return out_df, missing, extra

# Example run on the first 100 rows only; the full file is deliberately not
# processed here unless you opt in elsewhere.
if not MASTER_IN.exists():
    print('MASTER_IN not present locally; run in environment with the master file')
else:
    sample = pd.read_csv(MASTER_IN, nrows=100)
    out_df, missing, extra = process_master_chunk(sample)
    print('Processed sample rows:', len(out_df), 'missing cols:', missing, 'extra cols:', extra)

In [None]:
# Section 7: Compare Produced Format to Sample (Automated Checks)

def compare_schema(a: pd.DataFrame, b: pd.DataFrame):
    """Return True when both frames have the same set of column labels.

    Column order and duplicate labels are ignored because the comparison
    is set-based.
    """
    # No label may be exclusive to either frame.
    return not set(a.columns).symmetric_difference(b.columns)

# Only meaningful once the sample-loading cell has populated df_sample.
if 'df_sample' not in globals():
    print('No sample loaded')
else:
    print('Sample columns count:', df_sample.shape[1])
    # Convert the first row and list the keys the converter produced.
    print('Transform record equality test (first row):')
    transformed = convert_record(dict(df_sample.iloc[0]))
    print('Transformed keys:', transformed.keys())

In [None]:
# Section 8: Save Formatted Master Training Data (idempotent)

def save_master(df_out, path=MASTER_OUT):
    """Write the formatted master DataFrame to CSV, backing up any existing file.

    Args:
        df_out: DataFrame to write (saved without the index).
        path: Destination file; a ``str`` or ``Path``. Defaults to ``MASTER_OUT``.

    An existing file at ``path`` is first moved to the same name with a
    ``.bak`` suffix, overwriting any backup left by a previous run, so the
    function can be re-run safely.
    """
    path = Path(path)  # accept plain strings as well as Path objects
    if path.exists():
        backup = path.with_suffix('.bak')
        print('Backing up existing master to', backup)
        # replace() overwrites a stale backup on every platform, whereas
        # rename() raises FileExistsError on Windows when the target exists.
        path.replace(backup)
    df_out.to_csv(path, index=False)
    print('Saved formatted master to', path)

# NOTE: Call save_master(...) only when you are ready to write the full dataset
# (it may be large); nothing in this cell invokes it.

# Section 9: CLI/Script Entrypoint (example usage)

# In practice, use scripts/phase4_combine/create_master_training_file.py for the
# full conversion. This notebook only demonstrates how to call the conversion
# logic programmatically.

# Reminder only: this line prints instructions and does not start a conversion.
print('To run full conversion, call the script with WRITE_FULL_MASTER = True and verify outputs.')

In [None]:
# Section 10: Unit Tests and Example Runs

# Inline check: convert the first 10 sample rows and report any schema drift.
if 'df_sample' not in globals():
    print('No sample loaded')
else:
    transformed = pd.DataFrame(
        [convert_record(dict(row)) for _, row in df_sample.head(10).iterrows()]
    )
    missing, extra = validate_schema(transformed)
    print('Transformed rows:', len(transformed), 'missing cols:', missing, 'extra cols:', extra)

In [None]:
# Section 11: Logging, Error Handling, and Reporting

import json

def _json_fallback(value):
    """Convert values ``json.dumps`` cannot encode natively.

    numpy scalars become their Python equivalents, objects exposing
    ``isoformat`` (dates, timestamps) become ISO-8601 strings, and anything
    else falls back to ``str()``.
    """
    if isinstance(value, np.generic):
        return value.item()
    if hasattr(value, 'isoformat'):
        return value.isoformat()
    return str(value)


def generate_report(valid_count, error_examples):
    """Print a JSON summary of a conversion run.

    Args:
        valid_count: Number of valid records; coerced with ``int()``.
        error_examples: JSON-compatible structure (for example a list of
            dicts) describing sample failures. Values taken from DataFrame
            rows, such as numpy scalars or timestamps, are converted rather
            than raising ``TypeError``.

    Returns:
        None. The report is written to stdout as indented JSON.
    """
    report = {
        'valid_count': int(valid_count),
        'error_examples': error_examples,
        'generated_at': pd.Timestamp.now().isoformat()
    }
    print(json.dumps(report, indent=2, default=_json_fallback))

# Status line printed unconditionally when this cell runs; it performs no checks itself.
print('Notebook ready: use cells above to run the conversion interactively and save outputs when ready')

# Imputation audit & master conversion
# Purpose: Validate imputed timestamps, compare distributions (hour/day), spot-check rows, and produce final master-format export

In [None]:
# Imports & config
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Resolve paths against the project root so this section works whether the
# kernel's working directory is the repository root or a direct subfolder
# (the conversion cells earlier in this notebook read from '../master_training_data').
# If neither location has the data folder, keep the current directory.
_PROJECT_ROOT = next(
    (root for root in (Path('.'), Path('..')) if (root / 'master_training_data').is_dir()),
    Path('.'),
)

SAMPLE_IMPUTED = _PROJECT_ROOT / 'master_training_data' / 'master_training_sample_10kv3_imputed.csv'
REPORT_DIR = _PROJECT_ROOT / 'reports' / 'imputation'
# Create the audit output folder up front so later cells can write into it.
REPORT_DIR.mkdir(parents=True, exist_ok=True)


In [None]:
# Load the imputed sample (empty strings read as NA) and print headline counts.
df = pd.read_csv(SAMPLE_IMPUTED, parse_dates=['created_at'], na_values=[''])
print('rows:', len(df))
imputed_mask = df['created_at_imputed']
print('imputed_count:', int(imputed_mask.sum()))

# Write the imputed rows' text, keyed by their original row index, for audit.
imputed_rows = (
    df.loc[imputed_mask, ['text']]
    .reset_index()
    .rename(columns={'index': 'row_index'})
)
imputed_rows_csv = REPORT_DIR / 'imputed_rows_sample.csv'
imputed_rows.to_csv(imputed_rows_csv, index=False)
print('Saved imputed row index CSV to', imputed_rows_csv)

In [None]:
# Hour-of-day comparison for real vs imputed timestamps.
if 'created_at_imputed' not in df.columns:
    raise ValueError('No imputed column found - run imputation first')

# Extract the hour and split it by the imputation flag.
df['hour'] = df['created_at'].dt.hour
real_hours = df.loc[~df['created_at_imputed'], 'hour']
imp_hours = df.loc[df['created_at_imputed'], 'hour']

fig, ax = plt.subplots(figsize=(10, 4))
# Explicit edges 0..24 give exactly one bin per hour even when the data does
# not span the whole day (an integer bin count is spread over min..max instead).
ax.hist([real_hours, imp_hours], bins=list(range(25)), label=['real', 'imputed'], alpha=0.75)
ax.legend()
ax.set_title('Hour-of-day: real vs imputed')
ax.set_xlabel('hour')

# Save BEFORE showing: once plt.show() has displayed the figure, a later
# plt.savefig() targets a fresh, empty figure and writes a blank image.
plot_path = REPORT_DIR / 'hour_of_day_real_vs_imputed.png'
fig.savefig(plot_path, bbox_inches='tight')
print('Saved plot to', plot_path)
plt.show()

## Actions

- If distributions are reasonable, export the final master format and optionally enable `WRITE_FULL_MASTER=True` in `scripts/phase4_combine/create_master_training_file.py` and re-run.
- Save plots and an imputed row list to `reports/imputation/` for audit and reviewer sign-off.