In [10]:
# Parameters for reproducible sample generation.
# Set WRITE_FULL_MASTER=True to write the full master file (v4). Default is False to avoid large writes.
SAMPLE_SIZE_10K = 10000
WRITE_FULL_MASTER = False
SAMPLE_FILE_10KV3 = "master_training_data/master_training_sample_10kv3.csv"
# Date-imputed variant of the 10k sample. The script-verification cell reads this
# name, so it must be defined here or that cell fails with a NameError.
# NOTE(review): the path is assumed (sample path + "_imputed") - confirm it against
# scripts/phase4_combine/create_master_training_file.py.
SAMPLE_FILE_10KV3_IMPUTED = "master_training_data/master_training_sample_10kv3_imputed.csv"
MASTER_FILE_V4 = "master_training_data/master_training_data_v4.csv"

print(f"Sample size: {SAMPLE_SIZE_10K}")
print(f"WRITE_FULL_MASTER: {WRITE_FULL_MASTER}")

Sample size: 10000
WRITE_FULL_MASTER: False


# Phase 4: Create Master Training Dataset
## Combine GoEmotions + Crisis + Non-Crisis Data

This notebook creates a **reduced, balanced dataset for training multi-task BERT**.

### What this dataset is for:
- **Train BERT** on emotion classification (using GoEmotions labels)
- **Train BERT** on crisis detection (using crisis_label)
- Smaller dataset (~217K rows) for efficient training

### What happens after BERT is trained:
1. Apply trained BERT to **ORIGINAL FULL datasets** (1.5M+ non-crisis, 67K crisis)
2. Extract emotion features for ALL tweets
3. Use these features to create episodes & hourly aggregations for RL agent

### Data Sources:
- **GoEmotions**: 54K Reddit comments with labeled emotions (for BERT training)
- **Crisis**: 67K crisis tweets (all kept)
- **Non-Crisis**: ~96K sampled non-crisis tweets (reduced from 1.5M for balanced training)

### Sampling Strategy:
- Non-crisis data is randomly sampled with sports emphasis
- Dataset is shuffled so rows are randomized (not grouped by source)

In [11]:
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime

# Notebook display preferences: never elide columns, cap cell text at 100 characters.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 100

In [12]:
# Run the master creation script to produce the 10k sample (non-destructive),
# then report which of the expected output files exist and sanity-check the sample.
# NOTE(review): the script runs in a separate process and is given no arguments,
# so it cannot see the notebook's WRITE_FULL_MASTER variable - confirm the script
# carries its own matching setting.
import os
import subprocess
import sys

import pandas as pd

MASTER_SCRIPT = 'scripts/phase4_combine/create_master_training_file.py'

# The imputed-sample path may not have been defined by the parameters cell.
# Derive it from the plain sample path so this cell does not fail with a NameError.
# NOTE(review): the "_imputed" suffix is an assumed naming convention - confirm it
# against the script's actual output.
if 'SAMPLE_FILE_10KV3_IMPUTED' not in globals():
    SAMPLE_FILE_10KV3_IMPUTED = SAMPLE_FILE_10KV3.replace('.csv', '_imputed.csv')

print('Running create_master_training_file.py')
ret = subprocess.run([sys.executable, MASTER_SCRIPT], check=False)
print('Return code:', ret.returncode)
if ret.returncode != 0:
    # check=False keeps the notebook running, but a failure must not go unnoticed.
    print('WARNING: script exited with a non-zero return code; files below may be stale or missing')

# Confirm files
print('\nFiles produced (if any):')
for p in [SAMPLE_FILE_10KV3, SAMPLE_FILE_10KV3_IMPUTED, MASTER_FILE_V4]:
    print(p, '->', 'exists' if os.path.exists(p) else 'MISSING')

# Quick verification: prefer the imputed sample, fall back to the plain sample.
if os.path.exists(SAMPLE_FILE_10KV3_IMPUTED):
    imputed_path = SAMPLE_FILE_10KV3_IMPUTED
elif os.path.exists(SAMPLE_FILE_10KV3):
    imputed_path = SAMPLE_FILE_10KV3
else:
    imputed_path = None

if imputed_path is not None:
    df = pd.read_csv(imputed_path)
    print('\nLoaded sample:', imputed_path)
    print('Rows:', len(df))
    print('created_at NA:', pd.to_datetime(df['created_at'], errors='coerce').isna().sum())
    if 'created_at_imputed' in df.columns:
        print('created_at_imputed True:', int(df['created_at_imputed'].sum()))
        # The method column is reported only when present; the flag column alone
        # does not guarantee it exists.
        if 'created_at_imputed_method' in df.columns:
            print('imputation methods top:', df['created_at_imputed_method'].value_counts().head().to_string())
    else:
        print('No imputation flags present in sample')

Running create_master_training_file.py
CREATING MASTER TRAINING FILE FOR MULTI-TASK BERT

STEP 1: PROCESSING INDIVIDUAL DATASETS

PROCESSING GOEMOTIONS
Loading: ./goemotion_data/goemotions.csv
Loaded: 54,263 rows
Processing emotion labels...

Emotion Distribution (13 emotions):
   emotion_fear             : 764 tweets
   emotion_sadness          : 1,625 tweets
   emotion_anger            : 1,960 tweets
   emotion_nervousness      : 208 tweets
   emotion_disgust          : 1,013 tweets
   emotion_surprise         : 1,330 tweets
   emotion_confusion        : 1,673 tweets
   emotion_caring           : 1,375 tweets
   emotion_grief            : 96 tweets
   emotion_disappointment   : 1,583 tweets
   emotion_joy              : 1,785 tweets
   emotion_relief           : 182 tweets
   emotion_neutral          : 17,772 tweets

GoEmotions Processed: 54,263 rows

PROCESSING CRISIS DATASETS
Loading: ./standardized_data/crisis_combined_dates_only.csv
Loaded: 66,748 rows

Crisis Data Processed: 66,

## 1. Load All Datasets

In [13]:
def _load_and_report(heading: str, csv_path: str) -> pd.DataFrame:
    """Print `heading`, read `csv_path` with pandas, and report its row count and columns."""
    print(heading)
    frame = pd.read_csv(csv_path)
    print(f"   ✓ Loaded {len(frame):,} rows")
    print(f"   Columns: {frame.columns.tolist()}")
    return frame


print("Loading datasets...\n")

# Load GoEmotions with 13 emotions
df_goemotions = _load_and_report(
    "1. GoEmotions (with 13 emotions)...",
    'goemotion_data/goemotions_with_13_emotions.csv',
)

# Load crisis data with emotion columns
df_crisis = _load_and_report(
    "\n2. Crisis data (with emotion columns)...",
    'standardized_data/crisis_combined_with_emotions.csv',
)

# Load non-crisis data with emotion columns
df_non_crisis = _load_and_report(
    "\n3. Non-crisis data (with emotion columns)...",
    'standardized_data/non_crisis_combined_with_emotions.csv',
)

print(f"\n{'='*80}")
print(f"Total rows to combine: {len(df_goemotions) + len(df_crisis) + len(df_non_crisis):,}")
print(f"{'='*80}")

Loading datasets...

1. GoEmotions (with 13 emotions)...
   ✓ Loaded 54,263 rows
   Columns: ['text', 'emotion_label', 'emotion_name', 'id', 'labels']

2. Crisis data (with emotion columns)...
   ✓ Loaded 66,748 rows
   Columns: ['text', 'created_at', 'event_name', 'event_type', 'crisis_label', 'source_dataset', 'informativeness', 'created_at_imputed', 'created_at_imputed_method', 'emotion_label', 'emotion_name']

3. Non-crisis data (with emotion columns)...
   ✓ Loaded 1,533,696 rows
   Columns: ['text', 'created_at', 'event_name', 'event_type', 'crisis_label', 'source_dataset', 'emotion_label', 'emotion_name']

Total rows to combine: 1,654,707


## 1.5 Sample Non-Crisis Data (Reduce Dataset Size)

Downsample non-crisis data from 1.5M to ~100K rows:
- Keep all GoEmotions (54K) - needed for training
- Keep all Crisis (67K) - core data for crisis detection
- Sample non-crisis to ~100K with sports emphasis

**Sampling Distribution:**
| Source | Type | Sample Size |
|--------|------|-------------|
| worldcup_2018 | Sports | 20,000 |
| tokyo_olympics | Sports | 20,000 |
| fifa_worldcup | Sports | 20,000 |
| game_of_thrones | Entertainment | 20,000 |
| us_election | Politics | 10,000 |
| coachella | Entertainment | All (~3,846) |
| music_concerts | Entertainment | All (~1,830) |

**Rationale:** Sports-heavy distribution helps model learn to distinguish sports excitement from crisis fear.

In [14]:
print("Sampling non-crisis data...\n")

# Rows to keep per `source_dataset` value; None keeps every row of that source.
SAMPLE_CONFIG = {
    'worldcup_2018': 20000,      # Sports
    'tokyo_olympics': 20000,     # Sports
    'fifa_worldcup': 20000,      # Sports
    'game_of_thrones': 20000,    # Entertainment
    'us_election': 10000,        # Politics
    'coachella': None,           # Keep all (small dataset)
    'music_concerts': None,      # Keep all (small dataset)
}

source_counts = df_non_crisis['source_dataset'].value_counts()
rows_before = len(df_non_crisis)

print("Original non-crisis distribution:")
print(source_counts)
print(f"\nTotal before sampling: {rows_before:,}")

# Only sources named in SAMPLE_CONFIG survive the loop below. Report any others
# instead of letting them vanish from the training data unnoticed.
unlisted_sources = sorted(set(source_counts.index) - set(SAMPLE_CONFIG), key=str)
if unlisted_sources:
    print(f"  WARNING: sources missing from SAMPLE_CONFIG will be dropped: {unlisted_sources}")

# Sample each source
sampled_dfs = []
for source, sample_size in SAMPLE_CONFIG.items():
    source_df = df_non_crisis[df_non_crisis['source_dataset'] == source]

    if source_df.empty:
        # A configured source with no rows usually means a renamed or missing dataset.
        print(f"  WARNING: no rows found for configured source '{source}'")

    if sample_size is None or len(source_df) <= sample_size:
        # Keep all rows for small datasets
        sampled_dfs.append(source_df)
        print(f"  {source}: {len(source_df):,} (kept all)")
    else:
        # Random sample for large datasets (fixed seed keeps the sample reproducible)
        sampled = source_df.sample(n=sample_size, random_state=42)
        sampled_dfs.append(sampled)
        print(f"  {source}: {sample_size:,} (sampled from {len(source_df):,})")

# Combine sampled data
df_non_crisis_sampled = pd.concat(sampled_dfs, ignore_index=True)
rows_after = len(df_non_crisis_sampled)
# Guard the percentage so an empty input frame does not raise ZeroDivisionError.
reduction_pct = (1 - rows_after / rows_before) * 100 if rows_before else 0.0

print(f"\n{'='*60}")
print("Non-crisis sampling complete!")
print(f"  Before: {rows_before:,} rows")
print(f"  After:  {rows_after:,} rows")
print(f"  Reduction: {reduction_pct:.1f}%")
print(f"{'='*60}")

# Replace original with sampled version.
# NOTE: the full frame is no longer reachable after this line, so re-running this
# cell alone samples from the already-sampled data; re-run the load cell first.
df_non_crisis = df_non_crisis_sampled

print("\nNew non-crisis distribution:")
print(df_non_crisis['source_dataset'].value_counts())

Sampling non-crisis data...

Original non-crisis distribution:
source_dataset
game_of_thrones    760614
worldcup_2018      458533
tokyo_olympics     159432
us_election         99948
fifa_worldcup       49493
coachella            3846
music_concerts       1830
Name: count, dtype: int64

Total before sampling: 1,533,696
  worldcup_2018: 20,000 (sampled from 458,533)
  tokyo_olympics: 20,000 (sampled from 159,432)
  fifa_worldcup: 20,000 (sampled from 49,493)
  game_of_thrones: 20,000 (sampled from 760,614)
  us_election: 10,000 (sampled from 99,948)
  coachella: 3,846 (kept all)
  music_concerts: 1,830 (kept all)

Non-crisis sampling complete!
  Before: 1,533,696 rows
  After:  95,676 rows
  Reduction: 93.8%

New non-crisis distribution:
source_dataset
worldcup_2018      20000
tokyo_olympics     20000
fifa_worldcup      20000
game_of_thrones    20000
us_election        10000
coachella           3846
music_concerts      1830
Name: count, dtype: int64


## 2. Check Current Schemas

In [15]:
# Report the dtype of every column in each loaded frame before standardizing them.
print("Current column schemas:\n")

schema_sections = (
    ("GoEmotions columns:", df_goemotions),
    ("\nCrisis columns:", df_crisis),
    ("\nNon-crisis columns:", df_non_crisis),
)
for section_heading, frame in schema_sections:
    print(section_heading)
    for column_name in frame.columns:
        print(f"  - {column_name}: {frame[column_name].dtype}")

Current column schemas:

GoEmotions columns:
  - text: str
  - emotion_label: int64
  - emotion_name: str
  - id: str
  - labels: str

Crisis columns:
  - text: str
  - created_at: str
  - event_name: str
  - event_type: str
  - crisis_label: int64
  - source_dataset: str
  - informativeness: str
  - created_at_imputed: bool
  - created_at_imputed_method: float64
  - emotion_label: float64
  - emotion_name: float64

Non-crisis columns:
  - text: str
  - created_at: str
  - event_name: str
  - event_type: str
  - crisis_label: int64
  - source_dataset: str
  - emotion_label: float64
  - emotion_name: float64


## 3. Define Master Schema

Create unified column structure for all datasets:
- **text**: Tweet/comment text
- **emotion_label**: Numeric emotion (1-13, NULL for unlabeled)
- **emotion_name**: Text emotion name (NULL for unlabeled)
- **source_dataset**: Origin of data, stored as the source's identifier (e.g. `GoEmotions`, `humaid`, `crisislex`, `worldcup_2018`)
- **crisis_label**: Binary (1=crisis, 0=non-crisis, NULL for GoEmotions)
- **event_type**: General category (hurricane, sports, etc.; empty string for GoEmotions)
- **event_name**: Specific event (hurricane_harvey_2017, etc.; empty string for GoEmotions)
- **created_at**: Timestamp (GoEmotions has no native timestamps; the column starts as NaT and is passed through date imputation in step 4)
- **informativeness**: CrisisLex informativeness label (empty or NULL for other sources)

In [16]:
# Define master column set
MASTER_COLUMNS = [
    'text',
    'emotion_label',
    'emotion_name',
    'source_dataset',
    'crisis_label',
    'event_type',
    'event_name',
    'created_at',
    'informativeness'
]

print("Master schema columns:")
for i, col in enumerate(MASTER_COLUMNS, 1):
    print(f"  {i}. {col}")

Master schema columns:
  1. text
  2. emotion_label
  3. emotion_name
  4. source_dataset
  5. crisis_label
  6. event_type
  7. event_name
  8. created_at
  9. informativeness


## 4. Standardize GoEmotions Data

Add the master-schema columns that GoEmotions lacks, then run date imputation on `created_at`, which GoEmotions does not provide.

In [None]:
import sys

# Make the working directory importable for the local `utils` package without
# stacking a duplicate entry every time this cell is re-run.
if '.' not in sys.path:
    sys.path.insert(0, '.')
from utils.impute_missing_dates import impute_missing_dates

print("Standardizing GoEmotions data...\n")

# Keep the three labelled columns, then add the master-schema columns GoEmotions
# lacks. NOTE(review): event_type, event_name and informativeness are filled with
# empty strings here, while the schema description calls them NULL - confirm
# which representation downstream code expects.
df_goemotions_std = (
    df_goemotions[['text', 'emotion_label', 'emotion_name']]
    .copy()
    .assign(
        source_dataset='GoEmotions',
        crisis_label=np.nan,   # GoEmotions carries no crisis label
        event_type='',
        event_name='',
        created_at=pd.NaT,     # proper datetime null, filled by the imputation below
        informativeness='',
    )
)

# Apply date imputation using sample_pool method.
# NOTE(review): if impute_missing_dates adds columns such as created_at_imputed,
# this frame will no longer match MASTER_COLUMNS in the schema-validation step -
# confirm what the helper returns.
print("Imputing missing dates for GoEmotions using sample_pool method...")
df_goemotions_std = impute_missing_dates(
    df_goemotions_std,
    method='sample_pool',
    reference_col='source_dataset',
    jitter_hours=6
)

print(f"\n✓ GoEmotions standardized: {len(df_goemotions_std):,} rows")
print(f"  Columns: {df_goemotions_std.columns.tolist()}")
if 'created_at_imputed' in df_goemotions_std.columns:
    print(f"  Dates imputed: {df_goemotions_std['created_at_imputed'].sum():,}")
    # Report the method only when that column exists; the flag column alone does not guarantee it.
    if 'created_at_imputed_method' in df_goemotions_std.columns:
        print(f"  Imputation method: {df_goemotions_std['created_at_imputed_method'].value_counts().to_dict()}")
print("\nSample:")
display(df_goemotions_std.head(3))

Standardizing GoEmotions data...

✓ GoEmotions standardized: 54,263 rows
  Columns: ['text', 'emotion_label', 'emotion_name', 'source_dataset', 'crisis_label', 'event_type', 'event_name', 'created_at', 'informativeness']

Sample:


Unnamed: 0,text,emotion_label,emotion_name,source_dataset,crisis_label,event_type,event_name,created_at,informativeness
0,My favourite food is anything I didn't have to cook myself.,13,neutral,GoEmotions,,,,,
1,"Now if he does off himself, everyone will think hes having a laugh screwing with people instead ...",13,neutral,GoEmotions,,,,,
2,WHY THE FUCK IS BAYLESS ISOING,2,anger,GoEmotions,,,,,


## 5. Standardize Crisis Data

Select and reorder crisis columns to match master schema.

In [18]:
print("Standardizing crisis data...\n")

# Select the master-schema columns from the crisis frame, in master order.
# emotion_label and emotion_name are carried over as-is; in the loaded crisis
# data they are float64 columns (no labels), not empty strings.
# Columns outside this list (created_at_imputed, created_at_imputed_method) are
# intentionally not carried into the standardized frame.
df_crisis_std = df_crisis[[
    'text',
    'emotion_label',
    'emotion_name',
    'source_dataset',
    'crisis_label',
    'event_type',
    'event_name',
    'created_at',
    'informativeness',
]].copy()

print(f"✓ Crisis standardized: {len(df_crisis_std):,} rows")
print(f"  Columns: {df_crisis_std.columns.tolist()}")
print("\nSample:")
display(df_crisis_std.head(3))

Standardizing crisis data...

✓ Crisis standardized: 66,748 rows
  Columns: ['text', 'emotion_label', 'emotion_name', 'source_dataset', 'crisis_label', 'event_type', 'event_name', 'created_at', 'informativeness']

Sample:


Unnamed: 0,text,emotion_label,emotion_name,source_dataset,crisis_label,event_type,event_name,created_at,informativeness
0,I feel a little uneasy about the idea of work tomorrow when the aftershocks are still so strong....,,,humaid,1,earthquake,kaikoura_earthquake_2016_train,2016-11-14 07:27:53,
1,"#eqnz Interislander ferry docking aborted after huge 7.5 magnitude quake, sailings on hold",,,humaid,1,earthquake,kaikoura_earthquake_2016_train,2016-11-13 21:27:49,
2,Much of New Zealand felt the earthquake after midnight; waking to discover how much damage,,,humaid,1,earthquake,kaikoura_earthquake_2016_train,2016-11-13 18:25:16,


## 6. Standardize Non-Crisis Data

Select and reorder non-crisis columns to match master schema.

In [19]:
print("Standardizing non-crisis data...\n")

# Select the columns the (sampled) non-crisis frame already has, in master order.
# emotion_label and emotion_name are carried over as-is; in the loaded data they
# are float64 columns (no labels), not empty strings.
df_non_crisis_std = df_non_crisis[[
    'text',
    'emotion_label',
    'emotion_name',
    'source_dataset',
    'crisis_label',
    'event_type',
    'event_name',
    'created_at',
]].copy()

# Non-crisis doesn't have informativeness; add it as an empty-string placeholder
# so the column set matches the master schema.
df_non_crisis_std['informativeness'] = ''

print(f"✓ Non-crisis standardized: {len(df_non_crisis_std):,} rows")
print(f"  Columns: {df_non_crisis_std.columns.tolist()}")
print("\nSample:")
display(df_non_crisis_std.head(3))

Standardizing non-crisis data...

✓ Non-crisis standardized: 95,676 rows
  Columns: ['text', 'emotion_label', 'emotion_name', 'source_dataset', 'crisis_label', 'event_type', 'event_name', 'created_at', 'informativeness']

Sample:


Unnamed: 0,text,emotion_label,emotion_name,source_dataset,crisis_label,event_type,event_name,created_at,informativeness
0,Thibuat Courtois Winner Golden Glove Fifa World Cup,,,worldcup_2018,0,sports,fifa_worldcup_2018,2018-07-15 17:38:07,
1,Paolo Dybala scored more goals for Juventus last season than he played minutes at the Goals For ...,,,worldcup_2018,0,sports,fifa_worldcup_2018,2018-06-30 16:55:14,
2,France have won the FIFA in Moscow,,,worldcup_2018,0,sports,fifa_worldcup_2018,2018-07-15 17:56:43,


## 7. Validate Schema Alignment

Ensure all three datasets have identical column structure before combining.

In [20]:
print("=" * 80)
print("SCHEMA VALIDATION")
print("=" * 80)

# Check column names
goemotions_cols = df_goemotions_std.columns.tolist()
crisis_cols = df_crisis_std.columns.tolist()
non_crisis_cols = df_non_crisis_std.columns.tolist()

print(f"\nGoEmotions columns: {goemotions_cols}")
print(f"Crisis columns:     {crisis_cols}")
print(f"Non-crisis columns: {non_crisis_cols}")


def _describe_difference(left_cols, right_cols):
    """Return the differing column names, or a note when only the order differs."""
    differing = set(left_cols) ^ set(right_cols)
    return differing if differing else "same names, different order"


# Validate all match
if goemotions_cols == crisis_cols == non_crisis_cols:
    print("\n✅ All datasets have matching column structure!")
else:
    print("\n❌ Column mismatch detected!")
    print("\nDifferences:")
    if goemotions_cols != crisis_cols:
        print(f"  GoEmotions vs Crisis: {_describe_difference(goemotions_cols, crisis_cols)}")
    if crisis_cols != non_crisis_cols:
        print(f"  Crisis vs Non-crisis: {_describe_difference(crisis_cols, non_crisis_cols)}")

# Check every dataset against the master schema (not only GoEmotions), and say
# whether a mismatch is a missing/unexpected column or merely a different order.
columns_by_dataset = {
    'GoEmotions': goemotions_cols,
    'Crisis': crisis_cols,
    'Non-crisis': non_crisis_cols,
}
mismatched = {name: cols for name, cols in columns_by_dataset.items() if cols != MASTER_COLUMNS}
if not mismatched:
    print("\n✅ Columns match master schema!")
else:
    for dataset_name, cols in mismatched.items():
        missing = [col for col in MASTER_COLUMNS if col not in cols]
        unexpected = [col for col in cols if col not in MASTER_COLUMNS]
        if missing or unexpected:
            print(f"\n⚠️  {dataset_name} does not match master schema "
                  f"(missing: {missing}, unexpected: {unexpected})")
        else:
            print(f"\n⚠️  {dataset_name} column order differs from master schema")

print("\n" + "=" * 80)

SCHEMA VALIDATION

GoEmotions columns: ['text', 'emotion_label', 'emotion_name', 'source_dataset', 'crisis_label', 'event_type', 'event_name', 'created_at', 'informativeness']
Crisis columns:     ['text', 'emotion_label', 'emotion_name', 'source_dataset', 'crisis_label', 'event_type', 'event_name', 'created_at', 'informativeness']
Non-crisis columns: ['text', 'emotion_label', 'emotion_name', 'source_dataset', 'crisis_label', 'event_type', 'event_name', 'created_at', 'informativeness']

✅ All datasets have matching column structure!

✅ Columns match master schema!



## 8. Combine All Datasets

Concatenate all three standardized datasets into master training file.

In [21]:
print("Combining datasets...\n")

# Concatenate all datasets
df_master = pd.concat([
    df_goemotions_std,
    df_crisis_std,
    df_non_crisis_std
], ignore_index=True)

print(f"Combined master dataset: {len(df_master):,} rows")

# Shuffle the dataset so rows are randomized (not grouped by source).
# The fixed seed keeps the row order reproducible across runs.
print("Shuffling dataset to randomize row order...")
df_master = df_master.sample(frac=1, random_state=42).reset_index(drop=True)

print("\n✅ Combined and shuffled master dataset created!")
print(f"\nTotal rows: {len(df_master):,}")
print("\nBreakdown:")
# max(..., 1) keeps the percentages defined even if every input frame is empty.
share_denominator = max(len(df_master), 1)
for part_label, part_frame in (
    ('GoEmotions', df_goemotions_std),
    ('Crisis', df_crisis_std),
    ('Non-crisis', df_non_crisis_std),
):
    part_share = len(part_frame) / share_denominator * 100
    print(f"  {part_label + ':':<13}{len(part_frame):,} ({part_share:.1f}%)")

print(f"\nColumns: {df_master.columns.tolist()}")
print(f"\nMemory usage: {df_master.memory_usage(deep=True).sum() / (1024**2):.2f} MB")

# Show that rows are now mixed
print("\nFirst 10 rows source distribution (showing shuffle worked):")
print(df_master.head(10)['source_dataset'].tolist())

Combining datasets...

Combined master dataset: 216,687 rows
Shuffling dataset to randomize row order...

✅ Combined and shuffled master dataset created!

Total rows: 216,687

Breakdown:
  GoEmotions:  54,263 (25.0%)
  Crisis:      66,748 (30.8%)
  Non-crisis:  95,676 (44.2%)

Columns: ['text', 'emotion_label', 'emotion_name', 'source_dataset', 'crisis_label', 'event_type', 'event_name', 'created_at', 'informativeness']

Memory usage: 115.94 MB

First 10 rows source distribution (showing shuffle worked):
['tokyo_olympics', 'worldcup_2018', 'humaid', 'humaid', 'tokyo_olympics', 'humaid', 'GoEmotions', 'humaid', 'GoEmotions', 'GoEmotions']


## 9. Data Quality Validation

In [22]:
print("=" * 80)
print("DATA QUALITY VALIDATION")
print("=" * 80)

total_rows = len(df_master)
# max(..., 1) keeps the percentages defined for an empty frame.
share_denominator = max(total_rows, 1)

# Check for nulls in critical columns
print("\nNull counts:")
print(df_master.isnull().sum())

# Check text column
null_text = df_master['text'].isna().sum()
empty_text = (df_master['text'] == '').sum()
print("\nText validation:")
print(f"  Null texts: {null_text}")
print(f"  Empty texts: {empty_text}")
if null_text == 0 and empty_text == 0:
    print("  ✅ All rows have text content")
else:
    # Rows without text are reported loudly, because this cell does not remove
    # them and they would otherwise be saved with the rest of the dataset.
    print(f"  ⚠️  {null_text + empty_text:,} rows have no text content - drop or repair them before training")

# Check emotion labels
labeled_rows = df_master['emotion_label'].notna().sum()
unlabeled_rows = df_master['emotion_label'].isna().sum()
print("\nEmotion label status:")
print(f"  Labeled (GoEmotions):    {labeled_rows:,} ({labeled_rows/share_denominator*100:.1f}%)")
print(f"  Unlabeled (Crisis+Non):  {unlabeled_rows:,} ({unlabeled_rows/share_denominator*100:.1f}%)")

# Check crisis labels
crisis_rows = (df_master['crisis_label'] == 1).sum()
non_crisis_rows = (df_master['crisis_label'] == 0).sum()
unlabeled_crisis = df_master['crisis_label'].isna().sum()
print("\nCrisis label distribution:")
print(f"  Crisis (1):      {crisis_rows:,}")
print(f"  Non-crisis (0):  {non_crisis_rows:,}")
print(f"  Unlabeled (GoE): {unlabeled_crisis:,}")

# Check source distribution
print("\nSource dataset distribution:")
print(df_master['source_dataset'].value_counts())

print("\n" + "=" * 80)

DATA QUALITY VALIDATION

Null counts:
text                   25
emotion_label      162424
emotion_name       162424
source_dataset          0
crisis_label        54263
event_type              0
event_name              0
created_at              0
informativeness     43816
dtype: int64

Text validation:
  Null texts: 25
  Empty texts: 0

Emotion label status:
  Labeled (GoEmotions):    54,263 (25.0%)
  Unlabeled (Crisis+Non):  162,424 (75.0%)

Crisis label distribution:
  Crisis (1):      66,748
  Non-crisis (0):  95,676
  Unlabeled (GoE): 54,263

Source dataset distribution:
source_dataset
GoEmotions         54263
humaid             43409
crisislex          23339
tokyo_olympics     20000
worldcup_2018      20000
fifa_worldcup      20000
game_of_thrones    20000
us_election        10000
coachella           3846
music_concerts      1830
Name: count, dtype: int64



## 10. Show Sample Data from Each Source

In [23]:
# Preview three rows per group: labelled GoEmotions rows, crisis rows, non-crisis rows.
print("Sample rows from each source:\n")

event_preview_columns = ['text', 'emotion_label', 'emotion_name', 'event_name', 'crisis_label']

print("GoEmotions sample (with emotion labels):")
is_goemotions = df_master['source_dataset'] == 'GoEmotions'
display(df_master.loc[is_goemotions, ['text', 'emotion_label', 'emotion_name', 'source_dataset']].head(3))

print("\nCrisis sample (emotion labels = NULL):")
is_crisis = df_master['crisis_label'] == 1
crisis_sample = df_master.loc[is_crisis, event_preview_columns].head(3)
display(crisis_sample)

print("\nNon-crisis sample (emotion labels = NULL):")
is_non_crisis = df_master['crisis_label'] == 0
non_crisis_sample = df_master.loc[is_non_crisis, event_preview_columns].head(3)
display(non_crisis_sample)

Sample rows from each source:

GoEmotions sample (with emotion labels):


Unnamed: 0,text,emotion_label,emotion_name,source_dataset
6,The Xanax thing is burning itself out.Crims don't give a fuck.,13.0,neutral,GoEmotions
8,There's doing stupid things when you're young. Then there's doing horribly stupid things when yo...,2.0,anger,GoEmotions
9,One of my favs was when we were in the playoffs against the Habs and we won all five fights in a...,13.0,neutral,GoEmotions



Crisis sample (emotion labels = NULL):


Unnamed: 0,text,emotion_label,emotion_name,event_name,crisis_label
2,Watters: Trump Bashed by Left After Obama Golfed During LA Floods | Fox News Insider,,,hurricane_harvey_2017_train,1.0
3,#Texas: ExxonMobile tank damaged by Hurricane Harvey leaking dangerous pollutants #HurricaneHar...,,,hurricane_harvey_2017_train,1.0
5,Australia is stepping up its assistance to New Zealand after the deadly earthquake on the countr...,,,kaikoura_earthquake_2016_test,1.0



Non-crisis sample (emotion labels = NULL):


Unnamed: 0,text,emotion_label,emotion_name,event_name,crisis_label
0,You Made The √∞≈∏‚Ä°¬Æ√∞≈∏‚Ä°¬≥ Flag Fly High In Tokyo And We Are Proud Of You! √¢¬ù¬§√Ø¬∏¬è\n #MirabaiChanu!\n\...,,,tokyo_olympics_2020,0.0
1,signs of the World Cup learn to sign goal in British Sign Language,,,fifa_worldcup_2018,0.0
4,BH DtL√∞≈∏‚Äú¬£√∞≈∏‚Äú¬£and FH cross winners.\nGotcha 1S. Vamooooooooooooos\n@keinishikori \n#Tokyo2020 #T...,,,tokyo_olympics_2020,0.0


## 11. Save Master Training Dataset

In [None]:
# Save to master_training_data folder.
# NOTE(review): this write is unconditional; the WRITE_FULL_MASTER flag from the
# parameters cell is not consulted here - confirm that is intended.
output_path = 'master_training_data/master_training_data_v3.csv'

print(f"Saving master training dataset to {output_path}...\n")

# Ensure created_at is in proper datetime format before saving.
# errors='coerce' turns unparseable values into NaT; the counts printed below
# are the only place such losses become visible, so read them.
print("Validating date formats...")
df_master['created_at'] = pd.to_datetime(df_master['created_at'], errors='coerce')

# Report on date quality
dates_valid = df_master['created_at'].notna().sum()
dates_missing = df_master['created_at'].isna().sum()
print(f"  Valid dates: {dates_valid:,}")
print(f"  Missing dates: {dates_missing:,}")

if 'created_at_imputed' in df_master.columns:
    dates_imputed = df_master['created_at_imputed'].sum()
    print(f"  Imputed dates: {dates_imputed:,}")

# Convert to ISO format string for CSV.
# This rewrites df_master['created_at'] in place, so later cells see strings.
df_master['created_at'] = df_master['created_at'].dt.strftime('%Y-%m-%d %H:%M:%S')

# Create the output folder on a fresh checkout; to_csv does not create it.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df_master.to_csv(output_path, index=False)

file_size = Path(output_path).stat().st_size / (1024**2)

print("\n" + "=" * 80)
print("MASTER DATASET SAVED")
print("=" * 80)
print(f"\n✅ Saved to: {output_path}")
print(f"\nFile size: {file_size:.2f} MB")
print(f"Total rows: {len(df_master):,}")
print(f"Total columns: {len(df_master.columns)}")
print(f"\nColumns: {df_master.columns.tolist()}")
print("\n" + "=" * 80)

Saving master training dataset to master_training_data/master_training_data_v3.csv...

MASTER DATASET SAVED

✅ Saved to: master_training_data/master_training_data_v3.csv

File size: 36.08 MB
Total rows: 216,687
Total columns: 9

Columns: ['text', 'emotion_label', 'emotion_name', 'source_dataset', 'crisis_label', 'event_type', 'event_name', 'created_at', 'informativeness']



## 12. Create Smaller Sample File for Testing

In [25]:
# Create 10K sample for quick testing
sample_size = 10000
df_sample = df_master.sample(n=sample_size, random_state=42)

sample_path = 'master_training_data/master_training_sample_10kv2.csv'
df_sample.to_csv(sample_path, index=False)

print(f"‚úÖ Created sample file: {sample_path}")
print(f"   Rows: {len(df_sample):,}")
print(f"   Size: {Path(sample_path).stat().st_size / (1024**2):.2f} MB")

✅ Created sample file: master_training_data/master_training_sample_10kv2.csv
   Rows: 10,000
   Size: 1.67 MB


## 13. Final Summary & Statistics

In [26]:
# Final recap. Relies on values computed by earlier cells: file_size and
# output_path (save step), sample_path (sample step), labeled_rows and
# unlabeled_rows (data-quality step).
print("=" * 80)
print("FINAL SUMMARY")
print("=" * 80)

print("\n📊 Dataset Composition:")
print(f"   Total rows:          {len(df_master):,}")
print(f"   GoEmotions:          {len(df_goemotions_std):,} (with emotion labels)")
print(f"   Crisis events:       {len(df_crisis_std):,} (emotion labels = NULL)")
print(f"   Non-crisis events:   {len(df_non_crisis_std):,} (SAMPLED, emotion labels = NULL)")

# Report the paths that were actually written instead of retyping them, so the
# summary cannot drift from the save cells.
print("\n📁 Files Created:")
print(f"   Main:   {output_path} ({file_size:.2f} MB)")
print(f"   Sample: {sample_path}")

print("\n🏷️ Emotion Labels:")
print(f"   Labeled rows:    {labeled_rows:,} (GoEmotions - for training)")
print(f"   Unlabeled rows:  {unlabeled_rows:,} (Crisis + Non-crisis - for prediction)")

print("\n🔧 Schema:")
print(f"   Columns: {len(df_master.columns)}")
for i, col in enumerate(df_master.columns, 1):
    print(f"      {i}. {col}")

print("\n📋 Next Steps:")
print(f"   1. Train multi-task BERT on this dataset (~{round(len(df_master) / 1000):,}K rows)")
print("      - Task 1: Emotion classification (using GoEmotions labels)")
print("      - Task 2: Crisis detection (using crisis_label)")
print("   2. Apply trained BERT to ORIGINAL FULL datasets:")
print("      - Full crisis data: 67K tweets")
print("      - Full non-crisis data: 1.5M+ tweets")
print("   3. Extract emotion features for ALL tweets")
print("   4. Create episodes & hourly aggregations for RL agent")
print("   5. Train RL agent on temporal emotion patterns")

print("\n" + "=" * 80)
print("✅ PHASE 4 COMPLETE!")
print("=" * 80)

FINAL SUMMARY

📊 Dataset Composition:
   Total rows:          216,687
   GoEmotions:          54,263 (with emotion labels)
   Crisis events:       66,748 (emotion labels = NULL)
   Non-crisis events:   95,676 (SAMPLED, emotion labels = NULL)

📁 Files Created:
   Main:   master_training_data/master_training_data_v3.csv (36.08 MB)
   Sample: master_training_data/master_training_sample_10k.csv

🏷️ Emotion Labels:
   Labeled rows:    54,263 (GoEmotions - for training)
   Unlabeled rows:  162,424 (Crisis + Non-crisis - for prediction)

🔧 Schema:
   Columns: 9
      1. text
      2. emotion_label
      3. emotion_name
      4. source_dataset
      5. crisis_label
      6. event_type
      7. event_name
      8. created_at
      9. informativeness

📋 Next Steps:
   1. Train multi-task BERT on this dataset (~217K rows)
      - Task 1: Emotion classification (using GoEmotions labels)
      - Task 2: Crisis detection (using crisis_label)
   2. Apply trained BERT to ORIGINAL FULL