# üìä TEMPO Project Status Dashboard
## Visual Overview of Data Files and Structure

This notebook provides a complete visual summary of:
- What data folders exist
- File sizes and row counts
- Column structures
- What's missing vs. what you have

In [2]:
import pandas as pd
import os
from pathlib import Path
import json

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 50)

## 1. üìÇ Directory Structure Overview

In [3]:
# Expected data folders
expected_folders = [
    'baseline_data',
    'crisis_datasets', 
    'goemotion_data',
    'master_training_data',
    'non_crisis_data',
    'standardized_data'
]

print("=" * 80)
print("DATA FOLDER STATUS")
print("=" * 80)
print()

folder_status = {}
for folder in expected_folders:
    exists = Path(folder).exists()
    status = "‚úÖ EXISTS" if exists else "‚ùå MISSING"
    folder_status[folder] = exists
    
    # Get size if exists
    if exists:
        total_size = sum(f.stat().st_size for f in Path(folder).rglob('*') if f.is_file())
        size_mb = total_size / (1024 * 1024)
        print(f"{status}  {folder:<30} ({size_mb:>8.1f} MB)")
    else:
        print(f"{status}  {folder:<30} (NOT DOWNLOADED)")

print()
print(f"Summary: {sum(folder_status.values())}/{len(expected_folders)} folders present")

DATA FOLDER STATUS

‚úÖ EXISTS  baseline_data                  (   138.9 MB)
‚úÖ EXISTS  crisis_datasets                (    51.6 MB)
‚úÖ EXISTS  goemotion_data                 (     9.4 MB)
‚úÖ EXISTS  master_training_data           (   277.9 MB)
‚úÖ EXISTS  non_crisis_data                (  1938.2 MB)
‚úÖ EXISTS  standardized_data              (   557.2 MB)

Summary: 6/6 folders present


## 2. üìÑ CSV Files Inventory

In [4]:
# Find all CSV files
csv_files = list(Path('.').rglob('*.csv'))

# Filter out git and venv folders
csv_files = [f for f in csv_files if '.git' not in str(f) and 'venv' not in str(f) and '.venv' not in str(f)]

print("=" * 80)
print(f"FOUND {len(csv_files)} CSV FILES")
print("=" * 80)
print()

# Group by folder
from collections import defaultdict
files_by_folder = defaultdict(list)

for f in csv_files:
    folder = str(f.parent)
    size_mb = f.stat().st_size / (1024 * 1024)
    files_by_folder[folder].append({
        'name': f.name,
        'size_mb': size_mb,
        'path': str(f)
    })

# Display by folder
for folder in sorted(files_by_folder.keys()):
    print(f"\nüìÅ {folder}/")
    print("-" * 80)
    for file_info in sorted(files_by_folder[folder], key=lambda x: x['name']):
        print(f"   {file_info['name']:<45} {file_info['size_mb']:>8.2f} MB")

FOUND 115 CSV FILES


üìÅ baseline_data/
--------------------------------------------------------------------------------
   baseline_noise.csv                              138.88 MB

üìÅ crisis_datasets/
--------------------------------------------------------------------------------
   crisislex_all_combined.csv                        5.09 MB
   crisislex_all_complete.csv                        6.85 MB
   humaid_all_with_timestamps.csv                    9.88 MB

üìÅ crisis_datasets/crisislex_complete/
--------------------------------------------------------------------------------
   2012_Colorado_wildfires_complete.csv              0.35 MB
   2012_Costa_Rica_earthquake_complete.csv           0.41 MB
   2012_Guatemala_earthquake_complete.csv            0.31 MB
   2012_Italy_earthquakes_complete.csv               0.29 MB
   2012_Philipinnes_floods_complete.csv              0.30 MB
   2012_Typhoon_Pablo_complete.csv                   0.28 MB
   2012_Venezuela_refinery_complete.csv 

## 3. üìä Dataset Summary Table

In [5]:
# Key datasets to analyze
key_datasets = [
    ('standardized_data/crisis_combined.csv', 'Crisis Combined'),
    ('standardized_data/non_crisis_combined.csv', 'Non-Crisis Combined'),
    ('standardized_data/humaid_standardized.csv', 'HumAID Standardized'),
    ('standardized_data/crisislex_standardized.csv', 'CrisisLex Standardized'),
    ('goemotion_data/goemotions.csv', 'GoEmotions (27 emotions)'),
    ('master_training_data/master_training_data.csv', 'Master Training File'),
]

dataset_summary = []

print("=" * 80)
print("KEY DATASETS ANALYSIS")
print("=" * 80)
print()

for filepath, name in key_datasets:
    if Path(filepath).exists():
        # Quick check using line count for very large files
        try:
            df = pd.read_csv(filepath, nrows=0)  # Just get columns
            columns = list(df.columns)
            
            # Count rows efficiently
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                row_count = sum(1 for line in f) - 1  # Subtract header
            
            size_mb = Path(filepath).stat().st_size / (1024 * 1024)
            
            dataset_summary.append({
                'Dataset': name,
                'Status': '‚úÖ',
                'Rows': f"{row_count:,}",
                'Columns': len(columns),
                'Size (MB)': f"{size_mb:.1f}"
            })
            
            print(f"‚úÖ {name}")
            print(f"   Rows: {row_count:,} | Columns: {len(columns)} | Size: {size_mb:.1f} MB")
            print(f"   Columns: {', '.join(columns[:8])}{'...' if len(columns) > 8 else ''}")
            print()
            
        except Exception as e:
            dataset_summary.append({
                'Dataset': name,
                'Status': '‚ö†Ô∏è ERROR',
                'Rows': str(e)[:50],
                'Columns': '-',
                'Size (MB)': '-'
            })
            print(f"‚ö†Ô∏è  {name}: Error - {str(e)[:50]}")
            print()
    else:
        dataset_summary.append({
            'Dataset': name,
            'Status': '‚ùå',
            'Rows': 'Not found',
            'Columns': '-',
            'Size (MB)': '-'
        })
        print(f"‚ùå {name}: NOT FOUND")
        print()

# Display summary table
summary_df = pd.DataFrame(dataset_summary)
display(summary_df)

KEY DATASETS ANALYSIS

‚úÖ Crisis Combined
   Rows: 66,766 | Columns: 7 | Size: 12.9 MB
   Columns: text, created_at, event_name, event_type, crisis_label, source_dataset, informativeness

‚úÖ Non-Crisis Combined
   Rows: 2,306,740 | Columns: 6 | Size: 265.7 MB
   Columns: text, created_at, event_name, event_type, crisis_label, source_dataset

‚úÖ HumAID Standardized
   Rows: 43,409 | Columns: 7 | Size: 8.3 MB
   Columns: text, created_at, event_name, event_type, crisis_label, source_dataset, informativeness

‚úÖ CrisisLex Standardized
   Rows: 23,357 | Columns: 7 | Size: 4.6 MB
   Columns: text, created_at, event_name, event_type, crisis_label, source_dataset, informativeness

‚úÖ GoEmotions (27 emotions)
   Rows: 54,263 | Columns: 5 | Size: 4.5 MB
   Columns: text, labels, id, Unnamed: 3, [27] = neutral [0] = admiration [1] = amusement [2] = anger [3] = annoyance [4] = approval [5] = caring [6] = confusion [7] = curiosity [8] = desire [9] = disappointment [10] = disapproval [11] = di

Unnamed: 0,Dataset,Status,Rows,Columns,Size (MB)
0,Crisis Combined,‚úÖ,66766,7,12.9
1,Non-Crisis Combined,‚úÖ,2306740,6,265.7
2,HumAID Standardized,‚úÖ,43409,7,8.3
3,CrisisLex Standardized,‚úÖ,23357,7,4.6
4,GoEmotions (27 emotions),‚úÖ,54263,5,4.5
5,Master Training File,‚úÖ,2427769,19,277.8


## 4. üîç Detailed Column Analysis

In [6]:
print("=" * 80)
print("DETAILED COLUMN STRUCTURE")
print("=" * 80)
print()

# Crisis data
if Path('standardized_data/crisis_combined.csv').exists():
    print("üìä CRISIS COMBINED")
    print("-" * 80)
    crisis_df = pd.read_csv('standardized_data/crisis_combined.csv', nrows=3)
    print(f"Columns: {list(crisis_df.columns)}")
    print(f"\nData types:\n{crisis_df.dtypes}")
    print(f"\nFirst row sample:")
    display(crisis_df.head(1).T)
    print()

# Non-crisis data  
if Path('standardized_data/non_crisis_combined.csv').exists():
    print("\nüìä NON-CRISIS COMBINED")
    print("-" * 80)
    non_crisis_df = pd.read_csv('standardized_data/non_crisis_combined.csv', nrows=3)
    print(f"Columns: {list(non_crisis_df.columns)}")
    print(f"\nData types:\n{non_crisis_df.dtypes}")
    print(f"\nFirst row sample:")
    display(non_crisis_df.head(1).T)
    print()

# GoEmotions
if Path('goemotion_data/goemotions.csv').exists():
    print("\nüìä GOEMOTIONS (27 EMOTIONS)")
    print("-" * 80)
    goemo_df = pd.read_csv('goemotion_data/goemotions.csv', nrows=3)
    print(f"Columns: {list(goemo_df.columns)}")
    print(f"\nData types:\n{goemo_df.dtypes}")
    print(f"\nFirst row sample:")
    display(goemo_df.head(1).T)
else:
    print("\n‚ùå GoEmotions NOT FOUND - Need to download from Google Drive!")

DETAILED COLUMN STRUCTURE

üìä CRISIS COMBINED
--------------------------------------------------------------------------------
Columns: ['text', 'created_at', 'event_name', 'event_type', 'crisis_label', 'source_dataset', 'informativeness']

Data types:
text                   str
created_at             str
event_name             str
event_type             str
crisis_label         int64
source_dataset         str
informativeness    float64
dtype: object

First row sample:


Unnamed: 0,0
text,.@GreenABEnergy How can @AirworksCanada assist...
created_at,2016-05-19 18:16:11.727000+00:00
event_name,canada_wildfires_2016_dev
event_type,wildfire
crisis_label,1
source_dataset,humaid
informativeness,




üìä NON-CRISIS COMBINED
--------------------------------------------------------------------------------
Columns: ['text', 'created_at', 'event_name', 'event_type', 'crisis_label', 'source_dataset']

Data types:
text                str
created_at          str
event_name          str
event_type          str
crisis_label      int64
source_dataset      str
dtype: object

First row sample:


Unnamed: 0,0
text,#Coachella2015 tickets selling out in less tha...
created_at,2015-01-07 15:02:00
event_name,coachella_2015
event_type,entertainment
crisis_label,0
source_dataset,coachella




üìä GOEMOTIONS (27 EMOTIONS)
--------------------------------------------------------------------------------
Columns: ['text', 'labels', 'id', 'Unnamed: 3', '[27] = neutral [0] = admiration [1] = amusement [2] = anger [3] = annoyance [4] = approval [5] = caring [6] = confusion [7] = curiosity [8] = desire [9] = disappointment [10] = disapproval [11] = disgust [12] = embarrassment [13] = excitement [14] = fear [15] = gratitude [16] = grief [17] = joy [18] = love [19] = nervousness [20] = optimism [21] = pride [22] = realization [23] = relief [24] = remorse [25] = sadness [26] = surprise [27] = neutral']

Data types:
text                                                                                                                                                                                                                                                                                                                                                                                 

Unnamed: 0,0
text,My favourite food is anything I didn't have to...
labels,[27]
id,eebbqej
Unnamed: 3,
[27] = neutral [0] = admiration [1] = amusement [2] = anger [3] = annoyance [4] = approval [5] = caring [6] = confusion [7] = curiosity [8] = desire [9] = disappointment [10] = disapproval [11] = disgust [12] = embarrassment [13] = excitement [14] = fear [15] = gratitude [16] = grief [17] = joy [18] = love [19] = nervousness [20] = optimism [21] = pride [22] = realization [23] = relief [24] = remorse [25] = sadness [26] = surprise [27] = neutral,


## 5. ‚ö†Ô∏è Issues & Missing Data

In [7]:
print("=" * 80)
print("ISSUES & MISSING COMPONENTS")
print("=" * 80)
print()

issues = []

# Check for GoEmotions
if not Path('goemotion_data/goemotions.csv').exists():
    issues.append("‚ùå GoEmotions data missing - CRITICAL for emotion mapping")

# Check for emotion columns in crisis/non-crisis data
if Path('standardized_data/crisis_combined.csv').exists():
    crisis_df = pd.read_csv('standardized_data/crisis_combined.csv', nrows=1)
    emotion_cols = [col for col in crisis_df.columns if 'emotion' in col.lower()]
    if not emotion_cols:
        issues.append("‚ö†Ô∏è  No emotion columns in crisis_combined.csv - needs to be added")

if Path('standardized_data/non_crisis_combined.csv').exists():
    non_crisis_df = pd.read_csv('standardized_data/non_crisis_combined.csv', nrows=1)
    emotion_cols = [col for col in non_crisis_df.columns if 'emotion' in col.lower()]
    if not emotion_cols:
        issues.append("‚ö†Ô∏è  No emotion columns in non_crisis_combined.csv - needs to be added")

# Check for master training data
if not Path('master_training_data/master_training_data.csv').exists():
    issues.append("‚ö†Ô∏è  Master training file missing - may need to be regenerated")

if issues:
    for issue in issues:
        print(issue)
else:
    print("‚úÖ No major issues detected!")

print()
print("=" * 80)
print("RECOMMENDED ACTIONS")
print("=" * 80)
print("""
1. Download goemotion_data/ from Google Drive (if missing)
2. Define 13 target emotions for crisis detection
3. Create 27‚Üí13 emotion mapping using LLM
4. Add emotion_label column to standardized datasets
5. Refactor code to industry standards
6. Convert Python scripts to notebooks
""")

ISSUES & MISSING COMPONENTS

‚ö†Ô∏è  No emotion columns in crisis_combined.csv - needs to be added
‚ö†Ô∏è  No emotion columns in non_crisis_combined.csv - needs to be added

RECOMMENDED ACTIONS

1. Download goemotion_data/ from Google Drive (if missing)
2. Define 13 target emotions for crisis detection
3. Create 27‚Üí13 emotion mapping using LLM
4. Add emotion_label column to standardized datasets
5. Refactor code to industry standards
6. Convert Python scripts to notebooks



## 6. üìà Quick Statistics

In [8]:
print("=" * 80)
print("QUICK STATISTICS")
print("=" * 80)
print()

stats = {}

# Crisis data stats
if Path('standardized_data/crisis_combined.csv').exists():
    crisis_df = pd.read_csv('standardized_data/crisis_combined.csv')
    print("üìä Crisis Data:")
    print(f"   Total tweets: {len(crisis_df):,}")
    if 'event_type' in crisis_df.columns:
        print(f"   Unique event types: {crisis_df['event_type'].nunique()}")
        print(f"   Top 5 event types:")
        for event, count in crisis_df['event_type'].value_counts().head(5).items():
            print(f"      {event}: {count:,}")
    print()

# Non-crisis data stats
if Path('standardized_data/non_crisis_combined.csv').exists():
    non_crisis_df = pd.read_csv('standardized_data/non_crisis_combined.csv')
    print("üìä Non-Crisis Data:")
    print(f"   Total tweets: {len(non_crisis_df):,}")
    if 'event_name' in non_crisis_df.columns:
        print(f"   Unique events: {non_crisis_df['event_name'].nunique()}")
        print(f"   Top 5 events:")
        for event, count in non_crisis_df['event_name'].value_counts().head(5).items():
            print(f"      {event}: {count:,}")
    print()

# Combined totals
if 'crisis_df' in dir() and 'non_crisis_df' in dir():
    total_tweets = len(crisis_df) + len(non_crisis_df)
    print(f"üìä TOTAL DATASET: {total_tweets:,} tweets")
    print(f"   Crisis: {len(crisis_df):,} ({len(crisis_df)/total_tweets*100:.1f}%)")
    print(f"   Non-crisis: {len(non_crisis_df):,} ({len(non_crisis_df)/total_tweets*100:.1f}%)")

QUICK STATISTICS

üìä Crisis Data:
   Total tweets: 66,748
   Unique event types: 7
   Top 5 event types:
      hurricane: 33,422
      earthquake: 11,429
      flood: 7,498
      wildfire: 7,151
      accident: 4,248

üìä Non-Crisis Data:
   Total tweets: 1,533,696
   Unique events: 7
   Top 5 events:
      got_season8_2019: 760,614
      fifa_worldcup_2018: 458,533
      tokyo_olympics_2020: 159,432
      us_election_2020: 99,948
      fifa_worldcup_2022: 49,493

üìä TOTAL DATASET: 1,600,444 tweets
   Crisis: 66,748 (4.2%)
   Non-crisis: 1,533,696 (95.8%)
