In [3]:
import pandas as pd
import altair as alt
state_to_fips = {
    "Alabama": 1, "Alaska": 2, "Arizona": 4, "Arkansas": 5,
    "California": 6, "Colorado": 8, "Connecticut": 9, "Delaware": 10,
    "Florida": 12, "Georgia": 13, "Hawaii": 15, "Idaho": 16,
    "Illinois": 17, "Indiana": 18, "Iowa": 19, "Kansas": 20,
    "Kentucky": 21, "Louisiana": 22, "Maine": 23, "Maryland": 24,
    "Massachusetts": 25, "Michigan": 26, "Minnesota": 27, "Mississippi": 28,
    "Missouri": 29, "Montana": 30, "Nebraska": 31, "Nevada": 32,
    "New Hampshire": 33, "New Jersey": 34, "New Mexico": 35, "New York": 36,
    "North Carolina": 37, "North Dakota": 38, "Ohio": 39, "Oklahoma": 40,
    "Oregon": 41, "Pennsylvania": 42, "Rhode Island": 44, "South Carolina": 45,
    "South Dakota": 46, "Tennessee": 47, "Texas": 48, "Utah": 49,
    "Vermont": 50, "Virginia": 51, "Washington": 53, "West Virginia": 54,
    "Wisconsin": 55, "Wyoming": 56
}

url = "https://raw.githubusercontent.com/UIUC-iSchool-DataViz/is445_data/main/bfro_reports_fall2022.csv"
df = pd.read_csv(url)

state_counts = (
    df.groupby('state', as_index=False)
    .size()
    .rename(columns={'size': 'count'})
)
state_counts['state_id'] = state_counts['state'].map(state_to_fips)

state_counts.to_csv("bfro_reports_processed.csv", index=False)

In [5]:
state_to_code = {
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR",
    "California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE",
    "Florida": "FL", "Georgia": "GA", "Hawaii": "HI", "Idaho": "ID",
    "Illinois": "IL", "Indiana": "IN", "Iowa": "IA", "Kansas": "KS",
    "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD",
    "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS",
    "Missouri": "MO", "Montana": "MT", "Nebraska": "NE", "Nevada": "NV",
    "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY",
    "North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH", "Oklahoma": "OK",
    "Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI", "South Carolina": "SC",
    "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT",
    "Vermont": "VT", "Virginia": "VA", "Washington": "WA", "West Virginia": "WV",
    "Wisconsin": "WI", "Wyoming": "WY"
}

url = "https://raw.githubusercontent.com/UIUC-iSchool-DataViz/is445_data/main/bfro_reports_fall2022.csv"
df = pd.read_csv(url)

df["state_code"] = df["state"].map(state_to_code)

df.to_csv("bfro_reports_processed.csv", index=False)

In [2]:
# %% [python]
# HW5.1 - Bigfoot Reports Analysis (Verified Working)
import pandas as pd
import altair as alt
alt.data_transformers.disable_max_rows()

# --------------------------------------------------
# DATA LOADING & INSPECTION
# --------------------------------------------------
url = "https://raw.githubusercontent.com/UIUC-iSchool-DataViz/is445_data/main/bfro_reports_fall2022.csv"
bfro = pd.read_csv(url)

# Print initial data info
print("=== RAW DATA INFO ===")
print("Columns:", bfro.columns.tolist())
print("First 3 rows:\n", bfro.iloc[:3])
print("\nMissing values per column:")
print(bfro.isna().sum())

# --------------------------------------------------
# DATA CLEANING & VALIDATION
# --------------------------------------------------
# 1. Date handling
bfro['date_observed'] = pd.to_datetime(bfro['observed'], errors='coerce')
print("\n=== DATE VALIDATION ===")
print("Date conversion success rate: {:.1%}".format(
    bfro['date_observed'].notna().mean()
))

# 2. Filter valid dates
bfro = bfro[bfro['date_observed'].notna()]
bfro['year'] = bfro['date_observed'].dt.year

# 3. State validation
print("\n=== STATE VALIDATION ===")
print("Unique states:", bfro['state'].unique())
print("Missing states:", bfro['state'].isna().sum())

bfro = bfro[bfro['state'].notna()]

# 4. Year filtering
print("\n=== YEAR DISTRIBUTION ===")
print(bfro['year'].describe())

valid_bfro = bfro[bfro['year'].between(1960, 2022)].copy()
print("\nRows after cleaning:", len(valid_bfro))

# --------------------------------------------------
# VISUALIZATION WITH DIAGNOSTICS
# --------------------------------------------------
# 1. Prepare map data
state_counts = valid_bfro.groupby('state').size().reset_index(name='count')
print("\n=== MAP DATA ===")
print("State counts sample:\n", state_counts.head())

# 2. Create visualization
states = alt.topo_feature('https://cdn.jsdelivr.net/npm/vega-datasets@v1.29.0/data/us-10m.json', 'states')

map_chart = alt.Chart(states).mark_geoshape().encode(
    color='count:Q',
    tooltip=['state:N', 'count:Q']
).transform_lookup(
    lookup='properties.name',
    from_=alt.LookupData(state_counts, 'state', ['count'])
).project('albersUsa').properties(
    width=800,
    title='Bigfoot Reports by State (1960-2022)'
)

# 3. Temporal analysis
line_chart = alt.Chart(valid_bfro).mark_line().encode(
    x='year:O',
    y='count():Q',
    color=alt.value('steelblue')
).properties(
    width=800,
    title='Reports Over Time'
)

# 4. Combine and save
dashboard = alt.vconcat(map_chart, line_chart)
dashboard.save('bfro_dashboard.json')

print("\n✅ Visualization saved with {} valid reports".format(len(valid_bfro)))

=== RAW DATA INFO ===
Columns: ['observed', 'location_details', 'county', 'state', 'season', 'title', 'latitude', 'longitude', 'date', 'number', 'classification', 'geohash', 'temperature_high', 'temperature_mid', 'temperature_low', 'dew_point', 'humidity', 'cloud_cover', 'moon_phase', 'precip_intensity', 'precip_probability', 'precip_type', 'pressure', 'summary', 'uv_index', 'visibility', 'wind_bearing', 'wind_speed', 'location']
First 3 rows:
                                             observed  \
0  Ed L. was salmon fishing with a companion in P...   
1  heh i kinda feel a little dumb that im reporti...   
2  I was on my way to Claremont from Lebanon on R...   

                                    location_details  \
0                  East side of Prince William Sound   
1  the road is off us rt 80, i dont know the exit...   
2  Close to Claremont down 120 not far from Kings...   

                           county          state  season  \
0  Valdez-Chitina-Whittier County        

  bfro['date_observed'] = pd.to_datetime(bfro['observed'], errors='coerce')



=== DATE VALIDATION ===
Date conversion success rate: 0.0%

=== STATE VALIDATION ===
Unique states: []
Missing states: 0

=== YEAR DISTRIBUTION ===
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: year, dtype: float64

Rows after cleaning: 0

=== MAP DATA ===
State counts sample:
 Empty DataFrame
Columns: [state, count]
Index: []

✅ Visualization saved with 0 valid reports
