In [None]:
import pandas as pd
import altair as alt
import numpy as np
from datetime import datetime
import re

In [None]:
# Turn off Altair's max-rows check so the full DataFrame can be passed to charts.
alt.data_transformers.disable_max_rows()

In [None]:
# Location of the BFRO reports CSV that is read straight from GitHub.
url = "https://github.com/UIUC-iSchool-DataViz/is445_data/raw/main/bfro_reports_fall2022.csv"
df = pd.read_csv(url)

# Print the (rows, columns) shape, then show the first rows as the cell output.
print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
def extract_year(date_str):
    """Return the first 19xx/20xx year found in a date-like value.

    Parameters
    ----------
    date_str : any
        Value to scan. It is converted with ``str()`` before matching.

    Returns
    -------
    int or float
        The first standalone four-digit number beginning with 19 or 20,
        as an ``int``. ``np.nan`` is returned when the value is missing
        (per ``pd.isna``) or when no such number is present.
    """
    if not pd.isna(date_str):
        # Word boundaries keep longer digit runs (e.g. "19999") from matching.
        hit = re.search(r'\b(19\d{2}|20\d{2})\b', str(date_str))
        if hit is not None:
            return int(hit.group(1))
    return np.nan

# Add a 'year' column by running extract_year over every value of 'date'.
df['year'] = df['date'].apply(extract_year)

# Keep only rows that have coordinates and a year, then restrict the year
# to the inclusive range 1950-2022.
df_clean = df.dropna(subset=['latitude', 'longitude', 'year'])
df_clean = df_clean.loc[df_clean['year'].between(1950, 2022)]

In [None]:
# Number of cleaned reports per state, ordered from most to fewest.
state_counts = (
    df_clean.groupby('state')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)

state_counts.head()

In [None]:
# Number of cleaned reports for each year.
year_counts = (
    df_clean.groupby('year')
    .size()
    .reset_index(name='count')
)

# Number of cleaned reports for each season; shown as the cell output.
season_counts = (
    df_clean.groupby('season')
    .size()
    .reset_index(name='count')
)
season_counts

In [None]:
# Tally the cleaned reports by classification, then turn the tally into a
# two-column frame ('classification', 'count').
reports_per_class = df_clean.groupby('classification').size()
classification_counts = reports_per_class.reset_index(name='count')

classification_counts

In [None]:
# 'states' feature of the us-10m TopoJSON file served from the vega-datasets CDN.
states = alt.topo_feature(
    'https://cdn.jsdelivr.net/npm/vega-datasets@v1.29.0/data/us-10m.json',
    'states',
)

# Base map layer: light-gray state shapes with white borders, 800x500,
# drawn with the albersUsa projection.
background = (
    alt.Chart(states)
    .mark_geoshape(fill='lightgray', stroke='white')
    .properties(width=800, height=500)
    .project('albersUsa')
)



In [None]:
# The slider spans the years present in the cleaned data, cast to plain ints.
first_year = int(df_clean['year'].min())
last_year = int(df_clean['year'].max())

year_slider = alt.binding_range(min=first_year, max=last_year, step=1, name='Year: ')

# Parameter bound to the slider; its 'year_threshold' field starts at 2000.
year_selection = alt.selection_point(
    fields=['year_threshold'],
    bind=year_slider,
    value={'year_threshold': 2000},
)

In [None]:
# Colour each point by its classification using the category10 scheme.
point_color = alt.Color(
    'classification:N',
    scale=alt.Scale(scheme='category10'),
    legend=alt.Legend(title="Classification Type"),
)

# Fields shown when hovering over a point.
point_tooltip = ['state:N', 'county:N', 'year:Q', 'season:N', 'classification:N', 'observed:N']

# Two-line, centred title for the map.
map_title = alt.TitleParams(
    ['Bigfoot Sightings in the United States',
     'Drag the slider to filter by year'],
    anchor='middle',
)

# One circle per report, positioned by longitude/latitude. Only reports whose
# year is at or below the slider's current threshold are kept.
points = (
    alt.Chart(df_clean)
    .mark_circle(size=60, opacity=0.6)
    .encode(
        longitude='longitude:Q',
        latitude='latitude:Q',
        color=point_color,
        tooltip=point_tooltip,
    )
    .transform_filter(alt.datum.year <= year_selection.year_threshold)
    .properties(title=map_title)
)

In [None]:
# Layer the sighting points on top of the state outlines. The slider parameter
# is attached with add_params, the Altair 5 replacement for the deprecated
# add_selection (this notebook already requires Altair 5 for selection_point).
map_chart = background + points.add_params(year_selection)

# Write the layered chart to a JSON file.
# NOTE(review): assumes the assets/json/ directory already exists — confirm.
map_chart.save('assets/json/bigfoot_map.json')

In [None]:
# Sightings per (state, season) pair; this is the table the bar chart draws.
seasonal_state = df_clean.groupby(['state', 'season']).size().reset_index(name='count')

# Dropdown entries: the distinct state values in sorted order. Missing values
# are dropped first because sorted() cannot compare NaN (a float) with strings.
state_options = sorted(df_clean['state'].dropna().unique().tolist())

state_dropdown = alt.binding_select(
    options=state_options,
    name='State: '
)

# Start on 'CA' when it is one of the options. Otherwise fall back to the first
# option so the chart is not initially filtered to a state with no rows.
# With no options at all, the original default is kept unchanged.
default_state = 'CA' if ('CA' in state_options or not state_options) else state_options[0]
state_select = alt.selection_point(fields=['state'], bind=state_dropdown, value={'state': default_state})

# Bar chart of sightings per season for the state chosen in the dropdown.
# Seasons are ordered Spring -> Winter on the x axis, and each of those four
# seasons has a fixed colour; the colour legend is hidden.
seasonal_chart = alt.Chart(seasonal_state).mark_bar().encode(
    x=alt.X('season:N', title='Season', sort=['Spring', 'Summer', 'Fall', 'Winter']),
    y=alt.Y('count:Q', title='Number of Sightings'),
    color=alt.Color('season:N', 
                  scale=alt.Scale(domain=['Spring', 'Summer', 'Fall', 'Winter'],
                                 range=['#A1D99B', '#FC8D59', '#B30000', '#67A9CF']),
                  legend=None),
    tooltip=['state:N', 'season:N', 'count:Q']
).transform_filter(
    # Keep only the rows matching the dropdown's current state.
    state_select
).properties(
    width=600,
    height=400,
    title=alt.TitleParams(
        ['Seasonal Patterns of Bigfoot Sightings',
         'Select a state from the dropdown menu'],
        anchor='middle'
    )
).add_params(
    # add_params replaces add_selection, which is deprecated in Altair 5.
    state_select
)

# Write the chart to a JSON file.
# NOTE(review): assumes the assets/json/ directory already exists — confirm.
seasonal_chart.save('assets/json/bigfoot_seasonal.json')

# Show the layered map chart as this cell's output.
map_chart