In [1]:
import pandas as pd

# Load the raw demographics sample straight from the public GitHub repository.
# Nothing is cleaned here; the frame is kept as-is so later cells can derive from it.
demographics_url = 'https://raw.githubusercontent.com/juanpaul96/holcim_DAE_test/main/demographics_sample.csv'
demographics_raw = pd.read_csv(filepath_or_buffer=demographics_url)


In [11]:
########## Data Exploration ##########

In [2]:
# Show the dtype pandas inferred for every column of the raw frame.
print(
    "Dataset columns type:\n",
    demographics_raw.dtypes,
    sep=' ',
)

Dataset columns type:
 Restaurant Key           object
Restaurant Open Date     object
Restaurant Close Date    object
Restaurant Coop          object
Restaurant Region        object
AACM                     object
HCM                      object
ACM                      object
Household Count          object
Urban Uptown             object
Midtown Mix              object
Urban Core               object
Elite Suburbs            object
The Affluentials         object
Middleburbs              object
Inner Suburbs            object
Second City Society      object
City Centers             object
Micro-City Blues         object
Landed Gentry            object
Country Comfort          object
Middle America           object
Rustic Living            object
Midlife Success          object
Young Achievers          object
Striving Singles         object
Accumulated Wealth       object
Young Accumulators       object
Mainstream Families      object
Sustaining Families      object
Affluent Empty Ne

In [11]:

# Derive the typed frame from a copy of the raw one (the raw frame is left untouched),
# discarding the first row and renumbering the index from 0.
demographics_datatypes = (
    demographics_raw
    .copy()
    .iloc[1:]
    .reset_index(drop=True)
)

# Normalise headers: trim, lower-case, spaces -> underscores, 'restaurant_' -> 'rest_'.
demographics_datatypes.columns = [
    name.strip().lower().replace(' ', '_').replace('restaurant_', 'rest_')
    for name in demographics_datatypes.columns
]

# Convert both date columns; values that do not match the format become NaT.
# NOTE(review): '%y' is a two-digit year, which strptime-style parsing resolves to
# 1969-2068; an opening date before 1969 would land in the 2000s - confirm against the data.
for date_col in ('rest_open_date', 'rest_close_date'):
    demographics_datatypes[date_col] = pd.to_datetime(
        demographics_datatypes[date_col],
        format='%m/%d/%y',
        errors='coerce',
    )

def parse_numeric(val):
    """Convert a raw cell value to a float, or ``pd.NA`` when it is not numeric.

    The value is stringified, surrounding whitespace is trimmed and thousands
    separators (``,``) are removed. A trailing ``%`` marks a percentage, which is
    returned as a fraction (``'12%'`` -> ``0.12``).

    Parameters
    ----------
    val : scalar
        Raw cell content (string, number, or a missing value).

    Returns
    -------
    float or pd.NA
        The parsed number; ``pd.NA`` if ``val`` is missing or cannot be parsed.
    """
    if pd.isna(val):
        return pd.NA

    # Strip separators before the percent check so values such as '1,234%'
    # are parsed instead of being silently turned into pd.NA.
    text = str(val).strip().replace(',', '')
    is_percent = text.endswith('%')
    if is_percent:
        text = text.rstrip('%')

    try:
        number = float(text)
    except ValueError:
        # float() on a string only fails with ValueError; anything else
        # (e.g. KeyboardInterrupt) must not be swallowed.
        return pd.NA

    return number / 100 if is_percent else number

# Convert every column that is not an identifier, label or date to a numeric column.
exclude = {'rest_key', 'rest_open_date', 'rest_close_date', 'rest_coop', 'rest_region'}
for col in demographics_datatypes.columns:
    if col not in exclude:
        # parse_numeric returns floats mixed with pd.NA, which on its own leaves the
        # column as dtype object; casting to the nullable Float64 dtype makes it
        # genuinely numeric while keeping missing values as <NA>.
        demographics_datatypes[col] = (
            demographics_datatypes[col].apply(parse_numeric).astype('Float64')
        )

# Ensure rest_key is a nullable integer; unparseable keys become <NA>
demographics_datatypes['rest_key'] = pd.to_numeric(demographics_datatypes['rest_key'], errors='coerce').astype('Int64')

demographics_datatypes


Unnamed: 0,rest_key,rest_open_date,rest_close_date,rest_coop,rest_region,aacm,hcm,acm,household_count,urban_uptown,...,conservative_classics,cautious_couples,sustaining_seniors,prom_soc,prom_life,row,urban,suburban,second_city,town_and_rural
0,1364,1970-02-26,NaT,SEA/TCA WA CP-0024,NORTHWEST REGION-0160480000,9.2,9.67,13.66,9080.0,0.0,...,0.21,0.01,0.02,,,13055.0,0.0,0.0,1.0,0.0
1,5357,1980-02-06,NaT,SEA/TCA WA CP-0024,NORTHWEST REGION-0160480000,9.24,11.34,42.83,1281.0,0.1,...,0.06,0.17,0.0,,,10213.0,0.13,0.87,0.0,0.0
2,13369,1995-11-03,NaT,SEA/TCA WA CP-0024,NORTHWEST REGION-0160480000,8.91,22.72,19.63,3108.0,0.0,...,0.11,0.16,0.0,,,5748.0,0.0,0.93,0.07,0.0
3,13604,1996-03-08,NaT,SEA/TCA WA CP-0024,NORTHWEST REGION-0160480000,3.2,6.28,6.89,9219.0,0.0,...,0.13,0.0,0.0,,,5652.0,0.0,0.07,0.93,0.0


In [12]:
# Persist the typed frame to the working directory; the index is not written.
demographics_datatypes.to_csv(
    path_or_buf='demographics_silver.csv',
    index=False,
)