In [1]:
import pandas as pd
import numpy as np
import timeit
from numpy.linalg import inv

In [None]:
# LOAD RAW DATA
# Read the raw liquor-sales CSV into `df` and report how long the read took.
start_time = timeit.default_timer()

df = pd.read_csv(
    "../data_proj1/iowaliquor.csv",
    low_memory=False,  # parse each column in one pass instead of chunk-wise
)

elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

# last expression of the cell: preview the first rows
df.head()

In [None]:
# DATA CLEANING/ORGANIZATION
# Keeps only the columns used downstream, casts the categorical features,
# derives a Year column from Date and one-hot encodes the categorical
# features. County is kept as a plain column because it is a grouping key.
start_time = timeit.default_timer()

# only include the features we need
feature_cols = ["Date", "County", "Pack", "Category Name",
                "Vendor Name", "State Bottle Cost", "State Bottle Retail",
                "Bottles Sold", "Sale (Dollars)", "Volume Sold (Gallons)"]

# type casting
# astype() returns a new frame, so the assignments below write into an
# independent object rather than into a column-subset of the raw frame
# (which is what triggers pandas' SettingWithCopyWarning).
categorical_cols = ["Category Name", "Vendor Name"]
df = df[feature_cols].astype({col: 'category' for col in categorical_cols})

# get year from date
df['Date'] = pd.to_datetime(df['Date'])
df.insert(1, 'Year', df['Date'].dt.year)

# get dummies for categorical variables
# County is taken out first so get_dummies does not encode it, then put
# back as the last column.
county_var = df.pop("County")
df = pd.get_dummies(df)
df["County"] = county_var

elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

In [None]:
# AGGREGATION BY COUNTY AND MONTH
# Collapses the rows to one row per (County, Month-Year) by summing every
# remaining column, then restores an integer Year column for the merges
# on (County, Year) that follow.
start_time = timeit.default_timer()

# "MM-YYYY" label used as the monthly grouping key
df.insert(1, 'Month-Year', df['Date'].dt.strftime('%m-%Y'))
df.drop(["Date", "Year"], axis=1, inplace=True)

df = df.groupby(['County', 'Month-Year']).agg('sum').reset_index()

# add the year column back to the data frame
# The year is the last four characters of the "MM-YYYY" label. Slicing is
# used instead of str.replace("..-", "") because that call only strips the
# month when the pattern is interpreted as a regex, and the default of the
# `regex` argument is not the same across pandas versions.
df.insert(1, "Year", df["Month-Year"].str[-4:].astype(int))

elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

In [None]:
# EXTERNAL DATASET, IOWA POPULATION FOR (county, year) PAIRS
# Builds `df2` with one row per (County, Year) and an integer Population.
df2 = pd.read_excel("iowa_county.xlsx")

# data cleaning/organization
# Keep rows 4..102 (99 rows) and discard two unused columns.
# NOTE(review): presumably these are exactly the per-county rows of the
# spreadsheet -- confirm against the file if its layout changes.
# Chaining .drop() onto the slice yields a fresh frame, so later column
# assignments do not write into a slice of the original.
df2 = df2.iloc[4:103].drop(["Unnamed: 1", "Unnamed: 2"], axis=1)

# The first column's header is the sheet's descriptive caption.
county_header = ("table with row headers in column A " +
                 "and column headers in rows 3 through " +
                 "4 (leading dots indicate sub-parts)")
# "Unnamed: 3" .. "Unnamed: 12"  ->  "2010" .. "2019"
year_headers = {f"Unnamed: {i}": str(2007 + i) for i in range(3, 13)}

df2 = (
    df2.rename(columns={county_header: "County", **year_headers})
       .reset_index(drop=True)
)

# Strip the dots and the " County, Iowa" suffix from the county labels.
# regex=False makes both replacements literal on every pandas version
# ("." would otherwise be a regex wildcard).
df2["County"] = (
    df2["County"]
    .str.replace(".", "", regex=False)
    .str.replace(" County, Iowa", "", regex=False)
)

# convert df to standard form
df2 = df2.melt(var_name="Year", value_name="Population", id_vars=['County'])

# type casting
df2["County"] = df2["County"].astype('category')
df2["Year"] = df2["Year"].astype('int')
df2["Population"] = df2["Population"].astype('int')

df2.head()

In [None]:
# EXTERNAL DATASET, IOWA INCOME PER CAPITA FOR (county, year) PAIRS
# Builds `df_income` with columns County, Income Per Capita, Year.
df_income = pd.read_csv("income.csv")

# Keep only the per-capita-income rows and the columns needed below.
# Filtering and renaming in one chain yields a fresh frame, so the column
# assignments that follow do not write into a filtered view of the raw
# data (the pattern that raises pandas' SettingWithCopyWarning).
is_per_capita = df_income["Variable"] == "Per capita personal income"
df_income = (
    df_income.loc[is_per_capita, ["Name", "Value", "Date"]]
    .rename(columns={"Name": "County", "Value": "Income Per Capita"})
)

# Remove the ", IA" suffix from the names (literal match, not a regex).
df_income["County"] = df_income["County"].str.replace(", IA", "", regex=False)

# Only the year of the date is kept; it is a merge key later on.
df_income["Year"] = pd.to_datetime(df_income["Date"]).dt.year
df_income = df_income.drop(columns=["Date"])

df_income.head()

In [None]:
# MERGE BOTH EXTERNAL DATASETS
# Joins population and income onto the aggregated sales frame by
# (County, Year) and replaces the raw totals with per-capita values.
start_time = timeit.default_timer()

# merge with population values for each (county, year) pair
df = df.merge(df2, on=["County", "Year"])

# per capita normalization
# (insert position, new column name, raw column divided by Population)
per_capita_specs = [
    (6, "Bottles Sold Per Capita", "Bottles Sold"),
    (7, "Alcohol Expense Per Capita", "Sale (Dollars)"),
    (8, "Volume Sold (Gallons) Per Capita", "Volume Sold (Gallons)"),
]
for position, per_capita_col, raw_col in per_capita_specs:
    df.insert(position, per_capita_col, df[raw_col] / df["Population"])

# the raw totals are no longer needed once normalized
cols_to_drop = [raw_col for _, _, raw_col in per_capita_specs]
df = df.drop(columns=cols_to_drop)

# merge with income values for each (county, year) pair
df = df.merge(df_income, on=["County", "Year"])
# Year was only needed as a merge key
df = df.drop(columns=["Year"])

elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)
df.head()

In [None]:
# Write the merged frame to CSV, without the row index.
iowa_month_county = df
iowa_month_county.to_csv(
    "iowa_month_county_main.csv",
    index=False,
)