In [68]:
import pandas as pd
import numpy as np
import gc
import timeit
from numpy.linalg import inv

In [69]:
# Read the raw Iowa liquor-sales sample, timing only the load itself.
start_time = timeit.default_timer()
df = pd.read_csv(filepath_or_buffer="iowa-sample.csv")
elapsed = timeit.default_timer() - start_time

# Report the wall-clock cost of the read in minutes.
print("Time (minutes) elapsed for this cell:", elapsed/60)

# Preview the first rows of the freshly loaded frame.
df.head()

Time (minutes) elapsed for this cell: 0.0002387946833332914


Unnamed: 0,Invoice/Item Number,Date,Store Number,Store Name,Address,City,Zip Code,Store Location,County Number,County,...,Item Number,Item Description,Pack,Bottle Volume (ml),State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Liters),Volume Sold (Gallons)
0,S08096000008,10/04/2012,4641,Kum & Go #573 / SE 14th DM,5830 SE 14th ST,DES MOINES,50315,,77,Polk,...,89191,Jose Cuervo Especial Reposado Tequila Mini,12,500,11.5,17.25,1,17.25,0.5,0.13
1,S23102300041,12/20/2014,4346,Roy's Foodland,105 PEARL ST,SHELLSBURG,52332,POINT (-91.869285 42.094155),6,Benton,...,11774,Black Velvet,24,375,3.07,4.61,6,27.66,2.25,0.59
2,S14410500035,09/09/2013,3628,Wal-Mart 1528 / Cedar Rapids,2645 BLAIRS FERRY RD NE,CEDAR RAPIDS,52402,POINT (-91.680734 42.034748),57,Linn,...,43410,Captain Morgan Parrot Bay Coconut,12,750,7.49,11.23,12,134.76,9.0,2.38
3,S09427600021,12/12/2012,4708,No Frills Supermarkets #803 / Glenwo,423 SHARP ST,GLENWOOD,51534,POINT (-95.742987 41.04635),65,Mills,...,24458,Kessler Blend Whiskey,6,1750,11.01,16.52,30,495.6,52.5,13.87
4,S28446900114,10/14/2015,2594,Hy-Vee Food Store / Sioux City,4500 SERGEANT ROAD,SIOUX CITY,51106,POINT (-96.346969 42.447396),97,Woodbury,...,10550,Black Velvet Toasted Caramel,12,750,6.75,10.13,12,121.56,9.0,2.38


In [70]:
# Keep only the columns used downstream and discard rows with missing values.
# Chaining .dropna() (instead of dropna(inplace=True) on a column slice)
# avoids pandas' SettingWithCopyWarning on the assignments below.
df = df[["Date", "County", "Pack", "Category Name", "State Bottle Retail", "Volume Sold (Gallons)"]].dropna()

# Collapse the detailed category names into a handful of broad liquor types.
# Keywords are applied in this order; the match is case-sensitive, so a row
# that has already been relabelled (title-case label) cannot match a later
# upper-case keyword -- i.e. the first matching keyword wins.
category_labels = [
    ("WHISK", "Whiskey"),
    ("VODKA", "Vodka"),
    ("RUM", "Rum"),
    ("SCHNAPPS", "Schnapps"),
    ("BRAND", "Brand"),
    ("GIN", "Gin"),
    ("TEQUILA", "Tequila"),
]

category_name = df["Category Name"].copy()
for keyword, label in category_labels:
    category_name.loc[category_name.str.contains(keyword, regex=False)] = label

# Everything that did not match a keyword becomes 'Other'.  The set of known
# labels is derived from the table above so it cannot drift out of sync (the
# previous hand-written comparison misspelled 'Schnapps', which sent every
# Schnapps row to 'Other').
known_labels = [label for _, label in category_labels]
category_name.loc[~category_name.isin(known_labels)] = 'Other'

df["Category Name"] = category_name

# Normalise county names to lower case and repair the spellings that are
# truncated / missing punctuation in the sales data.
county_fixes = {
    "buena vist": "buena vista",
    "cerro gord": "cerro gordo",
    "obrien": "o'brien",
    "pottawatta": "pottawattamie",
}
df["County"] = df["County"].str.lower().replace(county_fixes)

# type casting
categorical_cols = ["Category Name"]
for col in categorical_cols:
    df[col] = df[col].astype('category')

# get year from date
df['Date'] = pd.to_datetime(df['Date'])
df.insert(1, 'Year', df['Date'].dt.year)


In [71]:
# AGGREGATION BY COUNTY AND MONTH-YEAR
start_time = timeit.default_timer()

# Build the 'MM-YYYY' grouping key, then drop the columns it replaces.
df.insert(1, 'Month-Year', df['Date'].dt.strftime('%m-%Y'))
df = df.drop(columns=["Date", "Year"])

# Sum the numeric measures for every (county, month) pair.  numeric_only=True
# makes the exclusion of the categorical 'Category Name' column explicit
# instead of relying on pandas silently dropping columns it cannot sum.
df = (
    df.groupby(['County', 'Month-Year'])
      .sum(numeric_only=True)
      .reset_index()
)

# add the year column back to the data frame
# 'Month-Year' is formatted '%m-%Y', so the year is always the last four
# characters; slicing avoids depending on str.replace's regex default.
df.insert(1, "Year", df["Month-Year"].str[-4:].astype(int))

elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

Time (minutes) elapsed for this cell: 0.0004244797499997806


In [72]:
# EXTERNAL DATASET, IOWA POPULATION FOR (county, year) PAIRS
df2 = pd.read_excel("iowa_county.xlsx")

# data cleaning/organization
# Keep spreadsheet rows 4..102 (99 rows) and take an explicit copy so the
# edits below act on an independent frame rather than a slice of the raw sheet.
# NOTE(review): presumably the rows before 4 are headers/state total and the
# rows after 102 are footnotes -- confirm against the workbook.
df2 = df2.iloc[4:103].copy()
df2 = df2.drop(columns=["Unnamed: 1", "Unnamed: 2"])

# The first column's header is the sheet's accessibility caption.
county_header = (
    "table with row headers in column A "
    "and column headers in rows 3 through "
    "4 (leading dots indicate sub-parts)"
)
# Columns "Unnamed: 3" .. "Unnamed: 12" hold the years 2010 .. 2019.
column_names = {county_header: "County"}
column_names.update(
    {"Unnamed: {}".format(position): str(year)
     for position, year in enumerate(range(2010, 2020), start=3)}
)
df2 = df2.rename(columns=column_names).reset_index(drop=True)

# Strip the leading dots and the " County, Iowa" suffix, then lower-case.
# regex=False is essential for ".": interpreted as a regular expression it
# would match (and delete) every character.
df2["County"] = (
    df2["County"]
    .str.replace(".", "", regex=False)
    .str.replace(" County, Iowa", "", regex=False)
    .str.lower()
)

# convert df to standard form
df2 = df2.melt(var_name="Year", value_name="Population", id_vars=['County'])

# type casting
df2["County"] = df2["County"].astype('category')
df2["Year"] = df2["Year"].astype('int')
df2["Population"] = df2["Population"].astype('int')

df2.head()

Unnamed: 0,County,Year,Population
0,adair,2010,7679
1,adams,2010,4023
2,allamakee,2010,14378
3,appanoose,2010,12856
4,audubon,2010,6098


In [73]:
# EXTERNAL DATASET, IOWA INCOME PER CAPITA FOR (county, year) PAIRS
df_income = pd.read_csv("income.csv")

# Keep only the per-capita-income rows and the columns needed downstream.
# The explicit .copy() makes the result an independent frame, so the column
# assignments below do not raise SettingWithCopyWarning.
is_per_capita = df_income["Variable"] == "Per capita personal income"
df_income = df_income.loc[is_per_capita, ["Name", "Value", "Date"]].copy()

df_income = df_income.rename(columns={"Name": "County", "Value": "Income Per Capita"})

# Normalise county names: drop the ", IA" suffix (literal match) and lower-case.
df_income["County"] = (
    df_income["County"]
    .str.replace(", IA", "", regex=False)
    .str.lower()
)

# Only the year of each observation is used for the later merge.
df_income["Year"] = pd.to_datetime(df_income["Date"]).dt.year

df_income = df_income.drop(columns=["Date"])
df_income.head()

Unnamed: 0,County,Income Per Capita,Year
2073,adair,21269,1997
2074,adair,22305,1998
2075,adair,23030,1999
2076,adair,25831,2000
2077,adair,26276,2001


In [74]:
# MERGE BOTH EXTERNAL DATASETS
# The timer is started here; this cell does not report the elapsed time.
start_time = timeit.default_timer()

# Attach the population of each (county, year) pair to the sales rows.
df = df.merge(df2, on=["County", "Year"])

# Express volume sold relative to population, then discard the raw volume.
df["Volume Sold (Gallons) Per Capita"] = df["Volume Sold (Gallons)"].div(df["Population"])
cols_to_drop = ["Volume Sold (Gallons)"]
df = df.drop(columns=cols_to_drop)

# Attach the per-capita income of each (county, year) pair; 'Year' is only
# needed as a join key, so it is removed once both merges are done.
df = df.merge(df_income, on=["County", "Year"])
df = df.drop(columns=["Year"])

df.head()

Unnamed: 0,County,Month-Year,Pack,State Bottle Retail,Population,Volume Sold (Gallons) Per Capita,Income Per Capita
0,adair,01-2013,12,16.49,7387,8e-05,45559
1,adair,09-2012,24,6.45,7468,5.4e-05,42093
2,adair,12-2012,6,16.35,7468,0.000186,42093
3,allamakee,03-2015,12,22.86,13874,3.8e-05,43992
4,allamakee,06-2012,24,6.42,14149,1.4e-05,38814


In [9]:
# Give the merged frame a descriptive name (same object as df, not a copy)
# and write it to disk without the row index.
iowa_month_county = df
iowa_month_county.to_csv(path_or_buf="iowa_month_county.csv", index=False)