In [1]:
import pandas as pd
import numpy as np

In [2]:
# Directory holding the raw CSV extracts; used as a plain string prefix,
# so the trailing slash is significant.
raw_data_path_prefix = 'data/raw/'

# Logical table name -> CSV file name inside the raw-data directory.
tables_dict = {
    'Sales': 'Candy_Sales.csv',
    'Factories': 'Candy_Factories.csv',
    'Products': 'Candy_Products.csv',
    'Targets': 'Candy_Targets.csv',
    'US Zips': 'uszips.csv',
}


def _read_raw_table(table_name):
    """Read the CSV registered under `table_name` in `tables_dict` with pandas defaults."""
    csv_path = raw_data_path_prefix + tables_dict[table_name]
    return pd.read_csv(csv_path)


# One DataFrame per source table, loaded in the order the tables are registered.
df_sales = _read_raw_table('Sales')
df_factories = _read_raw_table('Factories')
df_products = _read_raw_table('Products')
df_targets = _read_raw_table('Targets')
df_zips = _read_raw_table('US Zips')

In [3]:
# Registry of the loaded frames keyed by display name. The values are the very
# same DataFrame objects bound to the df_* variables (no copies are made), so
# in-place changes to e.g. df_zips are visible through df_dict as well.
df_dict = dict([
    ('Sales', df_sales),
    ('Factories', df_factories),
    ('Products', df_products),
    ('Targets', df_targets),
    ('US Zips', df_zips),
])

In [4]:
# Quick health check of every loaded table: its shape and whether any column
# contains at least one missing value.
for df_name, df in df_dict.items():
    has_nulls = df.isnull().sum().any()
    summary = f'{df_name}:\n{df.shape}\nContains null values: {has_nulls}\n'
    print(summary)

Sales:
(10194, 18)
Contains null values: False

Factories:
(5, 3)
Contains null values: False

Products:
(15, 6)
Contains null values: False

Targets:
(3, 2)
Contains null values: False

US Zips:
(33787, 18)
Contains null values: True



In [5]:
# Per-column count of missing values in the ZIP reference table.
df_zips.isna().sum()

zip                     0
lat                     0
lng                     0
city                    0
state_id                0
state_name              0
zcta                    0
parent_zcta         33787
population             17
density                17
county_fips             0
county_name             0
county_weights          0
county_names_all        0
county_fips_all         0
imprecise               0
military                0
timezone                0
dtype: int64

In [6]:
# Distinct values stored in `parent_zcta`, to see whether the column carries any information.
df_zips.loc[:, 'parent_zcta'].unique()

array([nan])

In [7]:
# Remove the `parent_zcta` column. The drop is done in place on purpose:
# df_dict['US Zips'] refers to this same object and must see the change too.
df_zips.drop(columns=['parent_zcta'], inplace=True)

In [8]:
# Re-check the per-column missing-value counts now that `parent_zcta` is gone.
df_zips.isna().sum(axis=0)

zip                  0
lat                  0
lng                  0
city                 0
state_id             0
state_name           0
zcta                 0
population          17
density             17
county_fips          0
county_name          0
county_weights       0
county_names_all     0
county_fips_all      0
imprecise            0
military             0
timezone             0
dtype: int64

In [9]:
# Show the rows whose `population` is missing, to see where the gaps are.
df_zips.loc[df_zips['population'].isna()]

Unnamed: 0,zip,lat,lng,city,state_id,state_name,zcta,population,density,county_fips,county_name,county_weights,county_names_all,county_fips_all,imprecise,military,timezone
91,802,18.34349,-64.9287,St Thomas,VI,Virgin Islands,True,,,78030,St. Thomas,"{""78030"": 100}",St. Thomas,78030,False,False,America/St_Thomas
92,820,17.7388,-64.70695,Christiansted,VI,Virgin Islands,True,,,78010,St. Croix,"{""78010"": 100}",St. Croix,78010,False,False,America/St_Thomas
93,830,18.33857,-64.73802,St John,VI,Virgin Islands,True,,,78020,St. John,"{""78020"": 100}",St. John,78020,False,False,America/St_Thomas
94,840,17.72649,-64.84717,Frederiksted,VI,Virgin Islands,True,,,78010,St. Croix,"{""78010"": 100}",St. Croix,78010,False,False,America/St_Thomas
95,850,17.72763,-64.79122,Kingshill,VI,Virgin Islands,True,,,78010,St. Croix,"{""78010"": 100}",St. Croix,78010,False,False,America/St_Thomas
96,851,17.74757,-64.78737,Kingshill,VI,Virgin Islands,True,,,78010,St. Croix,"{""78010"": 100}",St. Croix,78010,False,False,America/St_Thomas
32479,96799,-14.21984,-170.37005,Pago Pago,AS,American Samoa,True,,,60050,Western,"{""60050"": 36.49, ""60010"": 32.58, ""60020"": 29.1...",Western|Eastern|Manu'a|Swains Island,60050|60010|60020|60040,False,False,Pacific/Pago_Pago
32500,96910,13.45472,144.75128,Hagatna,GU,Guam,True,,,66010,Guam,"{""66010"": 100}",Guam,66010,False,False,Pacific/Guam
32501,96913,13.47825,144.81519,Barrigada,GU,Guam,True,,,66010,Guam,"{""66010"": 100}",Guam,66010,False,False,Pacific/Guam
32502,96915,13.37526,144.70714,Santa Rita,GU,Guam,True,,,66010,Guam,"{""66010"": 100}",Guam,66010,False,False,Pacific/Guam


In [10]:
# Discard fully duplicated rows, keeping the first occurrence. Done in place so
# the frame referenced from df_dict is the one that gets cleaned.
df_zips.drop_duplicates(keep='first', inplace=True)

In [11]:
def fill_missing_values_native(df, column, group_column):
    """Fill missing values of `column` with the mean of the row's group.

    Rows are grouped by `group_column`; every missing entry of `column` is
    replaced by the mean of `column` within its group. Entries whose group has
    no usable mean (all values missing) stay missing.

    The frame is modified in place (`df[column]` is reassigned) and nothing is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update.
    column : str
        Name of the column whose gaps are filled.
    group_column : str
        Grouping key, passed straight to `DataFrame.groupby`.
    """
    # Group mean broadcast back to one value per row of `df`.
    group_means = df.groupby(group_column)[column].transform('mean')
    filled = df[column].fillna(group_means)
    df[column] = filled

# Impute missing population/density from the surrounding area: first the mean
# of the same county, then (for anything still missing) the mean of the same
# state.
#
# Counties are keyed by `county_fips`, not `county_name`: county names repeat
# across states, so grouping by name lets a row borrow the mean of an unrelated,
# identically named county (e.g. the Virgin Islands' "St. Croix" vs. St. Croix
# County, Wisconsin). The state-level pass would then spread that borrowed value
# to the rest of the territory.
for column in ('population', 'density'):
    fill_missing_values_native(df_zips, column, 'county_fips')
    fill_missing_values_native(df_zips, column, 'state_name')

# The Virgin Islands rows seen in the null listing have no populated neighbour
# inside the territory, so the group means above cannot fill them. Fall back to
# one territory-wide figure per row, the same way the other territories are
# handled with fixed figures.
# NOTE(review): roughly the 2020 Census count (~87,000 residents) and the
# resulting density over ~346 km2 of land - confirm against the source used for
# the other territory figures.
virgin_islands_rows = df_zips['state_name'] == 'Virgin Islands'
df_zips.loc[virgin_islands_rows & df_zips['population'].isnull(), 'population'] = 87000
df_zips.loc[virgin_islands_rows & df_zips['density'].isnull(), 'density'] = 252

In [12]:
# Fixed fallback figures, keyed by `state_name`, for rows that are still
# missing a value at this point. Each figure is written to every matching row.
population_dict = {
    'Guam': 154000,
    'Northern Mariana Islands': 48000,
    'American Samoa': 50000
}

# Same keys as population_dict; looked up by state inside the loop below.
density_dict = {
    'Guam': 313,
    'Northern Mariana Islands': 96,
    'American Samoa': 259
}

for state, population in population_dict.items():
    in_state = df_zips['state_name'].eq(state)

    # Only overwrite cells that are still empty; existing values are left alone.
    population_missing = df_zips['population'].isna()
    df_zips.loc[population_missing & in_state, 'population'] = population

    density_missing = df_zips['density'].isna()
    df_zips.loc[density_missing & in_state, 'density'] = density_dict[state]

In [13]:
# Repeat the table health check after cleaning: shape plus a flag telling
# whether any missing value is left in the table.
for df_name in df_dict:
    df = df_dict[df_name]
    null_counts = df.isnull().sum()
    print(f'{df_name}:\n{df.shape}\nContains null values: {null_counts.any()}\n')

Sales:
(10194, 18)
Contains null values: False

Factories:
(5, 3)
Contains null values: False

Products:
(15, 6)
Contains null values: False

Targets:
(3, 2)
Contains null values: False

US Zips:
(33787, 17)
Contains null values: False

