In [2]:
import pandas as pd
import numpy as np

In [3]:
# Folder holding the raw CSV exports, and the file that backs each logical table.
raw_data_path_prefix = 'data/raw/'

tables_dict = {
    'Sales': 'Candy_Sales.csv',
    'Factories': 'Candy_Factories.csv',
    'Products': 'Candy_Products.csv',
    'Targets': 'Candy_Targets.csv',
    'US Zips': 'uszips.csv',
}

# Load every table with pandas' default parsing, in the order of the names below.
# NOTE(review): dtype inference is left at its defaults — confirm that ID/ZIP-like
# columns do not need to be read as strings (TODO).
df_sales, df_factories, df_products, df_targets, df_zips = (
    pd.read_csv(raw_data_path_prefix + tables_dict[table_name])
    for table_name in ('Sales', 'Factories', 'Products', 'Targets', 'US Zips')
)

In [4]:
# Registry of the loaded frames keyed by display name, so that later cells can
# loop over all tables at once. The values are the frame objects themselves
# (not copies), so in-place edits to a frame are visible through this dict.
df_dict = dict([
    ('Sales', df_sales),
    ('Factories', df_factories),
    ('Products', df_products),
    ('Targets', df_targets),
    ('US Zips', df_zips),
])

In [None]:
# Quick health check: print each table's shape and whether any cell is null.
for df_name, df in df_dict.items():
    has_nulls = df.isnull().sum().any()
    print(f'{df_name}:\n{df.shape}\nContains null values: {has_nulls}\n')

## Clean `df_zips`

In [None]:
# Count the missing values in each column of the ZIP table.
df_zips.isnull().sum()

In [None]:
# List the distinct values stored in 'parent_zcta'.
df_zips['parent_zcta'].unique()

In [8]:
# Remove the 'parent_zcta' column from the ZIP table.
# The drop is done in place (no rebinding) so that the frame object stored in
# df_dict stays the cleaned one, and errors='ignore' keeps the cell re-runnable:
# without it a second execution raises KeyError because the column is gone.
df_zips.drop(columns='parent_zcta', inplace=True, errors='ignore')

In [None]:
# Re-check the per-column null counts now that 'parent_zcta' has been dropped.
df_zips.isnull().sum()

In [None]:
# Show the rows whose 'population' value is missing.
df_zips[df_zips['population'].isnull()]

In [11]:
# Remove fully duplicated rows from the ZIP table in place
# (all columns are compared and the first occurrence is kept).
df_zips.drop_duplicates(inplace=True)

In [12]:
def fill_missing_values_native(df, column, group_column):
    """Fill NaNs in ``df[column]`` with the mean of the row's group, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; ``df[column]`` is reassigned on this object.
    column : str
        Name of the column whose missing values are filled.
    group_column : label or list of labels
        Key(s) handed to ``DataFrame.groupby`` to define the groups.

    Returns
    -------
    None
        The filled column is written back to ``df``.

    Notes
    -----
    Rows whose group contains no non-missing value stay NaN, because that
    group's mean is itself NaN.
    """
    group_means = df.groupby(group_column)[column].transform('mean')
    filled = df[column].fillna(group_means)
    df[column] = filled

# Impute missing population/density in two passes per metric:
#   1. the mean of the same county,
#   2. the state-wide mean, for rows whose county had no data at all.
# County names are reused across states (e.g. "Washington"), so the county
# pass groups on (state_name, county_name); grouping on county_name alone
# would average together unrelated counties that merely share a name.
for metric in ('population', 'density'):
    fill_missing_values_native(df_zips, metric, ['state_name', 'county_name'])
    fill_missing_values_native(df_zips, metric, 'state_name')

In [13]:
# Hard-coded fallback figures per territory, as (population, density) pairs,
# used for rows that are still missing a value after the group-mean imputation.
# NOTE(review): the same figure is written to every matching row of a territory;
# the source and units of these constants are not shown here — TODO confirm.
territory_stats = {
    'Guam': (154000, 313),
    'Northern Mariana Islands': (48000, 96),
    'American Samoa': (50000, 259),
}
population_dict = {name: pop for name, (pop, _dens) in territory_stats.items()}
density_dict = {name: dens for name, (_pop, dens) in territory_stats.items()}

for state, population in population_dict.items():
    # Only rows of this territory that are still null get overwritten.
    in_state = df_zips['state_name'].eq(state)
    df_zips.loc[in_state & df_zips['population'].isna(), 'population'] = population
    df_zips.loc[in_state & df_zips['density'].isna(), 'density'] = density_dict[state]

In [None]:
# Repeat the shape/null summary for every table after the ZIP-table cleaning.
for df_name, df in df_dict.items():
    has_nulls = df.isnull().sum().any()
    print(f'{df_name}:\n{df.shape}\nContains null values: {has_nulls}\n')

## Check Remaining Columns

### Factories Table

In [None]:
# Preview the first rows of the factories table.
df_factories.head()

In [None]:
# Summary statistics for every column, numeric and non-numeric alike (include='all').
df_factories.describe(include='all')

In [None]:
# Print column dtypes and non-null counts for the factories table.
df_factories.info()

In [None]:
# List the distinct values in the 'Factory' column.
df_factories['Factory'].unique()

### Sales Table

In [None]:
# Preview the first rows of the sales table.
df_sales.head()

In [None]:
# Summary statistics for every sales column, numeric and non-numeric alike (include='all').
df_sales.describe(include='all')

In [None]:
# Print column dtypes and non-null counts for the sales table.
df_sales.info()

In [31]:
# Parse the order and ship date columns into pandas datetimes, replacing the
# original columns on df_sales. The format is left for pandas to infer.
for date_column in ('Order Date', 'Ship Date'):
    df_sales[date_column] = pd.to_datetime(df_sales[date_column])

In [None]:
# Number of rows per 'Order ID', most frequent first; a count above 1 means
# the same ID appears on several rows.
df_sales['Order ID'].value_counts()

In [None]:
# Number of sales rows per 'Product ID', most frequent first.
df_sales['Product ID'].value_counts()

### Products Table

In [None]:
# Preview the first rows of the products table.
df_products.head()

In [None]:
# List the distinct values in the 'Division' column.
df_products['Division'].unique()

In [None]:
# Summary statistics for every products column, numeric and non-numeric alike (include='all').
df_products.describe(include='all')

In [None]:
# Print column dtypes and non-null counts for the products table.
df_products.info()

### Targets Table

In [None]:
# Preview the first rows of the targets table.
df_targets.head()

In [None]:
# Print column dtypes and non-null counts for the targets table.
df_targets.info()

## Remove Duplicates

In [39]:
# Drop fully duplicated rows from every table, in place, so the objects
# referenced elsewhere (e.g. in df_dict) are the de-duplicated ones.
# df_zips was already de-duplicated once before imputation; it is processed
# again here along with the other tables.
for frame in (df_sales, df_products, df_targets, df_zips, df_factories):
    frame.drop_duplicates(inplace=True)