In [2]:
import pandas as pd
import numpy as np

from utils.clean_shipper import clean_row

## Set up countries table

- Get table of countries with alpha-2 code that includes region from repository
    - https://github.com/lukes/ISO-3166-Countries-with-Regional-Codes
- Keep only the needed columns (`name`, `alpha-2`, `region`, `sub-region`), use `alpha-2` as the index, and rename that index to `id`
- Set up lookup table to convert country names to country codes

In [3]:
countries_cols = ['name', 'alpha-2', 'region', 'sub-region']
# keep_default_na=False prevents pandas from interpreting Namibia's alpha-2 (NA) as NaN
countries = pd.read_csv(
    '/Users/jesseputnam/cs-learning/skillstorm/project01/data/all.csv',
    keep_default_na=False,
    usecols=countries_cols,
)

# Index the table by alpha-2 code and name that index `id`.
# Reassigning (instead of inplace=True) keeps the data flow explicit.
countries = countries.set_index('alpha-2').rename_axis('id')

# Set of valid alpha-2 codes for O(1) membership checks during data cleaning
alpha_2_set = set(countries.index)

# Upper-cased country name -> alpha-2 code, for O(1) look up during cleaning
country_dict = {name.upper(): code for code, name in countries['name'].items()}

# Add a few of the statistically significant outliers by hand -
# cheaper computationally than performing trims on each row (I think)
country_dict.update({
    'TAIWAN': 'TW',
    'SOUTH KOREA': 'KR',
    'SHANGHAI CN': 'CN',
    'SHANGHAI': 'CN',
    'SHANGHAI .': 'CN',
    # ISO 3166-1 gives Hong Kong its own alpha-2 code (HK), which is what the
    # name-derived 'HONG KONG' key resolves to; map this variant to the same
    # code so both spellings agree.
    'HONG KONG .': 'HK',
    'TAIPEI .': 'TW',
})

- Convert altered countries dataframe to csv

In [8]:
# Destination for the id-indexed countries table; the index is written out
# as the first column of the CSV.
countries_file_path = '/Users/jesseputnam/cs-learning/skillstorm/project01/data/final/countries.csv'
countries.to_csv(path_or_buf=countries_file_path)


## Consolidate CSV Files

In [None]:
# Columns to retain from the raw shipper files: the party name, its four
# address lines, and the country code.
shipper_keep_cols = [
    'shipper_party_name',
    'shipper_party_address_1',
    'shipper_party_address_2',
    'shipper_party_address_3',
    'shipper_party_address_4',
    'country_code',
]

# Location of the consolidated raw shipper CSV
shipper_raw_path = '/Users/jesseputnam/cs-learning/skillstorm/project01/data/final_raw/shipper.csv'

In [None]:
# Raw shipper part files, two per year for 2018-2020, in load order
shipper_part_paths = (
    '/Users/jesseputnam/cs-learning/skillstorm/project01/data/2018/shipper_2018_part_0.csv',
    '/Users/jesseputnam/cs-learning/skillstorm/project01/data/2018/shipper_2018_part_1.csv',
    '/Users/jesseputnam/cs-learning/skillstorm/project01/data/2019/shipper_2019_part_0.csv',
    '/Users/jesseputnam/cs-learning/skillstorm/project01/data/2019/shipper_2019_part_1.csv',
    '/Users/jesseputnam/cs-learning/skillstorm/project01/data/2020/shipper_2020_part_0.csv',
    '/Users/jesseputnam/cs-learning/skillstorm/project01/data/2020/shipper_2020_part_1.csv',
)

# Read every part with only the kept columns and bind them, in order,
# to shipper_0 .. shipper_5
shipper_0, shipper_1, shipper_2, shipper_3, shipper_4, shipper_5 = [
    pd.read_csv(part_path, usecols=shipper_keep_cols)
    for part_path in shipper_part_paths
]

## Set up Shipper table

In [10]:
# Choose columns to keep
shipper_keep_cols = ['shipper_party_name', 'shipper_party_address_1', 'shipper_party_address_2', 'shipper_party_address_3', 'shipper_party_address_4', 'country_code']

# Load both 2018 shipper part files, restricted to the columns above
shipper_2018_0 = pd.read_csv('/Users/jesseputnam/cs-learning/skillstorm/project01/data/2018/shipper_2018_part_0.csv', index_col=False, usecols=shipper_keep_cols)
shipper_2018_1 = pd.read_csv('/Users/jesseputnam/cs-learning/skillstorm/project01/data/2018/shipper_2018_part_1.csv', usecols=shipper_keep_cols)

# Replace NaN in name with the placeholder 'N/A'.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column is chained assignment, which warns in pandas 2.x and
# leaves the DataFrame unchanged under Copy-on-Write.
# NOTE(review): pandas' default CSV reader parses 'N/A' as NaN again, so this
# placeholder will not survive a round trip unless the reader disables that.
shipper_2018_0['shipper_party_name'] = shipper_2018_0['shipper_party_name'].fillna('N/A')

- Clean and consolidate 

In [None]:
# Run clean_row over every row of the first 2018 part, passing the alpha-2
# set and the name -> code dictionary built earlier for its lookups.
shipper_2018_0_clean = shipper_2018_0.apply(
    lambda row: clean_row(row, alpha_2_set, country_dict),
    axis=1,
)

# Keep only the columns used from here on. 'identifier' is not among the
# columns read from the CSV, so it is presumably added by clean_row - confirm.
# .copy() makes this an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning.
shipper_2018_0_clean = shipper_2018_0_clean[['identifier', 'shipper_party_name', 'country_code']].copy()

- Map names to ids

In [None]:
# Give every distinct shipper name a sequential integer id, numbered from 0
# in order of first appearance.
unique_shipper_names = shipper_2018_0_clean['shipper_party_name'].unique()
shipper_id_dict = {
    shipper_name: position
    for position, shipper_name in enumerate(unique_shipper_names)
}
# Next free id, equal to the number of names enumerated above
shipper_id_count = len(unique_shipper_names)

# Attach the id to each row by looking its name up in the mapping
shipper_2018_0_clean['id'] = shipper_2018_0_clean['shipper_party_name'].map(shipper_id_dict)

In [None]:

# Persist the cleaned shipper table (identifier, name, country code, id).
# Writing the raw shipper_2018_0 frame here would discard the cleaning and
# id mapping done in the cells above.
# NOTE(review): the target path has no .csv extension, unlike countries.csv -
# confirm whether that is intended before changing it.
shipper_2018_0_clean.to_csv('/Users/jesseputnam/cs-learning/skillstorm/project01/data/final/shippers')