In [70]:
import pandas as pd
import numpy as np

## Set up countries table

- Get table of countries with alpha-2 code that includes region from repository
    - https://github.com/lukes/ISO-3166-Countries-with-Regional-Codes
- Keep only the needed columns, set `alpha-2` as the index, and rename the index to `id`

In [68]:
countries_cols = ['name', 'alpha-2', 'region', 'sub-region']

# pandas' default NA markers include the literal string 'NA', which is also
# Namibia's alpha-2 code. Treat only empty fields as missing so that code is
# kept as a string instead of turning into NaN (which would put a NaN label
# in the index built below).
countries = pd.read_csv(
    '/Users/jesseputnam/cs-learning/skillstorm/project01/data/all.csv',
    usecols=countries_cols,
    keep_default_na=False,
    na_values=[''],
)

# Copy alpha-2 and name columns for country_lookup conversion table.
# Names are upper-cased so the table is keyed on upper-case text.
country_lookup = countries[['name', 'alpha-2']].copy()
country_lookup['name'] = country_lookup['name'].str.upper()

# For ease of searching, change 'Taiwan, Province of China' to 'Taiwan'
country_lookup.loc[country_lookup['name'].str.contains('TAIWAN'), 'name'] = 'TAIWAN'

# Change countries index column to be alpha-2 values and rename to id
countries = countries.set_index('alpha-2').rename_axis('id')

# Change country_lookup index column to the (upper-cased) name values
country_lookup = country_lookup.set_index('name')

- Convert altered countries dataframe to csv

In [12]:
# Write the id-indexed countries table to the final data folder
countries_file_path = '/Users/jesseputnam/cs-learning/skillstorm/project01/data/final/countries.csv'
countries.to_csv(path_or_buf=countries_file_path)


- Set up lookup table to convert country names to country codes

## Set up Shipper table

In [3]:
# Choose columns to keep
shipper_keep_cols = [
    'shipper_party_name',
    'shipper_party_address_1',
    'shipper_party_address_2',
    'shipper_party_address_3',
    'shipper_party_address_4',
    'city',
    'state_province',
    'zip_code',
    'country_code',
]

# Load both parts of the 2018 shipper data, restricted to the kept columns
shipper_2018_0 = pd.read_csv('/Users/jesseputnam/cs-learning/skillstorm/project01/data/2018/shipper_2018_part_0.csv', index_col=False, usecols=shipper_keep_cols)
shipper_2018_1 = pd.read_csv('/Users/jesseputnam/cs-learning/skillstorm/project01/data/2018/shipper_2018_part_1.csv', usecols=shipper_keep_cols)

# Replace NaN in name with the 'N/A' placeholder.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form warns on recent pandas and does
# not update the parent frame when Copy-on-Write is enabled.
# NOTE(review): only part 0 is filled here; part 1 still has missing names — confirm intended.
shipper_2018_0['shipper_party_name'] = shipper_2018_0['shipper_party_name'].fillna('N/A')

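- Clean each shipper row: repair shifted name fields, consolidate the address columns into one, and fill in the country code where it can be found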

In [None]:
def clean_mr_mrs_error(row):
    """Undo the one-column shift caused by an unquoted 'Mr AND/OR MRS.,' name.

    Each field from the name through address 3 is overwritten with the value
    of the field to its right (name <- address_1, address_1 <- address_2, ...),
    and address 4 is then blanked with NaN. The previous name value is
    discarded. The row is modified in place and also returned.
    """

    shifted_fields = ['shipper_party_name'] + [
        f'shipper_party_address_{idx}' for idx in range(1, 5)
    ]
    # Walk left to right so every source is read before it is overwritten
    for target, source in zip(shifted_fields, shifted_fields[1:]):
        row[target] = row[source]
    row[shifted_fields[-1]] = np.nan
    return row

def combine_addresses(row, cols):
    """Join the first ``cols`` address columns of ``row`` into one string.

    Columns ``shipper_party_address_1`` .. ``shipper_party_address_{cols}``
    (end inclusive) are read in order. Missing values (NaN/None) are skipped;
    the remaining values are converted with ``str`` and joined with a single
    space.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series) indexed by column name
    cols : int
        Number of address columns to include, counted from 1.

    Returns
    -------
    str
        The combined address with surrounding whitespace stripped; an empty
        string when every requested column is missing or ``cols`` is 0.
    """

    parts = []
    for i in range(1, cols + 1):
        value = row[f"shipper_party_address_{i}"]
        # Test for missing BEFORE converting to str: str(nan) is the text
        # 'nan', which would otherwise be written into the address.
        if pd.isna(value):
            continue
        parts.append(str(value))
    return ' '.join(parts).strip()

def handle_n_address_cols(row, n):
    """Set the country code from address column ``n`` and consolidate the address.

    The value in ``shipper_party_address_{n}`` is tested, in order, against
    the ``countries`` index (codes) and the ``country_lookup`` index
    (upper-cased country names):

    * match on a code  -> ``country_code`` is set to that code;
    * match on a name  -> ``country_code`` is set to the looked-up alpha-2 code;
    * no match         -> ``country_code`` is set to ``'N/A'``.

    On a match the first ``n - 1`` address columns are combined into
    ``shipper_party_address_1`` (column ``n`` held the country, so it is left
    out); with no match all ``n`` columns are combined.

    Parameters
    ----------
    row : pandas Series for one shipper record (modified in place)
    n : int
        Number of the address column to inspect.

    Returns
    -------
    The same ``row`` object.
    """

    raw_value = row[f'shipper_party_address_{n}']
    # The lookup table is keyed on upper-case names, so normalise the probe the
    # same way; otherwise padded or mixed-case values never match. Non-string
    # values are passed through unchanged.
    candidate = raw_value.strip().upper() if isinstance(raw_value, str) else raw_value

    if candidate in countries.index:
        row['country_code'] = candidate
    elif candidate in country_lookup.index:
        row['country_code'] = country_lookup.loc[candidate, 'alpha-2']
    else:
        # Not a recognisable country: keep column n as part of the address
        row['country_code'] = 'N/A'
        row['shipper_party_address_1'] = combine_addresses(row, n)
        return row
    row['shipper_party_address_1'] = combine_addresses(row, n - 1)
    return row

def clean_row(row):
    """Clean one shipper row: fix the name shift, consolidate the address, set the country code.

    Steps, in order:

    1. If the name contains 'mr. and/or' (case-insensitive), shift the fields
       back into place with ``clean_mr_mrs_error``.
    2. If ``country_code`` is present and is a known code in ``countries``,
       combine all four address columns into ``shipper_party_address_1`` and
       return.
    3. Otherwise take the last non-missing address column (4 down to 1) and
       let ``handle_n_address_cols`` derive the country code from it.

    Parameters
    ----------
    row : pandas Series for one shipper record (modified in place)

    Returns
    -------
    The cleaned row. A row with no usable country code and no address values
    is returned unchanged.
    """

    # Handle the mr/mrs error. A missing name is a float NaN with no .lower(),
    # so only string names are inspected.
    name = row['shipper_party_name']
    if isinstance(name, str) and 'mr. and/or' in name.lower():
        row = clean_mr_mrs_error(row)

    # Consolidate address columns. pd.isna is used rather than an identity
    # comparison with np.nan, which misses NaN values that are not that
    # exact object.
    code = row['country_code']
    if not pd.isna(code) and code in countries.index:
        row['shipper_party_address_1'] = combine_addresses(row, 4)
        return row

    # Scan from the last address column to the first: the last populated one
    # is the candidate for holding the country.
    for n in (4, 3, 2, 1):
        if not pd.isna(row[f'shipper_party_address_{n}']):
            return handle_n_address_cols(row, n)

    return row

In [7]:
# Summarise the part-1 shipper frame: per-column non-null counts and dtypes
shipper_2018_1.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4654643 entries, 0 to 4654642
Data columns (total 9 columns):
 #   Column                   Non-Null Count    Dtype 
---  ------                   --------------    ----- 
 0   shipper_party_name       4654640 non-null  object
 1   shipper_party_address_1  4654639 non-null  object
 2   shipper_party_address_2  4090433 non-null  object
 3   shipper_party_address_3  2934400 non-null  object
 4   shipper_party_address_4  503952 non-null   object
 5   city                     1214756 non-null  object
 6   state_province           407030 non-null   object
 7   zip_code                 595539 non-null   object
 8   country_code             1164673 non-null  object
dtypes: object(9)
memory usage: 319.6+ MB


In [None]:

# Export the part-0 shipper frame to the final data folder.
# NOTE(review): the target name has no '.csv' extension (countries.csv has one),
# and only part 0 is written — confirm both are intended.
shipper_2018_0.to_csv(path_or_buf='/Users/jesseputnam/cs-learning/skillstorm/project01/data/final/shippers')