In [168]:
import pandas as pd
import numpy as np

In [56]:
pd.set_option('display.max_rows', 100)

In [71]:
ct = pd.read_csv('/Users/jesseputnam/cs-learning/skillstorm/project01/data/final/countries.csv')

## Country Work

In [256]:
countries_cols = ['name', 'alpha-2', 'region', 'sub-region']
countries = pd.read_csv('/Users/jesseputnam/cs-learning/skillstorm/project01/data/all.csv', usecols=countries_cols)

# Copy alpha-2 and name columns for country_lookup conversion table
country_lookup = countries[['name', 'alpha-2']].copy()
country_lookup['name'] = country_lookup['name'].str.upper()

# For ease of searching, change 'Taiwan, Province of China' to 'Taiwan'
country_lookup.loc[country_lookup['name'].str.contains('TAIWAN'),'name'] = 'TAIWAN'

# Change countries index column to be alpha-2 values and rename to id
countries.set_index('alpha-2', inplace=True)
countries.index.name = 'id'

# change country_lookup index col to name values
country_lookup.set_index('name', inplace=True)

In [32]:
countries_file_path = '/Users/jesseputnam/cs-learning/skillstorm/project01/data/final/countries.csv'
countries.to_csv(countries_file_path)

In [99]:
header_keep_cols = ['conveyance_id', 'vessel_name', 'vessel_country_code', 'carrier_code', 'identifier', 'estimated_arrival_date', 'actual_arrival_date', 'foreign_port_of_lading', 'place_of_receipt', 'port_of_unlading', 'record_status_indicator']

header_2018_0 = pd.read_csv('/Users/jesseputnam/cs-learning/skillstorm/project01/data/2018/header_2018_part_0.csv', parse_dates=['estimated_arrival_date', 'actual_arrival_date'], usecols=header_keep_cols)

In [105]:
# header_2018_0.info()
# header_2018_0[header_2018_0['record_status_indicator'] == 'Deleted']
# header_2018_0[['vessel_name', 'vessel_country_code']].value_counts(sort=False).tail(100)
# header_2018_0[['vessel_name', 'vessel_country_code', 'carrier_code']].value_counts(sort=False).tail(100)
# header_2018_0.loc[header_2018_0['vessel_name'] == 'EVER SIGMA']
# header_2018_0[['port_of_unlading', 'foreign_port_of_lading']].value_counts()
header_2018_0[header_2018_0['identifier'] == 2018012038125]

Unnamed: 0,identifier,carrier_code,vessel_country_code,vessel_name,port_of_unlading,estimated_arrival_date,foreign_port_of_lading,record_status_indicator,place_of_receipt,conveyance_id,actual_arrival_date
1026717,2018012038125,EGLV,GB,EVER SIGMA,"Los Angeles, California",2018-01-18,"Yantian,China (Mainland)",New,"YANTIAN, CHINA",9300398,2018-01-19


In [None]:
header_2018_0.info()

# Shipper work

In [258]:
shipper_sample = pd.read_csv('/Users/jesseputnam/cs-learning/skillstorm/project01/data/sample/shipper_2018.csv')

In [259]:
shipper_sample

Unnamed: 0,identifier,shipper_party_name,shipper_party_address_1,shipper_party_address_2,shipper_party_address_3,shipper_party_address_4,city,state_province,zip_code,country_code,contact_name,comm_number_qualifier,comm_number
0,201801010,JET FAST COMPANY LIMITED,"NO.5 JWU GONG 2ND LANE,","JWU HO VILLAGE,REN WU HSIANG",KAOHSIUNG,TAIWAN,,,,,,,
1,201801011,UNION WONDERFUL MACHINERY LTD.,"NO.51-10 PEI TAO. PEI TAOLI,","TAMSUI DISTRICT,",NEW TAIPEI CITY,TAIWAN,,,,,,,
2,201801012,"SUMEEKO INDUSTRIES CO.,LTD.","NO.20 HUAXI RD., TA-FA INDUSTRIAL","DISTRICT,",KAOHSIUNG,TAIWAN,,,,,,,
3,201801013,YUTY INDUSTRIES CO. LTD.,"NO. 13, SUN PO ROAD, CHUNG-LI",TAOYUAN,TAIWAN,,,,,,,,
4,201801014,"BE SOUND CO., LTD.","1F, NO. 765 MIN-TSU EAST ROAD,",TAIPEI (TAIBEI),TAIWAN,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,20180101612,CHIN LIH HSING PRECISION,"ENTERPRISE CO., LTD., NO.10,","LANE 711, CHUNG-CHENG ROAD,","SHU LIN DIST, NEW TAIPEI CITY,",,,,,,,Telephone Number,TAIWAN
495,20180101613,CHIN LIH HSING PRECISION,"ENTERPRISE CO., LTD., NO.10,","LANE 711, CHUNG-CHENG ROAD,","SHU LIN DIST, NEW TAIPEI CITY,",,,,,,,Telephone Number,TAIWAN
496,20180101614,"NEW LAND LIGHTING(HK) INDUSTRY CO.,","LONGCHANG INDUSTRIAL ZONE#1,LONGHUA","ROAD,ZHOUWU DISTRICT, DONGGUAN CITY","GUANGDONG PROVINCE, CHINA",,,,,,,,
497,20180101615,"AURORA AUTO PARTS C.,LTD",NO. 34 SIBEI TONG JING ROAD,,,,GUANGZHOU,,,CN,,,


In [233]:
shipper_keep_cols = ['identifier', 'shipper_party_name', 'shipper_party_address_1', 'shipper_party_address_2', 'shipper_party_address_3', 'shipper_party_address_4', 'country_code']

shipper_2018_0 = pd.read_csv('/Users/jesseputnam/cs-learning/skillstorm/project01/data/2018/shipper_2018_part_0.csv', index_col=False, usecols=shipper_keep_cols)

# Dealing with missing names
shipper_2018_0['shipper_party_name'].fillna('N/A', inplace=True)

### Dealing with Names

In [246]:
misplaced_names = shipper_2018_0[shipper_2018_0['shipper_party_name'].str.contains("mr. and/or", case=False)]

In [254]:
def clean_mr_mrs_error(row):
    """Cleans the error where a non-quotation surrounded name where 'Mr AND/OR MRS.,' appears and erroneously splits the names to the next column"""

    row['shipper_party_name'] = row['shipper_party_address_1']
    row['shipper_party_address_1'] = row['shipper_party_address_2']
    row['shipper_party_address_2'] = row['shipper_party_address_3']
    row['shipper_party_address_3'] = row['shipper_party_address_4']
    row['shipper_party_address_4'] = np.nan
    return row

def combine_addresses(row, cols):
    """Combine address columns if not NaN for the number of columns wanted, starting at 1, end inclusive"""

    output = ''
    for i in range(cols):
        address_col = str(row[f"shipper_party_address_{i + 1}"]) + ' '
        if not pd.isna(address_col):
            output += address_col
    return output.strip()

def handle_n_address_cols(row, n):
    """Handle consolidating address columns from column n"""

    if row[f'shipper_party_address_{n}'] in countries.index:
        row['country_code'] = row[f'shipper_party_address_{n}']
    elif row[f'shipper_party_address_{n}'] in country_lookup.index:
        row['country_code'] = country_lookup.loc[row[f'shipper_party_address_{n}']]['alpha-2']
    else:
        row['country_code'] = 'N/A'
        row['shipper_party_address_1'] = combine_addresses(row, n)
        return row
    row['shipper_party_address_1'] = combine_addresses(row, n - 1)
    return row

def clean_row(row):
    """Cleans the row by fixing naming issues, consolidating address, and adding correct country code if found"""

    # Handle the mr/mrs error
    if 'mr. and/or' in row['shipper_party_name'].lower():
        row = clean_mr_mrs_error(row)

    # Consolidate address columns
    if row['country_code'] is not np.nan:
        if row['country_code'] in countries.index:
            row['shipper_party_address_1'] = combine_addresses(row, 4)
            return row
    if not pd.isna(row['shipper_party_address_4']):
        return handle_n_address_cols(row, 4)
    if not pd.isna(row['shipper_party_address_3']):
        return handle_n_address_cols(row, 3)
    if not pd.isna(row['shipper_party_address_2']):
        return handle_n_address_cols(row, 2)
    if not pd.isna(row['shipper_party_address_1']):
        return handle_n_address_cols(row, 1)
        
    return row

In [264]:
# misplaced_names = misplaced_names.apply(lambda row: clean_row(row), axis=1)
# misplaced_names

shipper_sample.apply(lambda row: clean_row(row), axis=1)[['identifier', 'shipper_party_name', 'shipper_']]
# shipper_sample

Unnamed: 0,identifier,shipper_party_name,shipper_party_address_1,shipper_party_address_2,shipper_party_address_3,shipper_party_address_4,city,state_province,zip_code,country_code,contact_name,comm_number_qualifier,comm_number
0,201801010,JET FAST COMPANY LIMITED,"NO.5 JWU GONG 2ND LANE, JWU HO VILLAGE,REN WU ...","JWU HO VILLAGE,REN WU HSIANG",KAOHSIUNG,TAIWAN,,,,TW,,,
1,201801011,UNION WONDERFUL MACHINERY LTD.,"NO.51-10 PEI TAO. PEI TAOLI, TAMSUI DISTRICT, ...","TAMSUI DISTRICT,",NEW TAIPEI CITY,TAIWAN,,,,TW,,,
2,201801012,"SUMEEKO INDUSTRIES CO.,LTD.","NO.20 HUAXI RD., TA-FA INDUSTRIAL DISTRICT, KA...","DISTRICT,",KAOHSIUNG,TAIWAN,,,,TW,,,
3,201801013,YUTY INDUSTRIES CO. LTD.,"NO. 13, SUN PO ROAD, CHUNG-LI TAOYUAN",TAOYUAN,TAIWAN,,,,,TW,,,
4,201801014,"BE SOUND CO., LTD.","1F, NO. 765 MIN-TSU EAST ROAD, TAIPEI (TAIBEI)",TAIPEI (TAIBEI),TAIWAN,,,,,TW,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,20180101612,CHIN LIH HSING PRECISION,"ENTERPRISE CO., LTD., NO.10, LANE 711, CHUNG-C...","LANE 711, CHUNG-CHENG ROAD,","SHU LIN DIST, NEW TAIPEI CITY,",,,,,,,Telephone Number,TAIWAN
495,20180101613,CHIN LIH HSING PRECISION,"ENTERPRISE CO., LTD., NO.10, LANE 711, CHUNG-C...","LANE 711, CHUNG-CHENG ROAD,","SHU LIN DIST, NEW TAIPEI CITY,",,,,,,,Telephone Number,TAIWAN
496,20180101614,"NEW LAND LIGHTING(HK) INDUSTRY CO.,","LONGCHANG INDUSTRIAL ZONE#1,LONGHUA ROAD,ZHOUW...","ROAD,ZHOUWU DISTRICT, DONGGUAN CITY","GUANGDONG PROVINCE, CHINA",,,,,,,,
497,20180101615,"AURORA AUTO PARTS C.,LTD",NO. 34 SIBEI TONG JING ROAD nan nan nan,,,,GUANGZHOU,,,CN,,,


In [None]:
shipper_2018_0['shipper_party_name'] = shipper_2018_0.apply(lambda row: replace_mr_mrs(row), axis=1)
shipper_2018_0[shipper_2018_0['shipper_party_name'].str.contains('and/or', case=False)]

In [236]:
# shipper_2018_0['country_code'].value_counts()
n = shipper_2018_0['shipper_party_address_1'].dropna().values.tolist()
total = 0
errors = 0
for i in n:
    total += 1
    if len(i) == 2:
        if i in countries.index:
            print('------- ' + countries.loc[i]['name'])
        elif i in country_lookup.index:
            print("++++++++ " + i)
        else:
            errors += 1
            print(f"ERROR {i}: total: {errors}")
print(total)

------- Chad
------- Chad
------- Cabo Verde
------- Cabo Verde
------- Chad
------- Chad
------- Chad
------- Chad
------- Chad
------- Chad
ERROR D.: total: 1
ERROR D.: total: 2
ERROR 21: total: 3
------- Switzerland
ERROR D.: total: 4
ERROR D.: total: 5
------- Chad
ERROR ED: total: 6
------- Chad
------- Cabo Verde
------- Cabo Verde
------- Cabo Verde
------- Colombia
ERROR D.: total: 7
ERROR HQ: total: 8
------- Switzerland
------- Chad
------- Chad
ERROR 3,: total: 9
------- Chad
------- Chad
------- Chad
ERROR D.: total: 10
------- Chad
ERROR D.: total: 11
------- Chad
ERROR D.: total: 12
------- China
------- Chad
------- Kyrgyzstan
ERROR T): total: 13
ERROR T): total: 14
ERROR T): total: 15
ERROR OY: total: 16
------- Chad
------- Chad
------- Chad
ERROR ED: total: 17
------- Cabo Verde
------- Chad
------- Chad
------- Chad
------- Chad
------- Chad
------- Switzerland
------- Switzerland
------- Switzerland
------- Switzerland
------- Chad
ERROR D.: total: 18
------- Chad
-

### Rules for dealing with address

#### virgin islands

In [80]:
shipper_2018_0['shipper_party_address_4'].value_counts()
shipper_2018_0[(shipper_2018_0['shipper_party_address_4'] == 'ST THOMAS') | (shipper_2018_0['shipper_party_address_3'] == 'ST THOMAS') | (shipper_2018_0['shipper_party_address_2'] == 'ST THOMAS')]

Unnamed: 0,identifier,shipper_party_name,shipper_party_address_1,shipper_party_address_2,shipper_party_address_3,shipper_party_address_4,city,state_province,zip_code,country_code
120916,20180103107079,RODRIGUEZ AUTO PARTS,PO BOX 302576,"ST THOMAS, VI 00802",ST THOMAS,VIRGINISLANDS VI,,,,
219593,2018010624993,DOUBLE DEUCE JAMAICA LTD,YALLAHS INDUSTRIAL ESTATE,POORMAN'S CORNER,ST THOMAS,JAMAICA,,,,
471217,2018011272211,V.I. REGULATED WASTE MGMT. INC.,P.O. BOX 222994,"CHRISTIANSTED, VI 00822",ST THOMAS,VIRGIN ISLANDS VI,,,,
473283,201801132456,YLA GOODINGS,133 ESTATE FREDENHJ,"ST THOMAS, VI",ST THOMAS,VIRGIN ISLANDS VI,,,,
558078,2018011567250,DOUBLE DEUCE JAMAICA LTD,YALLAHS INDUSTRIAL ESTATE,POORMAN'S CORNER,ST THOMAS,JAMAICA,,,,
...,...,...,...,...,...,...,...,...,...,...
8778289,2018081485531,DOUBLE DEUCE JAMAICA LTD,YALLAHS INDUSTRIAL ESTATE,ST THOMAS,JAMAICA,,,,,
9510333,2018083173905,SANTOS GUERRERO,ST THOMAS,"ST THOMAS, USVI",ST THOMAS,VIRGIN ISLANDS VI,,,,
9557742,2018090163081,DOUBLE DEUCE JAMAICA LTD,YALLAHS INDUSTRIAL ESTATE,ST THOMAS,JAMAICA W.I.,,,,,
9618554,2018090421225,NESTOR RAMOS,LIMBERS BAY #61,"ST THOMAS, USVI",ST THOMAS,VIRGIN ISLANDS VI,,,,


In [70]:
ct.iloc[217]

id                      TW
name                Taiwan
region                Asia
sub-region    Eastern Asia
Name: 217, dtype: object