In [1]:
import pandas as pd
import numpy as np
from utils.clean_shipper import clean_row

In [29]:
# Show up to 100 rows when a DataFrame or Series is displayed.
pd.options.display.max_rows = 100

In [4]:
# Load data/final/countries.csv into `ct`.
# NOTE(review): the same path is written further down by `countries.to_csv(countries_file_path)`,
# so on a fresh top-to-bottom run the file must already exist — confirm the intended cell order.
ct = pd.read_csv('/Users/jesseputnam/cs-learning/skillstorm/project01/data/final/countries.csv')

## Country Work

In [2]:
# Columns to keep from the source country table.
countries_cols = ['name', 'alpha-2', 'region', 'sub-region']

# keep_default_na=False turns off pandas' default NaN markers, so literal strings such as
# 'NA' are read as text (presumably to keep an 'NA' alpha-2 code intact — TODO confirm).
countries = pd.read_csv(
    '/Users/jesseputnam/cs-learning/skillstorm/project01/data/all.csv',
    usecols=countries_cols,
    keep_default_na=False,
)

# Index the table by its alpha-2 code and expose that index under the name 'id'.
countries = countries.set_index('alpha-2').rename_axis('id')

# Every alpha-2 code present in the table.
alpha_2_set = set(countries.index)

# Upper-cased value of the first remaining column -> alpha-2 code.
# Each row tuple is (index value, first column, ...); the first column is presumably
# 'name' — verify against the column order of all.csv.
country_dict = {label.upper(): code for code, label, *_ in countries.itertuples()}

# Hand-added spellings on top of the entries built from the table.
# NOTE(review): 'HONG KONG .' is mapped to 'CN' here — confirm that is intended.
country_dict.update({
    'TAIWAN': 'TW',
    'SOUTH KOREA': 'KR',
    'SHANGHAI CN': 'CN',
    'SHANGHAI': 'CN',
    'SHANGHAI .': 'CN',
    'HONG KONG .': 'CN',
    'TAIPEI .': 'TW',
})

In [6]:
# Destination for the re-indexed country table.
countries_file_path = '/Users/jesseputnam/cs-learning/skillstorm/project01/data/final/countries.csv'
# Write `countries` to that path; to_csv includes the index by default, so the index is
# written as the first column. Any existing file at the path is overwritten.
countries.to_csv(countries_file_path)

In [7]:
# Columns to keep from the raw header extract.
header_keep_cols = [
    'conveyance_id',
    'vessel_name',
    'vessel_country_code',
    'carrier_code',
    'identifier',
    'estimated_arrival_date',
    'actual_arrival_date',
    'foreign_port_of_lading',
    'place_of_receipt',
    'port_of_unlading',
    'record_status_indicator',
]

# Load part 0 of the 2018 header data, parsing both arrival-date columns as dates.
header_2018_0 = pd.read_csv(
    '/Users/jesseputnam/cs-learning/skillstorm/project01/data/2018/header_2018_part_0.csv',
    parse_dates=['estimated_arrival_date', 'actual_arrival_date'],
    usecols=header_keep_cols,
)

In [None]:
# Scratch inspections of the header frame, currently disabled:
# header_2018_0.info()
# header_2018_0[header_2018_0['record_status_indicator'] == 'Deleted']
# header_2018_0[['vessel_name', 'vessel_country_code']].value_counts(sort=False).tail(100)
# header_2018_0[['vessel_name', 'vessel_country_code', 'carrier_code']].value_counts(sort=False).tail(100)
# header_2018_0.loc[header_2018_0['vessel_name'] == 'EVER SIGMA']
# header_2018_0[['port_of_unlading', 'foreign_port_of_lading']].value_counts()

# Display the header row(s) whose identifier equals 2018012038125.
header_2018_0.loc[header_2018_0['identifier'].eq(2018012038125)]

Unnamed: 0,identifier,carrier_code,vessel_country_code,vessel_name,port_of_unlading,estimated_arrival_date,foreign_port_of_lading,record_status_indicator,place_of_receipt,conveyance_id,actual_arrival_date
1026717,2018012038125,EGLV,GB,EVER SIGMA,"Los Angeles, California",2018-01-18,"Yantian,China (Mainland)",New,"YANTIAN, CHINA",9300398,2018-01-19


In [None]:
# Print a summary of the header frame (columns, non-null counts, dtypes, memory usage).
header_2018_0.info()

# Shipper work

In [9]:
# Load the 2018 shipper sample file; no column filter is applied, so every column is read.
shipper_sample = pd.read_csv('/Users/jesseputnam/cs-learning/skillstorm/project01/data/sample/shipper_2018.csv')

In [3]:
# Columns to keep from the raw shipper extract.
shipper_keep_cols = [
    'identifier',
    'shipper_party_name',
    'shipper_party_address_1',
    'shipper_party_address_2',
    'shipper_party_address_3',
    'shipper_party_address_4',
    'country_code',
]

# Load part 0 of the 2018 shipper data.
# index_col=False tells pandas not to use the first column as the index.
shipper_2018_0 = pd.read_csv(
    '/Users/jesseputnam/cs-learning/skillstorm/project01/data/2018/shipper_2018_part_0.csv',
    index_col=False,
    usecols=shipper_keep_cols,
)

# Dealing with missing names: replace missing shipper names with the placeholder 'N/A'.
# The result is assigned back to the column instead of calling
# `shipper_2018_0['shipper_party_name'].fillna('N/A', inplace=True)`: that chained in-place
# call acts on an intermediate object, warns on pandas 2.x and leaves the frame unchanged
# when Copy-on-Write is enabled.
shipper_2018_0['shipper_party_name'] = shipper_2018_0['shipper_party_name'].fillna('N/A')

In [4]:
# Call clean_row once per row (axis=1), passing the set of alpha-2 codes and the
# name -> code lookup built in the country section.
# NOTE(review): clean_row is imported from utils.clean_shipper and not visible here;
# presumably it normalises the name and fills in country_code — confirm against that module.
# This is a Python-level call per row, which is slow on a frame of this size.
shipper_test = shipper_2018_0.apply(lambda row: clean_row(row, alpha_2_set, country_dict), axis=1)

In [6]:
# Narrow the cleaned frame to the identifier, shipper name and country code columns.
# The cell rebinds `shipper_test` to a subset of itself; re-running it selects the same
# three columns again.
shipper_test = shipper_test[['identifier', 'shipper_party_name', 'country_code']]

In [8]:
# Summarise the cleaned frame; show_counts=True asks for the non-null count per column,
# which shows how many rows ended up with a country_code.
shipper_test.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 3 columns):
 #   Column              Non-Null Count     Dtype 
---  ------              --------------     ----- 
 0   identifier          10000000 non-null  int64 
 1   shipper_party_name  10000000 non-null  object
 2   country_code        3208767 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.9+ MB


### Dealing with Names

In [12]:
# Run clean_row over every row of the sample, then keep only the identifier,
# shipper name and country code columns.
shipper_sample_cleaned = (
    shipper_sample
    .apply(lambda record: clean_row(record, alpha_2_set, country_dict), axis=1)
    [['identifier', 'shipper_party_name', 'country_code']]
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   identifier          499 non-null    int64 
 1   shipper_party_name  499 non-null    object
 2   country_code        187 non-null    object
dtypes: int64(1), object(2)
memory usage: 11.8+ KB


## Mapping names to id numbers

In [22]:
# Give every distinct shipper name a sequential integer id, numbered from 0 in the
# order the names are returned by unique().
shipper_id_dict = {
    shipper_name: position
    for position, shipper_name in enumerate(shipper_sample_cleaned['shipper_party_name'].unique())
}
# Number of ids handed out, which is also the next unused id.
shipper_id_count = len(shipper_id_dict)

# Attach the id to each row by looking its shipper name up in the mapping.
shipper_sample_cleaned['id'] = shipper_sample_cleaned['shipper_party_name'].map(shipper_id_dict)
shipper_sample_cleaned


Unnamed: 0,identifier,shipper_party_name,country_code,id
0,201801010,JET FAST COMPANY LIMITED,TW,0
1,201801011,UNION WONDERFUL MACHINERY LTD.,TW,1
2,201801012,"SUMEEKO INDUSTRIES CO.,LTD.",TW,2
3,201801013,YUTY INDUSTRIES CO. LTD.,TW,3
4,201801014,"BE SOUND CO., LTD.",TW,4
...,...,...,...,...
494,20180101612,CHIN LIH HSING PRECISION,,364
495,20180101613,CHIN LIH HSING PRECISION,,364
496,20180101614,"NEW LAND LIGHTING(HK) INDUSTRY CO.,",,365
497,20180101615,"AURORA AUTO PARTS C.,LTD",CN,366


### Combine shipper CSVs into one file

In [26]:
# Destination of the combined raw shipper file.
shipper_raw_path = '/Users/jesseputnam/cs-learning/skillstorm/project01/data/final_raw/shipper.csv'

# Source files: two parts per year for 2018, 2019 and 2020, in that order.
shipper_part_paths = (
    '/Users/jesseputnam/cs-learning/skillstorm/project01/data/2018/shipper_2018_part_0.csv',
    '/Users/jesseputnam/cs-learning/skillstorm/project01/data/2018/shipper_2018_part_1.csv',
    '/Users/jesseputnam/cs-learning/skillstorm/project01/data/2019/shipper_2019_part_0.csv',
    '/Users/jesseputnam/cs-learning/skillstorm/project01/data/2019/shipper_2019_part_1.csv',
    '/Users/jesseputnam/cs-learning/skillstorm/project01/data/2020/shipper_2020_part_0.csv',
    '/Users/jesseputnam/cs-learning/skillstorm/project01/data/2020/shipper_2020_part_1.csv',
)

# Read each part with the shared column filter and bind them to shipper_0 .. shipper_5.
shipper_0, shipper_1, shipper_2, shipper_3, shipper_4, shipper_5 = (
    pd.read_csv(part_path, usecols=shipper_keep_cols) for part_path in shipper_part_paths
)

In [27]:
# Write all six shipper parts into one CSV at shipper_raw_path.
# The first write uses mode='x', which fails if the file already exists instead of
# overwriting it; the remaining parts are appended with mode='a'.
# Only the first write emits the column header: to_csv writes a header on every call by
# default, so appending without header=False repeated the header line inside the combined file.
# NOTE(review): each part still writes its own row index as the first (unnamed) column, and
# those indexes presumably restart for every part — pass index=False if that column is unwanted.
for part_position, shipper_part in enumerate(
    (shipper_0, shipper_1, shipper_2, shipper_3, shipper_4, shipper_5)
):
    is_first_part = part_position == 0
    shipper_part.to_csv(
        shipper_raw_path,
        mode='x' if is_first_part else 'a',
        header=is_first_part,
    )