# Delivery Time Prediction

In [108]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import OrdinalEncoder
import datetime
import pytz



## EDA

Loading data

In [109]:
# Read only the first 300,000 records of the raw extract; the first CSV column becomes the index.
df_ebay = pd.read_csv(
    '../data/raw/ebayShort.csv',
    index_col=0,
    nrows=300000,
)

### Explore Data


Because the full dataset is too large, we first work with a subset of it: the first 300,000 rows.

In [110]:
# Look at the first 5 row

df_ebay.head()

Unnamed: 0,b2c_c2c,seller_id,declared_handling_days,acceptance_scan_timestamp,shipment_method_id,shipping_fee,carrier_min_estimate,carrier_max_estimate,item_zip,buyer_zip,category_id,item_price,quantity,payment_datetime,delivery_date,weight,weight_units,package_size,record_number
0,B2C,25454,3.0,2019-03-26 15:11:00.000-07:00,0,0.0,3,5,97219,49040,13,27.95,1,2019-03-24 03:56:49.000-07:00,2019-03-29,5,1,LETTER,1
1,C2C,6727381,2.0,2018-06-02 12:53:00.000-07:00,0,3.0,3,5,11415-3528,62521,0,20.5,1,2018-06-01 13:43:54.000-07:00,2018-06-05,0,1,PACKAGE_THICK_ENVELOPE,2
2,B2C,18507,1.0,2019-01-07 16:22:00.000-05:00,0,4.5,3,5,27292,53010,1,19.9,1,2019-01-06 00:02:00.000-05:00,2019-01-10,9,1,PACKAGE_THICK_ENVELOPE,3
3,B2C,4677,1.0,2018-12-17 16:56:00.000-08:00,0,0.0,3,5,90703,80022,1,35.5,1,2018-12-16 10:28:28.000-08:00,2018-12-21,8,1,PACKAGE_THICK_ENVELOPE,4
4,B2C,4677,1.0,2018-07-27 16:48:00.000-07:00,0,0.0,3,5,90703,55070,1,25.0,1,2018-07-26 18:20:02.000-07:00,2018-07-30,3,1,PACKAGE_THICK_ENVELOPE,5


In [111]:
#Look at 5 random row
df_ebay.sample(5)

Unnamed: 0,b2c_c2c,seller_id,declared_handling_days,acceptance_scan_timestamp,shipment_method_id,shipping_fee,carrier_min_estimate,carrier_max_estimate,item_zip,buyer_zip,category_id,item_price,quantity,payment_datetime,delivery_date,weight,weight_units,package_size,record_number
286208,C2C,360472,1.0,2018-10-22 17:26:00.000-07:00,1,7.9,2,5,90041,90010,11,19.99,1,2018-10-20 13:25:18.000-07:00,2018-10-23,48,1,PACKAGE_THICK_ENVELOPE,286209
111515,B2C,71100,2.0,2018-10-23 18:22:00.000-05:00,1,6.5,2,5,37216,49001,0,19.95,1,2018-10-22 19:25:25.000-05:00,2018-10-25,0,1,PACKAGE_THICK_ENVELOPE,111516
89541,C2C,98404,3.0,2018-06-27 15:42:00.000-04:00,1,8.75,2,5,1543,21053,0,22.99,1,2018-06-27 07:53:13.000-04:00,2018-06-29,32,1,PACKAGE_THICK_ENVELOPE,89542
58568,B2C,1370,1.0,2018-05-22 11:39:00.000-04:00,6,0.0,2,5,44614,36575,17,20.95,1,2018-05-21 21:21:01.000-04:00,2018-05-24,21,1,PACKAGE_THICK_ENVELOPE,58569
33039,B2C,3887,1.0,2019-07-23 21:47:00.000-04:00,0,0.0,3,5,11435,77388,5,103.95,2,2019-07-23 09:14:20.000-04:00,2019-07-25,20,1,LETTER,33040


In [112]:
# Check the shape of the dataset: shape[0] is the number of rows, shape[1] the number of columns
n_rows, n_cols = df_ebay.shape
print(f'There are {n_rows} rows and {n_cols} columns')

There are 300000 columns and 19 rows


In [113]:
# Get a quick overview of dataset variables
df_ebay.dtypes

b2c_c2c                       object
seller_id                      int64
declared_handling_days       float64
acceptance_scan_timestamp     object
shipment_method_id             int64
shipping_fee                 float64
carrier_min_estimate           int64
carrier_max_estimate           int64
item_zip                      object
buyer_zip                     object
category_id                    int64
item_price                   float64
quantity                       int64
payment_datetime              object
delivery_date                 object
weight                         int64
weight_units                   int64
package_size                  object
record_number                  int64
dtype: object

Show the columns that are not numeric

In [114]:
# 5 rows showing only 'object' columns
df_ebay.select_dtypes('object').head()

Unnamed: 0,b2c_c2c,acceptance_scan_timestamp,item_zip,buyer_zip,payment_datetime,delivery_date,package_size
0,B2C,2019-03-26 15:11:00.000-07:00,97219,49040,2019-03-24 03:56:49.000-07:00,2019-03-29,LETTER
1,C2C,2018-06-02 12:53:00.000-07:00,11415-3528,62521,2018-06-01 13:43:54.000-07:00,2018-06-05,PACKAGE_THICK_ENVELOPE
2,B2C,2019-01-07 16:22:00.000-05:00,27292,53010,2019-01-06 00:02:00.000-05:00,2019-01-10,PACKAGE_THICK_ENVELOPE
3,B2C,2018-12-17 16:56:00.000-08:00,90703,80022,2018-12-16 10:28:28.000-08:00,2018-12-21,PACKAGE_THICK_ENVELOPE
4,B2C,2018-07-27 16:48:00.000-07:00,90703,55070,2018-07-26 18:20:02.000-07:00,2018-07-30,PACKAGE_THICK_ENVELOPE


### Data cleaning

First, let's run some common checks on the dataset:
 -  Null value
 -  Duplicate row

In [115]:
# checking duplicate rows
df_ebay.duplicated().any()

False

There are no duplicate rows in the dataset.

In [116]:
#Checking null
df_ebay.isna().sum()

b2c_c2c                          0
seller_id                        0
declared_handling_days       14348
acceptance_scan_timestamp        0
shipment_method_id               0
shipping_fee                     0
carrier_min_estimate             0
carrier_max_estimate             0
item_zip                         0
buyer_zip                        0
category_id                      0
item_price                       0
quantity                         0
payment_datetime                 0
delivery_date                    0
weight                           0
weight_units                     0
package_size                     0
record_number                    0
dtype: int64

There are missing values in the `declared_handling_days` column.

Check whether any columns contain unusual values.

In [155]:
df_ebay[(df_ebay['carrier_max_estimate']<=0 ) | (df_ebay['carrier_min_estimate']<=0)]

Unnamed: 0,b2c_c2c,seller_id,declared_handling_days,acceptance_scan_timestamp,shipment_method_id,shipping_fee,carrier_min_estimate,carrier_max_estimate,item_zip,buyer_zip,category_id,item_price,quantity,payment_datetime,delivery_date,weight,weight_units,package_size,record_number
19357,C2C,708260,2.0,2018-09-19,12,65.0,-1,-1,45504,01412000,3,61.0,1,2018-09-18,2018-10-08,48,1,PACKAGE_THICK_ENVELOPE,19358
19975,B2C,908,1.0,2019-01-05,12,12.0,-1,-1,03104,28232,2,22.44,1,2019-01-03,2019-01-31,0,1,LETTER,19976
27005,B2C,269,0.0,2018-08-21,12,1.7,-1,-1,93535-4990,L1T 3L7,4,2.2,1,2018-08-17,2018-08-29,2,1,PACKAGE_THICK_ENVELOPE,27006
34165,C2C,173517,2.0,2019-10-09,15,0.0,0,1,32064,12570,3,15.0,1,2019-10-07,2019-10-10,0,1,PACKAGE_THICK_ENVELOPE,34166
38042,B2C,102172,1.0,2018-01-30,12,0.95,-1,-1,53538,7570090,4,8.0,1,2018-01-29,2018-02-15,0,1,NONE,38043
42391,B2C,102172,1.0,2019-01-23,12,0.0,-1,-1,53538,7570090,4,10.55,1,2019-01-21,2019-01-30,0,1,NONE,42392
63216,B2C,1762594,2.0,2018-05-19,12,0.0,-1,-1,94124,80099,30,105.0,1,2018-05-17,2018-05-22,16,1,PACKAGE_THICK_ENVELOPE,63217
66574,B2C,40345,4.0,2018-11-28,12,10.0,-1,-1,48038,B3H 2T1,11,39.99,1,2018-11-27,2018-12-10,3,1,PACKAGE_THICK_ENVELOPE,66575
66892,C2C,225573,3.0,2019-01-12,15,0.0,0,1,55124,44647,9,28.96,3,2019-01-10,2019-01-14,0,1,NONE,66893
73682,B2C,8073,3.0,2018-06-07,15,0.0,0,1,89030,70769,5,11.99,1,2018-06-05,2018-06-09,0,1,NONE,73683


In [156]:
# Keep only the rows where either carrier estimate is strictly negative
# (the previous cell used <= 0, which also matched estimates equal to zero).
df= df_ebay[(df_ebay['carrier_max_estimate']<0 ) | (df_ebay['carrier_min_estimate']<0)]
print(df.shape)
df

(25, 19)


Unnamed: 0,b2c_c2c,seller_id,declared_handling_days,acceptance_scan_timestamp,shipment_method_id,shipping_fee,carrier_min_estimate,carrier_max_estimate,item_zip,buyer_zip,category_id,item_price,quantity,payment_datetime,delivery_date,weight,weight_units,package_size,record_number
19357,C2C,708260,2.0,2018-09-19,12,65.0,-1,-1,45504,01412000,3,61.0,1,2018-09-18,2018-10-08,48,1,PACKAGE_THICK_ENVELOPE,19358
19975,B2C,908,1.0,2019-01-05,12,12.0,-1,-1,03104,28232,2,22.44,1,2019-01-03,2019-01-31,0,1,LETTER,19976
27005,B2C,269,0.0,2018-08-21,12,1.7,-1,-1,93535-4990,L1T 3L7,4,2.2,1,2018-08-17,2018-08-29,2,1,PACKAGE_THICK_ENVELOPE,27006
38042,B2C,102172,1.0,2018-01-30,12,0.95,-1,-1,53538,7570090,4,8.0,1,2018-01-29,2018-02-15,0,1,NONE,38043
42391,B2C,102172,1.0,2019-01-23,12,0.0,-1,-1,53538,7570090,4,10.55,1,2019-01-21,2019-01-30,0,1,NONE,42392
63216,B2C,1762594,2.0,2018-05-19,12,0.0,-1,-1,94124,80099,30,105.0,1,2018-05-17,2018-05-22,16,1,PACKAGE_THICK_ENVELOPE,63217
66574,B2C,40345,4.0,2018-11-28,12,10.0,-1,-1,48038,B3H 2T1,11,39.99,1,2018-11-27,2018-12-10,3,1,PACKAGE_THICK_ENVELOPE,66575
78072,B2C,66159,3.0,2018-09-14,12,23.5,-1,-1,79968,154-0012,6,116.5,1,2018-09-13,2018-09-21,16,1,PACKAGE_THICK_ENVELOPE,78073
78140,B2C,5197,,2018-09-25,12,25.0,-1,-1,07307,8241587,18,9.99,1,2018-09-23,2018-10-20,0,1,PACKAGE_THICK_ENVELOPE,78141
98005,B2C,23064,1.0,2018-06-06,12,4.49,-1,-1,18045,K9J2P6,6,5.49,1,2018-06-04,2018-06-13,4,1,PACKAGE_THICK_ENVELOPE,98006


In [150]:
df.groupby('seller_id')['carrier_max_estimate'].value_counts()


seller_id  carrier_max_estimate
19         -1                      1
269        -1                      1
647        -1                      1
785        -1                      1
908        -1                      1
1254       -1                      1
5197       -1                      1
7365       -1                      1
8524       -1                      1
14274      -1                      1
23064      -1                      2
23219      -1                      1
26107      -1                      1
39681      -1                      1
40345      -1                      1
42722      -1                      1
57727      -1                      1
66159      -1                      1
67800      -1                      1
102172     -1                      2
595180     -1                      1
708260     -1                      1
1762594    -1                      1
Name: carrier_max_estimate, dtype: int64

There are some negative values in `carrier_max_estimate` and `carrier_min_estimate`.

In [118]:
df_ebay['shipping_fee'].isna().sum()

0

In [119]:
df_ebay['category_id'].isna().sum()

0

In [120]:
#Recall these 2 columns
df_ebay[['acceptance_scan_timestamp', 'payment_datetime']]

Unnamed: 0,acceptance_scan_timestamp,payment_datetime
0,2019-03-26 15:11:00.000-07:00,2019-03-24 03:56:49.000-07:00
1,2018-06-02 12:53:00.000-07:00,2018-06-01 13:43:54.000-07:00
2,2019-01-07 16:22:00.000-05:00,2019-01-06 00:02:00.000-05:00
3,2018-12-17 16:56:00.000-08:00,2018-12-16 10:28:28.000-08:00
4,2018-07-27 16:48:00.000-07:00,2018-07-26 18:20:02.000-07:00
...,...,...
299995,2019-08-22 10:49:00.000-04:00,2019-08-21 20:11:07.000-04:00
299996,2019-02-13 00:27:00.000-05:00,2019-02-09 15:55:37.000-05:00
299997,2019-02-20 10:03:00.000-08:00,2019-02-18 08:34:21.000-08:00
299998,2019-06-20 15:51:00.000-07:00,2019-06-19 12:29:06.000-07:00


Working with `acceptance_scan_timestamp` and `payment_datetime`: convert both to a single timezone (US/Eastern) and keep only the date.

In [121]:
accept_date= df_ebay['acceptance_scan_timestamp']
pay_date= df_ebay['payment_datetime']
delivery_date= df_ebay['delivery_date']

In [122]:
#Date convert function
from dateutil import tz
from dateutil import parser
def convert_date_format(date_list, target_tz="US/Eastern"):
    """Convert offset-aware timestamp strings to calendar dates in one timezone.

    Each timestamp is parsed, shifted to ``target_tz`` and reduced to a date.
    If the local time in ``target_tz`` is 12:00 or later (hour > 11), the
    timestamp is counted towards the following day.

    Parameters
    ----------
    date_list : iterable of str
        Timestamps that carry a UTC offset, e.g. "2019-03-26 15:11:00.000-07:00".
    target_tz : str, default "US/Eastern"
        Name of the timezone the timestamps are converted to.

    Returns
    -------
    list of str
        One "%Y-%m-%d" string per input timestamp, in input order.

    Raises
    ------
    ValueError
        If ``target_tz`` is not a known timezone, or a timestamp has no UTC
        offset (it could not be converted unambiguously).
    """
    # Resolve the target zone once instead of on every iteration.
    zone = tz.gettz(target_tz)
    if zone is None:
        raise ValueError(f"unknown timezone: {target_tz!r}")

    dates = []
    for stamp in date_list:
        # parser.parse already returns an offset-aware datetime when the string
        # carries an offset, so the offset must not be rebuilt by hand: doing so
        # with int(hours) + int(minutes) gives the wrong sign to the minutes of
        # negative offsets such as -03:30.
        dt = parser.parse(stamp)
        if dt.tzinfo is None:
            raise ValueError(f"timestamp has no UTC offset: {stamp!r}")

        local_dt = dt.astimezone(zone)
        local_date = local_dt.date()

        # Noon or later in the target zone is attributed to the next day.
        if local_dt.hour > 11:
            local_date += datetime.timedelta(days=1)

        dates.append(local_date.strftime("%Y-%m-%d"))
    return dates

In [123]:
# Convert both timestamp columns to date strings using convert_date_format
df_ebay['acceptance_scan_timestamp']= convert_date_format(accept_date)
df_ebay['payment_datetime']= convert_date_format(pay_date)

In [126]:
#Santity check
df_ebay['acceptance_scan_timestamp']

0         2019-03-27
1         2018-06-03
2         2019-01-08
3         2018-12-18
4         2018-07-28
             ...    
299995    2019-08-22
299996    2019-02-13
299997    2019-02-21
299998    2019-06-21
299999    2019-04-18
Name: acceptance_scan_timestamp, Length: 300000, dtype: object

In [127]:
#Santity check
df_ebay['payment_datetime']

0         2019-03-24
1         2018-06-02
2         2019-01-06
3         2018-12-17
4         2018-07-27
             ...    
299995    2019-08-22
299996    2019-02-10
299997    2019-02-18
299998    2019-06-20
299999    2019-04-17
Name: payment_datetime, Length: 300000, dtype: object

In [132]:
#Conver delivery_date to datetime
df_ebay['delivery_date'] = pd.to_datetime(df_ebay['delivery_date'])

In [133]:
type(df_ebay['delivery_date'][1])

pandas._libs.tslibs.timestamps.Timestamp

### Data Visualization

Let's look at the share of each seller type: B2C (Business to Consumer) and C2C (Consumer to Consumer).

In [20]:
df_ebay['b2c_c2c'].value_counts()

B2C    212946
C2C     87054
Name: b2c_c2c, dtype: int64

In [21]:
# Build the labels and the slice sizes from the same value_counts() result so that
# every label is paired with its own count. value_counts() is ordered by frequency
# while unique() is ordered by first appearance, so combining the two can silently
# attach a label to the wrong slice.
seller_type_counts = (
    df_ebay['b2c_c2c']
    .value_counts()
    .rename_axis('b2c_c2c')
    .reset_index(name='count')
)
fig = px.pie(seller_type_counts, names='b2c_c2c', values='count')

fig.show()

It seems that most sellers are businesses: B2C accounts for about 71.0% of the records (212,946 of 300,000), while consumer-to-consumer (C2C) sales account for about 29.0% (87,054 records). That is not a small share, which suggests that many individuals also use the online platform to sell their products.

Let's see how much time sellers declare they need to process an order and make it ready to ship to the customer, based on the weight of the package.

In [11]:
df_ebay.groupby('declared_handling_days')['weight'].count().sort_values()

declared_handling_days
20.0       64
30.0       68
15.0       99
10.0      411
4.0       760
5.0      1460
0.0      6907
3.0     16582
2.0     17850
1.0     50841
Name: weight, dtype: int64

In [12]:
# Scatter plot of the declared handling days against the shipment method id
fig = px.scatter(
    df_ebay,
    x='declared_handling_days',
    y='shipment_method_id',
)
fig.show()

## Feature Engineering

Let's check the distributions; we are going to go over every column in the dataset.

In [19]:
#checking the b2c_c2c column
df_ebay['b2c_c2c'].value_counts()

B2C    66944
C2C    28098
Name: b2c_c2c, dtype: int64

Convert `b2c_c2c` column to binary

In [20]:
# Encode the seller type as a binary flag: 1 for 'B2C', 0 for everything else.
# Only encode while the column still holds the raw labels: once it is numeric,
# comparing it with 'B2C' is always False, so re-running the cell would overwrite
# the whole column with 0.
if not pd.api.types.is_numeric_dtype(df_ebay['b2c_c2c']):
    df_ebay['b2c_c2c'] = np.where(df_ebay['b2c_c2c'] == 'B2C', 1, 0)

Test it

In [21]:
df_ebay.head()

Unnamed: 0,b2c_c2c,seller_id,declared_handling_days,acceptance_scan_timestamp,shipment_method_id,shipping_fee,carrier_min_estimate,carrier_max_estimate,item_zip,buyer_zip,category_id,item_price,quantity,payment_datetime,delivery_date,weight,weight_units,package_size,record_number
0,1,25454,3.0,2019-03-26 15:11:00.000-07:00,0,0.0,3,5,97219,49040,13,27.95,1,2019-03-24 03:56:49.000-07:00,2019-03-29,5,1,LETTER,1
1,0,6727381,2.0,2018-06-02 12:53:00.000-07:00,0,3.0,3,5,11415-3528,62521,0,20.5,1,2018-06-01 13:43:54.000-07:00,2018-06-05,0,1,PACKAGE_THICK_ENVELOPE,2
2,1,18507,1.0,2019-01-07 16:22:00.000-05:00,0,4.5,3,5,27292,53010,1,19.9,1,2019-01-06 00:02:00.000-05:00,2019-01-10,9,1,PACKAGE_THICK_ENVELOPE,3
3,1,4677,1.0,2018-12-17 16:56:00.000-08:00,0,0.0,3,5,90703,80022,1,35.5,1,2018-12-16 10:28:28.000-08:00,2018-12-21,8,1,PACKAGE_THICK_ENVELOPE,4
4,1,4677,1.0,2018-07-27 16:48:00.000-07:00,0,0.0,3,5,90703,55070,1,25.0,1,2018-07-26 18:20:02.000-07:00,2018-07-30,3,1,PACKAGE_THICK_ENVELOPE,5


Check the `seller_id` column


In [22]:
df_ebay['seller_id'].value_counts()

0         1059
11         191
1          182
4          140
40          99
          ... 
91582        1
301906       1
113882       1
10213        1
311534       1
Name: seller_id, Length: 54912, dtype: int64

In [23]:
#Check null
df_ebay['seller_id'].isna().any()

False

There are two `weight_units` values: 1 and 2. Let's say 1 is 'lbs' and 2 is 'kg'. We are going to convert all the weights to lb.

In [24]:
#check the record has weigtht unit =2 to able to compare
df_ebay[df_ebay['weight_units']==2].head()

Unnamed: 0,b2c_c2c,seller_id,declared_handling_days,acceptance_scan_timestamp,shipment_method_id,shipping_fee,carrier_min_estimate,carrier_max_estimate,item_zip,buyer_zip,category_id,item_price,quantity,payment_datetime,delivery_date,weight,weight_units,package_size,record_number


In [25]:
# Rows with weight_units == 1 keep their weight unchanged; every other row is
# multiplied by 2.20462 (the kg -> lb factor, under the notebook's assumption
# that unit 2 means kg).
# NOTE(review): weight_units is not updated here, so re-running this cell would
# scale the converted rows a second time.
df_ebay['weight'] = np.where(df_ebay['weight_units']==1, df_ebay['weight'], df_ebay['weight']*2.20462 )

Check the records with `weight_units` = 2 to see whether the `weight` column has changed.

In [26]:
df_ebay[df_ebay['weight_units']==2]

Unnamed: 0,b2c_c2c,seller_id,declared_handling_days,acceptance_scan_timestamp,shipment_method_id,shipping_fee,carrier_min_estimate,carrier_max_estimate,item_zip,buyer_zip,category_id,item_price,quantity,payment_datetime,delivery_date,weight,weight_units,package_size,record_number


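All `weight` values with `weight_units` = 2 are now converted to **lb**, so every weight is expressed in **lb**. The next cell drops the `weight` column. **Note:** this removes the measured weights themselves and keeps the now-constant `weight_units` column; if the intent was only to discard the redundant unit information, `weight_units` is the column to drop instead.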

In [33]:
# Reassign instead of using inplace=True, and ignore an already-missing column,
# so that re-running this cell does not raise a KeyError. `columns=` already
# selects the axis, so the extra axis=1 argument is not needed.
# NOTE(review): this removes the measured weights and keeps `weight_units`;
# confirm that `weight` (rather than `weight_units`) is the column meant to go.
df_ebay = df_ebay.drop(columns='weight', errors='ignore')

In [28]:
#check it
df_ebay.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95042 entries, 0 to 99999
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   b2c_c2c                    95042 non-null  int64  
 1   seller_id                  95042 non-null  int64  
 2   declared_handling_days     95042 non-null  float64
 3   acceptance_scan_timestamp  95042 non-null  object 
 4   shipment_method_id         95042 non-null  int64  
 5   shipping_fee               95042 non-null  float64
 6   carrier_min_estimate       95042 non-null  int64  
 7   carrier_max_estimate       95042 non-null  int64  
 8   item_zip                   95042 non-null  object 
 9   buyer_zip                  95042 non-null  object 
 10  category_id                95042 non-null  int64  
 11  item_price                 95042 non-null  float64
 12  quantity                   95042 non-null  int64  
 13  payment_datetime           95042 non-null  obj

The `weight` column has been dropped. Next, we are going to calculate the distance between the buyer and the seller by using their zip codes in the `item_zip` and `buyer_zip` columns.

Check the `package_size` column

In [29]:
#How many type of package 
df_ebay['package_size'].value_counts()

PACKAGE_THICK_ENVELOPE    80022
NONE                       6866
LETTER                     5604
LARGE_ENVELOPE             1445
LARGE_PACKAGE              1105
Name: package_size, dtype: int64

There are 5 types of package, so we can use ordinal encoding to turn this categorical column into a numeric one: `'NONE': 0, 'LETTER': 1, 'LARGE_ENVELOPE': 2, 'LARGE_PACKAGE': 3, 'PACKAGE_THICK_ENVELOPE': 4`

In [30]:
# Ordinal-encode package_size using an explicit category order, so the codes are
# NONE=0, LETTER=1, LARGE_ENVELOPE=2, LARGE_PACKAGE=3, PACKAGE_THICK_ENVELOPE=4.
package_order = [
    'NONE',
    'LETTER',
    'LARGE_ENVELOPE',
    'LARGE_PACKAGE',
    'PACKAGE_THICK_ENVELOPE',
]
oe_package = OrdinalEncoder(categories=[package_order])
# The encoder expects a 2-D input, hence the single-column DataFrame selection.
encoded_package = oe_package.fit_transform(df_ebay[['package_size']])
df_ebay['package_size'] = encoded_package

In [31]:
df_ebay['package_size'].value_counts()

4.0    80022
0.0     6866
1.0     5604
2.0     1445
3.0     1105
Name: package_size, dtype: int64

In [32]:
#check any null value in these columns.
print(df_ebay['item_zip'].isna().any())
print(df_ebay['buyer_zip'].isna().any())

False
False


In [33]:
item_zip= df_ebay['item_zip']
buyer_zip= df_ebay['buyer_zip']

In [2]:
import mpu
from uszipcode import SearchEngine

_search_engine = None


def _get_search_engine():
    """Return one shared SearchEngine, creating it the first time it is needed."""
    global _search_engine
    if _search_engine is None:
        _search_engine = SearchEngine()
    return _search_engine


def get_distance(item_zip, buyer_zip, search=None):
    """
    Haversine distance between the seller's and the buyer's zip code.

    We use the packages mpu and uszipcode to improve the speed of calculating the
    distance between buyer and seller from their zip codes; these two packages make
    the calculation much faster than using the geopy package.

    Only the first five characters of each zip code are looked up, so ZIP+4 values
    such as "11415-3528" are handled.

    Parameters
    ----------
    item_zip, buyer_zip : str or None
        Zip codes of the item (seller) and of the buyer.
    search : object with a ``by_zipcode(zipcode)`` method, optional
        Lookup engine to use. When omitted, a single shared ``SearchEngine`` is
        created on first use and reused for every later call, instead of building
        a new engine for every row.

    Returns
    -------
    float or None
        The value returned by ``mpu.haversine_distance`` (kilometres according to
        mpu's documentation), or None when a zip code is None, cannot be found,
        or has no coordinates.
    """
    if item_zip is None or buyer_zip is None:
        return None

    if search is None:
        search = _get_search_engine()

    item_location = search.by_zipcode(item_zip[0:5])
    buyer_location = search.by_zipcode(buyer_zip[0:5])
    if item_location is None or buyer_location is None:
        return None

    lat1, long1 = item_location.lat, item_location.lng
    lat2, long2 = buyer_location.lat, buyer_location.lng
    # A lookup can succeed yet carry no coordinates; treat that as "unknown".
    if lat1 is None or lat2 is None or long1 is None or long2 is None:
        return None
    return mpu.haversine_distance((lat1, long1), (lat2, long2))


def add_zip_distance_column(item_zip, buyer_zip):
    """Compute the seller-to-buyer distance for every row.

    Parameters
    ----------
    item_zip, buyer_zip : pandas.Series
        Zip codes of the item (seller) and of the buyer. The values are converted
        to strings before the lookup. The Series may carry any name: the working
        columns are labelled explicitly instead of relying on the inputs being
        called 'item_zip' and 'buyer_zip'.

    Returns
    -------
    pandas.Series
        Series named 'distance', indexed like the aligned inputs, holding the
        result of ``get_distance`` for each row (missing when no distance could
        be computed).
    """
    zips = pd.concat(
        [item_zip.map(str), buyer_zip.map(str)],
        axis=1,
        keys=['item_zip', 'buyer_zip'],
    )

    # Iterate over the two columns directly; this avoids building a Series object
    # per row, which is what DataFrame.apply(axis=1) does.
    zips['distance'] = [
        get_distance(item, buyer)
        for item, buyer in zip(zips['item_zip'], zips['buyer_zip'])
    ]

    return zips['distance']

ImportError: cannot import name 'SearchEngine' from 'uszipcode' (/Users/henryvuong/anaconda3/lib/python3.10/site-packages/uszipcode/__init__.py)

In [None]:
distance = add_zip_distance_column(item_zip, buyer_zip)

Add the value of the distance in to the data frame

In [None]:
df_ebay['distance']= distance
df_ebay.head(10)

Unnamed: 0,b2c_c2c,seller_id,declared_handling_days,acceptance_scan_timestamp,shipment_method_id,shipping_fee,carrier_min_estimate,carrier_max_estimate,item_zip,buyer_zip,category_id,item_price,quantity,payment_datetime,delivery_date,weight_units,package_size,record_number,distance
0,1,25454,3.0,2019-03-26 15:11:00.000-07:00,0,0.0,3,5,97219,49040,13,27.95,1,2019-03-24 03:56:49.000-07:00,2019-03-29,1,1.0,1,3001.839179
1,0,6727381,2.0,2018-06-02 12:53:00.000-07:00,0,3.0,3,5,11415-3528,62521,0,20.5,1,2018-06-01 13:43:54.000-07:00,2018-06-05,1,4.0,2,1282.908017
2,1,18507,1.0,2019-01-07 16:22:00.000-05:00,0,4.5,3,5,27292,53010,1,19.9,1,2019-01-06 00:02:00.000-05:00,2019-01-10,1,4.0,3,1104.444168
3,1,4677,1.0,2018-12-17 16:56:00.000-08:00,0,0.0,3,5,90703,80022,1,35.5,1,2018-12-16 10:28:28.000-08:00,2018-12-21,1,4.0,4,1353.390003
4,1,4677,1.0,2018-07-27 16:48:00.000-07:00,0,0.0,3,5,90703,55070,1,25.0,1,2018-07-26 18:20:02.000-07:00,2018-07-30,1,4.0,5,2456.330752
5,1,10514,1.0,2019-04-19 19:42:00.000-04:00,0,0.0,3,5,43215,77063,3,10.39,1,2019-04-18 14:11:09.000-04:00,2019-04-22,1,4.0,6,1608.340906
6,1,104,1.0,2019-02-08 17:35:00.000-08:00,0,0.0,3,5,91304,60565,11,5.7,1,2019-02-08 09:33:13.000-08:00,2019-02-11,1,4.0,7,2781.527939
7,1,340356,1.0,2018-04-23 17:31:00.000-04:00,0,2.95,3,5,49735,29379,1,6.0,1,2018-04-22 18:32:04.000-04:00,2018-04-25,1,4.0,8,1174.270246
8,1,113915,5.0,2019-10-12 09:22:00.000-04:00,3,0.0,2,8,43606,32958,18,5.55,1,2019-10-11 04:54:25.000-04:00,2019-10-15,1,0.0,9,1568.8678
9,1,130301,1.0,2019-08-09 11:24:00.000-05:00,1,0.0,2,5,35117,84776,13,59.98,1,2019-08-08 12:47:14.000-05:00,2019-08-12,1,4.0,10,2311.01545


Take a look at the data after the change

In [None]:
df_ebay.head(10)

Unnamed: 0,b2c_c2c,seller_id,declared_handling_days,acceptance_scan_timestamp,shipment_method_id,shipping_fee,carrier_min_estimate,carrier_max_estimate,item_zip,buyer_zip,category_id,item_price,quantity,payment_datetime,delivery_date,weight_units,package_size,record_number,distance
0,1,25454,3.0,2019-03-26 15:11:00,0,0.0,3,5,97219,49040,13,27.95,1,2019-03-24 03:56:49,2019-03-29,1,1.0,1,3001.839179
1,0,6727381,2.0,2018-06-02 12:53:00,0,3.0,3,5,11415-3528,62521,0,20.5,1,2018-06-01 13:43:54,2018-06-05,1,4.0,2,1282.908017
2,1,18507,1.0,2019-01-07 16:22:00,0,4.5,3,5,27292,53010,1,19.9,1,2019-01-06 00:02:00,2019-01-10,1,4.0,3,1104.444168
3,1,4677,1.0,2018-12-17 16:56:00,0,0.0,3,5,90703,80022,1,35.5,1,2018-12-16 10:28:28,2018-12-21,1,4.0,4,1353.390003
4,1,4677,1.0,2018-07-27 16:48:00,0,0.0,3,5,90703,55070,1,25.0,1,2018-07-26 18:20:02,2018-07-30,1,4.0,5,2456.330752
5,1,10514,1.0,2019-04-19 19:42:00,0,0.0,3,5,43215,77063,3,10.39,1,2019-04-18 14:11:09,2019-04-22,1,4.0,6,1608.340906
6,1,104,1.0,2019-02-08 17:35:00,0,0.0,3,5,91304,60565,11,5.7,1,2019-02-08 09:33:13,2019-02-11,1,4.0,7,2781.527939
7,1,340356,1.0,2018-04-23 17:31:00,0,2.95,3,5,49735,29379,1,6.0,1,2018-04-22 18:32:04,2018-04-25,1,4.0,8,1174.270246
8,1,113915,5.0,2019-10-12 09:22:00,3,0.0,2,8,43606,32958,18,5.55,1,2019-10-11 04:54:25,2019-10-15,1,0.0,9,1568.8678
9,1,130301,1.0,2019-08-09 11:24:00,1,0.0,2,5,35117,84776,13,59.98,1,2019-08-08 12:47:14,2019-08-12,1,4.0,10,2311.01545


Now we see that most of the columns are in good shape. However, the model cannot work with datetime variables, so we are going to create three new columns based on the datetime columns:
-   **handling_date** = `acceptance_scan_timestamp` - `payment_datetime`
-   **shipping_date** = `delivery_date` - `acceptance_scan_timestamp`
-   **total_time** = `delivery_date` - `payment_datetime`

In [None]:
# Calculate date function
def date_calculate(day1, day2):
    """Number of days from ``day1`` to ``day2``, rounded to the nearest day.

    Both inputs are coerced with ``pd.to_datetime`` first, so columns holding
    date strings (such as the "%Y-%m-%d" output of the date conversion step) can
    be combined with real datetime columns.

    Parameters
    ----------
    day1, day2 : pandas.Series (or other sequence) of datetimes or date strings
        Start and end moments; Series are aligned on their index.

    Returns
    -------
    list
        ``day2 - day1`` in days for each row. A remainder of 12 hours or more
        rounds up to the next day. Rows with a missing date yield NaN.
    """
    start = pd.Series(pd.to_datetime(day1))
    end = pd.Series(pd.to_datetime(day2))
    elapsed = end - start

    # .dt.days is the whole-day part and .dt.seconds the remainder within the day.
    round_up = elapsed.dt.seconds >= 12 * 3600
    number_of_day = elapsed.dt.days + round_up.astype(int)
    return number_of_day.tolist()
        


In [None]:
# Calculate handling_date: days from payment_datetime to acceptance_scan_timestamp
handling_date= date_calculate(df_ebay['payment_datetime'], df_ebay['acceptance_scan_timestamp'])

# Calculate shipping_date: days from acceptance_scan_timestamp to delivery_date
shipping_date= date_calculate(df_ebay['acceptance_scan_timestamp'], df_ebay['delivery_date'])

# Calculate total_time: days from payment_datetime to delivery_date

total_time= date_calculate(df_ebay['payment_datetime'], df_ebay['delivery_date'])


Create `handling_date`, `shipping_date` and `total_time` as new columns in the data frame and fill in their values

In [None]:
df_ebay['handling_date']= handling_date
df_ebay['shipping_date']= shipping_date
df_ebay['total_time']= total_time

In [None]:
#check data frame
df_ebay.sample(10)

Unnamed: 0,b2c_c2c,seller_id,declared_handling_days,acceptance_scan_timestamp,shipment_method_id,shipping_fee,carrier_min_estimate,carrier_max_estimate,item_zip,buyer_zip,...,quantity,payment_datetime,delivery_date,weight_units,package_size,record_number,distance,handling_date,shipping_date,total_time
38944,1,2074,2.0,2018-04-13 18:33:00,0,3.49,3,5,10036,32713,...,1,2018-04-12 19:36:58,2018-04-16,1,4.0,38945,1479.396198,1,2,3
52607,1,83214,1.0,2018-09-24 10:32:00,1,0.0,2,5,93013,17019,...,1,2018-09-24 07:31:25,2018-09-26,1,4.0,52608,3776.133928,0,2,2
44635,0,2526513,3.0,2019-09-05 16:26:00,0,4.81,3,5,84084,21502,...,1,2019-09-05 14:56:49,2019-09-09,1,4.0,44636,2809.317241,0,3,3
98120,1,886,1.0,2018-09-14 15:35:00,0,1.99,3,5,32224,18237,...,1,2018-09-14 04:31:52,2018-09-17,1,4.0,98121,1280.956497,0,2,3
79123,1,6973,1.0,2018-12-07 13:08:00,0,0.0,3,5,18354,53144,...,1,2018-12-06 22:18:34,2018-12-11,1,1.0,79124,1060.853115,1,3,4
18942,1,1372,1.0,2018-11-05 18:00:00,0,0.0,3,5,91748,36028,...,1,2018-11-04 12:59:23,2018-11-09,1,0.0,18943,2950.348721,1,3,4
44576,0,3540882,3.0,2019-08-01 11:24:00,0,4.57,3,5,43528,77058,...,1,2019-07-29 18:33:34,2019-08-05,1,4.0,44577,1688.122355,3,4,6
26266,0,5024024,3.0,2018-01-16 14:47:00,2,17.99,2,9,33180,28315,...,1,2018-01-04 10:23:21,2018-01-25,1,4.0,26267,1021.975898,12,8,21
76796,1,40039,3.0,2018-06-23 12:20:00,5,6.99,2,5,91356,29928,...,1,2018-06-21 03:22:01,2018-06-25,1,4.0,76797,3503.971679,2,1,4
53831,1,118628,1.0,2018-12-15 17:55:00,2,18.46,2,9,95206,65804,...,1,2018-12-14 08:16:18,2018-12-17,1,4.0,53832,2474.434459,1,1,3


In [None]:
df_ebay.shape


(95042, 22)

In [None]:
#check null again
df_ebay.isna().sum()

b2c_c2c                         0
seller_id                       0
declared_handling_days          0
acceptance_scan_timestamp       0
shipment_method_id              0
shipping_fee                    0
carrier_min_estimate            0
carrier_max_estimate            0
item_zip                        0
buyer_zip                       0
category_id                     0
item_price                      0
quantity                        0
payment_datetime                0
delivery_date                   0
weight_units                    0
package_size                    0
record_number                   0
distance                     1257
handling_date                   0
shipping_date                   0
total_time                      0
dtype: int64

We can see that some values in the `distance` column are NA. This could be because the zip code of the buyer or the seller is not in the expected format. We are going to drop all rows with NA in `distance`, since they are only 1257 of the 95042 rows in total (around 1.3% of the data).

In [None]:
# dropna() without `subset` removes every row that has a missing value in ANY
# column, not only in `distance`.
df_ebay= df_ebay.dropna()
# Confirm that no missing values remain anywhere in the frame
df_ebay.isna().any().any()

False

Now we export the data to a new CSV file, so that later on we only work with the clean data.

In [None]:
df_ebay.to_csv('../data/cleaned/Ebay_cleaned.csv')