# Prediction Delivery Time

# EDA Dataset

## Explore

In [127]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns   
from sklearn.preprocessing import OrdinalEncoder
from datetime import datetime, timezone


In [128]:
df_ebay = pd.read_csv('../data/raw/ebayShort.csv', nrows=100000 ,index_col=0)

As the data is too big, we first working in the shortcut of the total data with 1 million rows.

In [129]:
# Look at the first 5 row

df_ebay.head()

Unnamed: 0,b2c_c2c,seller_id,declared_handling_days,acceptance_scan_timestamp,shipment_method_id,shipping_fee,carrier_min_estimate,carrier_max_estimate,item_zip,buyer_zip,category_id,item_price,quantity,payment_datetime,delivery_date,weight,weight_units,package_size,record_number
0,B2C,25454,3.0,2019-03-26 15:11:00.000-07:00,0,0.0,3,5,97219,49040,13,27.95,1,2019-03-24 03:56:49.000-07:00,2019-03-29,5,1,LETTER,1
1,C2C,6727381,2.0,2018-06-02 12:53:00.000-07:00,0,3.0,3,5,11415-3528,62521,0,20.5,1,2018-06-01 13:43:54.000-07:00,2018-06-05,0,1,PACKAGE_THICK_ENVELOPE,2
2,B2C,18507,1.0,2019-01-07 16:22:00.000-05:00,0,4.5,3,5,27292,53010,1,19.9,1,2019-01-06 00:02:00.000-05:00,2019-01-10,9,1,PACKAGE_THICK_ENVELOPE,3
3,B2C,4677,1.0,2018-12-17 16:56:00.000-08:00,0,0.0,3,5,90703,80022,1,35.5,1,2018-12-16 10:28:28.000-08:00,2018-12-21,8,1,PACKAGE_THICK_ENVELOPE,4
4,B2C,4677,1.0,2018-07-27 16:48:00.000-07:00,0,0.0,3,5,90703,55070,1,25.0,1,2018-07-26 18:20:02.000-07:00,2018-07-30,3,1,PACKAGE_THICK_ENVELOPE,5


In [130]:
#Look at 5 random row
df_ebay.sample(5)

Unnamed: 0,b2c_c2c,seller_id,declared_handling_days,acceptance_scan_timestamp,shipment_method_id,shipping_fee,carrier_min_estimate,carrier_max_estimate,item_zip,buyer_zip,category_id,item_price,quantity,payment_datetime,delivery_date,weight,weight_units,package_size,record_number
51270,B2C,34406,1.0,2019-02-08 12:06:00.000-08:00,3,0.0,2,8,98112,49329,14,29.71,1,2019-02-06 15:46:24.000-08:00,2019-02-19,0,1,PACKAGE_THICK_ENVELOPE,51271
30478,B2C,5868,1.0,2018-10-02 15:24:00.000-04:00,0,0.0,3,5,30005,39465,8,2.25,1,2018-10-02 07:50:06.000-04:00,2018-10-04,0,1,PACKAGE_THICK_ENVELOPE,30479
15678,B2C,1255,1.0,2019-05-14 16:25:00.000-04:00,0,0.0,3,5,7004,49112,15,18.85,1,2019-05-13 20:48:18.000-04:00,2019-05-18,0,1,LETTER,15679
97942,B2C,186,0.0,2019-04-29 16:35:00.000-07:00,0,0.0,3,5,90660,18512,1,15.29,1,2019-04-27 17:41:31.000-07:00,2019-05-04,0,1,PACKAGE_THICK_ENVELOPE,97943
29246,B2C,203688,1.0,2018-07-03 15:42:00.000-07:00,3,0.0,2,8,89135,87504,14,17.5,1,2018-07-01 14:47:28.000-07:00,2018-07-06,0,1,PACKAGE_THICK_ENVELOPE,29247


In [131]:
#check the shape of the dataset
df_ebay.shape

(100000, 19)

In [132]:
print(f'There are {df_ebay.shape[0]} columns and {df_ebay.shape[1]} rows')

There are 100000 columns and 19 rows


In [133]:
# Get a quick overview of dataset variables
df_ebay.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 19 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   b2c_c2c                    100000 non-null  object 
 1   seller_id                  100000 non-null  int64  
 2   declared_handling_days     95042 non-null   float64
 3   acceptance_scan_timestamp  100000 non-null  object 
 4   shipment_method_id         100000 non-null  int64  
 5   shipping_fee               100000 non-null  float64
 6   carrier_min_estimate       100000 non-null  int64  
 7   carrier_max_estimate       100000 non-null  int64  
 8   item_zip                   100000 non-null  object 
 9   buyer_zip                  100000 non-null  object 
 10  category_id                100000 non-null  int64  
 11  item_price                 100000 non-null  float64
 12  quantity                   100000 non-null  int64  
 13  payment_datetime           100

In [134]:
# top 5 rows showing only 'object' columns
df_ebay.select_dtypes('object').head()

Unnamed: 0,b2c_c2c,acceptance_scan_timestamp,item_zip,buyer_zip,payment_datetime,delivery_date,package_size
0,B2C,2019-03-26 15:11:00.000-07:00,97219,49040,2019-03-24 03:56:49.000-07:00,2019-03-29,LETTER
1,C2C,2018-06-02 12:53:00.000-07:00,11415-3528,62521,2018-06-01 13:43:54.000-07:00,2018-06-05,PACKAGE_THICK_ENVELOPE
2,B2C,2019-01-07 16:22:00.000-05:00,27292,53010,2019-01-06 00:02:00.000-05:00,2019-01-10,PACKAGE_THICK_ENVELOPE
3,B2C,2018-12-17 16:56:00.000-08:00,90703,80022,2018-12-16 10:28:28.000-08:00,2018-12-21,PACKAGE_THICK_ENVELOPE
4,B2C,2018-07-27 16:48:00.000-07:00,90703,55070,2018-07-26 18:20:02.000-07:00,2018-07-30,PACKAGE_THICK_ENVELOPE


Let see the percent of the kind of customer B2C(Business to Consumer) and C2C(Consumer to Consumer)

In [135]:
fig= px.pie(data_frame= df_ebay,values= df_ebay['b2c_c2c'].value_counts(), labels= df_ebay['b2c_c2c'].unique())

fig.show()

Seem like most of the seller is the business which take about 69.7% of the total amount of sale in the online selling platform. Consumer to consumer takes about 30.3% of the total amount, which is not a small amount, that we can see that there are more and more people start using the online platform to sell their product.

Let see how the seller declared the amount of time they need to proceesing order to make it ready to ship for customer base on there weight of the package>

In [136]:
df_ebay.groupby('declared_handling_days')['weight'].count().sort_values()

declared_handling_days
20.0       64
30.0       68
15.0       99
10.0      411
4.0       760
5.0      1460
0.0      6907
3.0     16582
2.0     17850
1.0     50841
Name: weight, dtype: int64

In [137]:
fig= px.scatter(df_ebay, x= 'declared_handling_days', y= df_ebay['shipment_method_id'])
fig.show()

## Data Cleaning

First let do some common checking in the dataset.
 -  Null value
 -  Duplicate row

In [138]:
#Checking null
df_ebay.isna().sum()

b2c_c2c                         0
seller_id                       0
declared_handling_days       4958
acceptance_scan_timestamp       0
shipment_method_id              0
shipping_fee                    0
carrier_min_estimate            0
carrier_max_estimate            0
item_zip                        0
buyer_zip                       0
category_id                     0
item_price                      0
quantity                        0
payment_datetime                0
delivery_date                   0
weight                          0
weight_units                    0
package_size                    0
record_number                   0
dtype: int64

There is 47180 missing value in total 1 million rows. We could delete it but I decide to have deepper look at it later


In [139]:
# checking duplicate rows
df_ebay.duplicated().any()

False

There are no duplicate row in the database

In [140]:
#recall shape column
df_ebay.shape[1]

19

In [141]:
#Dataset
df_ebay.head(5)

Unnamed: 0,b2c_c2c,seller_id,declared_handling_days,acceptance_scan_timestamp,shipment_method_id,shipping_fee,carrier_min_estimate,carrier_max_estimate,item_zip,buyer_zip,category_id,item_price,quantity,payment_datetime,delivery_date,weight,weight_units,package_size,record_number
0,B2C,25454,3.0,2019-03-26 15:11:00.000-07:00,0,0.0,3,5,97219,49040,13,27.95,1,2019-03-24 03:56:49.000-07:00,2019-03-29,5,1,LETTER,1
1,C2C,6727381,2.0,2018-06-02 12:53:00.000-07:00,0,3.0,3,5,11415-3528,62521,0,20.5,1,2018-06-01 13:43:54.000-07:00,2018-06-05,0,1,PACKAGE_THICK_ENVELOPE,2
2,B2C,18507,1.0,2019-01-07 16:22:00.000-05:00,0,4.5,3,5,27292,53010,1,19.9,1,2019-01-06 00:02:00.000-05:00,2019-01-10,9,1,PACKAGE_THICK_ENVELOPE,3
3,B2C,4677,1.0,2018-12-17 16:56:00.000-08:00,0,0.0,3,5,90703,80022,1,35.5,1,2018-12-16 10:28:28.000-08:00,2018-12-21,8,1,PACKAGE_THICK_ENVELOPE,4
4,B2C,4677,1.0,2018-07-27 16:48:00.000-07:00,0,0.0,3,5,90703,55070,1,25.0,1,2018-07-26 18:20:02.000-07:00,2018-07-30,3,1,PACKAGE_THICK_ENVELOPE,5


Let check out the distributions, we are going to check over all the column in the dataset

In [142]:
#checking the b2c_c2c column
df_ebay['b2c_c2c'].value_counts()

B2C    69720
C2C    30280
Name: b2c_c2c, dtype: int64

Convert `b2c_c2c` column to binary

In [143]:
df_ebay['b2c_c2c']= np.where(df_ebay['b2c_c2c']=='B2C', 1, 0)

Test it

In [144]:
df_ebay.head()

Unnamed: 0,b2c_c2c,seller_id,declared_handling_days,acceptance_scan_timestamp,shipment_method_id,shipping_fee,carrier_min_estimate,carrier_max_estimate,item_zip,buyer_zip,category_id,item_price,quantity,payment_datetime,delivery_date,weight,weight_units,package_size,record_number
0,1,25454,3.0,2019-03-26 15:11:00.000-07:00,0,0.0,3,5,97219,49040,13,27.95,1,2019-03-24 03:56:49.000-07:00,2019-03-29,5,1,LETTER,1
1,0,6727381,2.0,2018-06-02 12:53:00.000-07:00,0,3.0,3,5,11415-3528,62521,0,20.5,1,2018-06-01 13:43:54.000-07:00,2018-06-05,0,1,PACKAGE_THICK_ENVELOPE,2
2,1,18507,1.0,2019-01-07 16:22:00.000-05:00,0,4.5,3,5,27292,53010,1,19.9,1,2019-01-06 00:02:00.000-05:00,2019-01-10,9,1,PACKAGE_THICK_ENVELOPE,3
3,1,4677,1.0,2018-12-17 16:56:00.000-08:00,0,0.0,3,5,90703,80022,1,35.5,1,2018-12-16 10:28:28.000-08:00,2018-12-21,8,1,PACKAGE_THICK_ENVELOPE,4
4,1,4677,1.0,2018-07-27 16:48:00.000-07:00,0,0.0,3,5,90703,55070,1,25.0,1,2018-07-26 18:20:02.000-07:00,2018-07-30,3,1,PACKAGE_THICK_ENVELOPE,5


Check the seller_id Colluum


In [145]:
df_ebay['seller_id'].value_counts()

0         1065
11         191
1          182
4          140
40          99
          ... 
128044       1
301143       1
39414        1
50809        1
311534       1
Name: seller_id, Length: 57971, dtype: int64

In [146]:
#Check null
df_ebay['seller_id'].isna().any()

False

There is 2 `weight_units` is 1 and 2. Let say 1 - 's' and 2 - 'kg'. We are going to convert all the weight units to lb.

In [147]:
#check the record has weigtht unit =2 to able to compare
df_ebay[df_ebay['weight_units']==2].head()

Unnamed: 0,b2c_c2c,seller_id,declared_handling_days,acceptance_scan_timestamp,shipment_method_id,shipping_fee,carrier_min_estimate,carrier_max_estimate,item_zip,buyer_zip,category_id,item_price,quantity,payment_datetime,delivery_date,weight,weight_units,package_size,record_number


In [148]:
df_ebay['weight'] = np.where(df_ebay['weight_units']==1, df_ebay['weight'], df_ebay['weight']*2.20462 )

check the record with `weight_units`=2 to see if the `weight` column is change

In [149]:
df_ebay[df_ebay['weight_units']==2]

Unnamed: 0,b2c_c2c,seller_id,declared_handling_days,acceptance_scan_timestamp,shipment_method_id,shipping_fee,carrier_min_estimate,carrier_max_estimate,item_zip,buyer_zip,category_id,item_price,quantity,payment_datetime,delivery_date,weight,weight_units,package_size,record_number


All the `weight` with the `weight_units` = 2 are convert to the **lb** unit. So now we do not need to keep the weight column since all the weight is in the **lb**. Therefore, we are going to drop the `weight column`.

In [150]:
df_ebay.drop(columns='weight', axis=1, inplace=True)


In [151]:
#check it
df_ebay.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 18 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   b2c_c2c                    100000 non-null  int32  
 1   seller_id                  100000 non-null  int64  
 2   declared_handling_days     95042 non-null   float64
 3   acceptance_scan_timestamp  100000 non-null  object 
 4   shipment_method_id         100000 non-null  int64  
 5   shipping_fee               100000 non-null  float64
 6   carrier_min_estimate       100000 non-null  int64  
 7   carrier_max_estimate       100000 non-null  int64  
 8   item_zip                   100000 non-null  object 
 9   buyer_zip                  100000 non-null  object 
 10  category_id                100000 non-null  int64  
 11  item_price                 100000 non-null  float64
 12  quantity                   100000 non-null  int64  
 13  payment_datetime           100

The `weigth` column has been dropped. Next we are going to calculate the distance between the buyer and the seller buy using there zip code in the `item_zip` and `buyer_zip` columns.

Check the `package_size` column

In [152]:
#How many type of package 
df_ebay['package_size'].value_counts()

PACKAGE_THICK_ENVELOPE    84468
NONE                       7065
LETTER                     5787
LARGE_ENVELOPE             1495
LARGE_PACKAGE              1185
Name: package_size, dtype: int64

There are 5 type of package. so we can use the ordinal encoding to tranfer this column to numeric instead of category column. `'NONE': 0, 'LETTER': 1, 'LARGE_ENVELOPE': 2, 'LARGE_PACKAGE': 3, 'PACKAGE_THICK_ENVELOPE': 4`

In [153]:
oe_package= OrdinalEncoder(categories=[['NONE', 'LETTER', 'LARGE_ENVELOPE',
                                         'LARGE_PACKAGE', 'PACKAGE_THICK_ENVELOPE']])
df_ebay['package_size']=oe_package.fit_transform(pd.DataFrame(df_ebay['package_size'])) 

In [154]:
df_ebay['package_size'].value_counts()

4.0    84468
0.0     7065
1.0     5787
2.0     1495
3.0     1185
Name: package_size, dtype: int64

In [155]:
#check any null value in these columns.
print(df_ebay['item_zip'].isna().any())
print(df_ebay['buyer_zip'].isna().any())

False
False


In [156]:
item_zip= df_ebay['item_zip']
buyer_zip= df_ebay['buyer_zip']

In [157]:
# function to calculate distance between 2 zip codeusing Geocode. this function take more time to process. So We change to other package 
# country = Nominatim(user_agent="US")
# distance= []
# def distance_cal(item_zip, buyer_zip):
# # for i in range(len(buyer_zip)):
#     if buyer_zip and item_zip:
#         location1= country.geocode(item_zip)
#         location2= country.geocode(buyer_zip)
#         if location1 is None or location2 is None:
#             return None
#         lat1 = location1.latitude 
#         lon1 = location1.longitude
#         lat2 = location2.latitude
#         lon2 = location2.longitude
#         # distance between them
#         if lat1 is not None or lon1 is not None or lat2 is not None or lon2 is not None:
#             return geodesic((lat1, lon1), (lat2, lon2)).miles
#         else:
#             return None
#     else:
#         return None   
    
# def add_distance_col(item_zip, buyer_zip):
#     item = item_zip.apply(lambda x: str(x))
#     buyer = buyer_zip.apply(lambda x: str(x))

#     zips= pd.concat([item, buyer], axis=1)
#     zips['distance']= zips.apply(lambda x: distance_cal(x.item_zip, x.buyer_zip), axis=1)
#     return(zips)
# add_distance_col(item_zip, buyer_zip)

In [158]:
import mpu
from uszipcode import SearchEngine

def get_distance(item_zip, buyer_zip):
    """
    We are going to use the package mpu and uszipcode to imporve the speed of calculating the distance between buyer and seller by using zip code. 
    These 2 packages make the speed of calculation much faster compare with using the geopy package
    """
    if item_zip is not None and buyer_zip is not None:
        search = SearchEngine()
        item_location = search.by_zipcode(item_zip[0:5])
        buyer_location =search.by_zipcode(buyer_zip[0:5])
        if item_location is None or buyer_location is None:
            return None
        else:
            lat1 =item_location.lat
            long1 =item_location.lng
            lat2 =buyer_location.lat
            long2 =buyer_location.lng
            if lat1 is None or lat2 is None or long1 is None or long2 is None:
                return None
            return mpu.haversine_distance((lat1,long1),(lat2,long2)) 
    else:
        return None


def add_zip_distance_column(item_zip, buyer_zip):
    item_zip_str = item_zip.apply(lambda x: str(x))
    buyer_zip_str = buyer_zip.apply(lambda x: str(x))

    zips = pd.concat([item_zip_str, buyer_zip_str], axis=1)

    zips['distance'] = zips.apply(lambda x: get_distance(x.item_zip, x.buyer_zip), axis=1)
    
    return zips['distance']

In [159]:
distance = add_zip_distance_column(item_zip, buyer_zip)

Add the value of the distance in to the data frame

In [160]:
df_ebay['distance']= distance
df_ebay.head(10)

Unnamed: 0,b2c_c2c,seller_id,declared_handling_days,acceptance_scan_timestamp,shipment_method_id,shipping_fee,carrier_min_estimate,carrier_max_estimate,item_zip,buyer_zip,category_id,item_price,quantity,payment_datetime,delivery_date,weight_units,package_size,record_number,distance
0,1,25454,3.0,2019-03-26 15:11:00.000-07:00,0,0.0,3,5,97219,49040,13,27.95,1,2019-03-24 03:56:49.000-07:00,2019-03-29,1,1.0,1,3001.839179
1,0,6727381,2.0,2018-06-02 12:53:00.000-07:00,0,3.0,3,5,11415-3528,62521,0,20.5,1,2018-06-01 13:43:54.000-07:00,2018-06-05,1,4.0,2,1282.908017
2,1,18507,1.0,2019-01-07 16:22:00.000-05:00,0,4.5,3,5,27292,53010,1,19.9,1,2019-01-06 00:02:00.000-05:00,2019-01-10,1,4.0,3,1104.444168
3,1,4677,1.0,2018-12-17 16:56:00.000-08:00,0,0.0,3,5,90703,80022,1,35.5,1,2018-12-16 10:28:28.000-08:00,2018-12-21,1,4.0,4,1353.390003
4,1,4677,1.0,2018-07-27 16:48:00.000-07:00,0,0.0,3,5,90703,55070,1,25.0,1,2018-07-26 18:20:02.000-07:00,2018-07-30,1,4.0,5,2456.330752
5,1,10514,1.0,2019-04-19 19:42:00.000-04:00,0,0.0,3,5,43215,77063,3,10.39,1,2019-04-18 14:11:09.000-04:00,2019-04-22,1,4.0,6,1608.340906
6,1,104,1.0,2019-02-08 17:35:00.000-08:00,0,0.0,3,5,91304,60565,11,5.7,1,2019-02-08 09:33:13.000-08:00,2019-02-11,1,4.0,7,2781.527939
7,1,340356,1.0,2018-04-23 17:31:00.000-04:00,0,2.95,3,5,49735,29379,1,6.0,1,2018-04-22 18:32:04.000-04:00,2018-04-25,1,4.0,8,1174.270246
8,1,113915,5.0,2019-10-12 09:22:00.000-04:00,3,0.0,2,8,43606,32958,18,5.55,1,2019-10-11 04:54:25.000-04:00,2019-10-15,1,0.0,9,1568.8678
9,1,130301,1.0,2019-08-09 11:24:00.000-05:00,1,0.0,2,5,35117,84776,13,59.98,1,2019-08-08 12:47:14.000-05:00,2019-08-12,1,4.0,10,2311.01545


Working with `acceptance_scan_timestamp` and `payment_datetime`. convert all in to buyer timezone

In [161]:
df_ebay[['acceptance_scan_timestamp', 'payment_datetime']]

Unnamed: 0,acceptance_scan_timestamp,payment_datetime
0,2019-03-26 15:11:00.000-07:00,2019-03-24 03:56:49.000-07:00
1,2018-06-02 12:53:00.000-07:00,2018-06-01 13:43:54.000-07:00
2,2019-01-07 16:22:00.000-05:00,2019-01-06 00:02:00.000-05:00
3,2018-12-17 16:56:00.000-08:00,2018-12-16 10:28:28.000-08:00
4,2018-07-27 16:48:00.000-07:00,2018-07-26 18:20:02.000-07:00
...,...,...
99995,2018-07-23 17:38:00.000-07:00,2018-07-22 12:35:12.000-07:00
99996,2018-04-17 10:16:00.000-05:00,2018-04-16 08:55:56.000-05:00
99997,2018-04-27 11:41:00.000-04:00,2018-04-26 11:52:36.000-04:00
99998,2018-08-20 21:05:00.000-04:00,2018-08-20 10:31:39.000-04:00


In [162]:
accept_date= pd.to_datetime(df_ebay['acceptance_scan_timestamp'])#.replace(tzinfo= timezone.utc).astimezone(tz=None)


In [163]:
pay_date= pd.to_datetime(df_ebay['payment_datetime'])#.replace(tzinfo= timezone.utc).astimezone(tz=None)

In [164]:
pay_date

0        2019-03-24 03:56:49-07:00
1        2018-06-01 13:43:54-07:00
2        2019-01-06 00:02:00-05:00
3        2018-12-16 10:28:28-08:00
4        2018-07-26 18:20:02-07:00
                   ...            
99995    2018-07-22 12:35:12-07:00
99996    2018-04-16 08:55:56-05:00
99997    2018-04-26 11:52:36-04:00
99998    2018-08-20 10:31:39-04:00
99999    2018-07-21 18:10:24-07:00
Name: payment_datetime, Length: 100000, dtype: object

In [165]:
accept_date

0        2019-03-26 15:11:00-07:00
1        2018-06-02 12:53:00-07:00
2        2019-01-07 16:22:00-05:00
3        2018-12-17 16:56:00-08:00
4        2018-07-27 16:48:00-07:00
                   ...            
99995    2018-07-23 17:38:00-07:00
99996    2018-04-17 10:16:00-05:00
99997    2018-04-27 11:41:00-04:00
99998    2018-08-20 21:05:00-04:00
99999    2018-07-23 11:58:00-07:00
Name: acceptance_scan_timestamp, Length: 100000, dtype: object

In [166]:
#Date convert function
def conver_date_format(date_list):
    output_format = "%Y-%m-%d %H:%M:%S"
    dates=[]
    # input_date= datetime.strftime(date[0], input_format)
    for day in date:    
        dates.append(datetime.strftime(day, output_format))
    return dates

In [167]:
df_ebay['acceptance_scan_timestamp']= conver_date_format(accept_date)

In [168]:
df_ebay['acceptance_scan_timestamp']

0        2019-03-26 15:11:00
1        2018-06-02 12:53:00
2        2019-01-07 16:22:00
3        2018-12-17 16:56:00
4        2018-07-27 16:48:00
                ...         
99995    2018-07-23 17:38:00
99996    2018-04-17 10:16:00
99997    2018-04-27 11:41:00
99998    2018-08-20 21:05:00
99999    2018-07-23 11:58:00
Name: acceptance_scan_timestamp, Length: 100000, dtype: object

In [169]:
df_ebay['payment_datetime']= conver_date_format(pay_date)

In [126]:
df_ebay['payment_datetime']

0        2019-03-26 15:11:00
1        2018-06-02 12:53:00
2        2019-01-07 16:22:00
3        2018-12-17 16:56:00
4        2018-07-27 16:48:00
                ...         
99995    2018-07-23 17:38:00
99996    2018-04-17 10:16:00
99997    2018-04-27 11:41:00
99998    2018-08-20 21:05:00
99999    2018-07-23 11:58:00
Name: payment_datetime, Length: 100000, dtype: object

Take a look at th data  after change

In [171]:
df_ebay.sample(10)

Unnamed: 0,b2c_c2c,seller_id,declared_handling_days,acceptance_scan_timestamp,shipment_method_id,shipping_fee,carrier_min_estimate,carrier_max_estimate,item_zip,buyer_zip,category_id,item_price,quantity,payment_datetime,delivery_date,weight_units,package_size,record_number,distance
36701,1,8681,3.0,2018-05-12 15:48:00,2,0.0,2,9,1012,20770,0,12.14,1,2018-05-12 15:48:00,2018-05-21,1,1.0,36702,508.997576
24821,1,89,1.0,2019-01-15 11:47:00,1,0.0,2,5,30540,62231,9,26.48,1,2019-01-15 11:47:00,2019-01-22,1,4.0,24822,616.493761
21169,1,1346,1.0,2019-01-16 14:40:00,0,0.0,3,5,60016,94591,10,44.0,5,2019-01-16 14:40:00,2019-01-18,1,4.0,21170,2931.785496
34247,1,93242,1.0,2019-10-21 13:59:00,5,0.0,2,5,55304,85206,20,23.27,1,2019-10-21 13:59:00,2019-10-29,1,4.0,34248,2050.219217
45652,1,22554,1.0,2018-11-30 16:00:00,0,3.82,3,5,10306,15005,5,17.5,1,2018-11-30 16:00:00,2018-12-07,1,4.0,45653,513.206469
49391,0,240464,1.0,2018-05-14 15:52:00,2,0.0,2,9,10310,6840,5,18.1,1,2018-05-14 15:52:00,2018-05-16,1,1.0,49392,77.593961
20808,1,6877,1.0,2018-12-17 22:54:00,1,0.0,2,5,46580,79316,2,38.0,1,2018-12-17 22:54:00,2018-12-20,1,4.0,20809,1703.946186
81353,1,2835,1.0,2019-04-03 12:55:00,0,3.95,3,5,60441,45039,30,30.0,1,2019-04-03 12:55:00,2019-04-08,1,4.0,81354,410.058741
58974,1,312,1.0,2019-05-20 14:49:00,0,0.0,3,5,32903,32068,8,9.99,3,2019-05-20 14:49:00,2019-05-22,1,4.0,58975,254.839211
62248,0,6189402,2.0,2018-03-16 12:06:00,1,20.0,2,5,13090,17356,0,102.5,1,2018-03-16 12:06:00,2018-03-21,1,4.0,62249,363.945398
