# Kieran Molloy UCDPA Project 
### Course CIDAB 2022-01-18
***

### Environment Information
<table align="left">
<tr>
    <th>Environment Type</th>
    <th>Anaconda Version</th>
    <th>Anaconda Build Channel</th>
    <th>Python Version</th>
</tr>
<tr>
    <td>Anaconda </td>
    <td>2021.11 </td>
    <td>py39_0 </td>
    <td>3.9.7 </td>
</tr>
</table>

## Links to Kaggle Datasource: 
***

#### _[Transactional Retail Dataset of Electronics Store](https://www.kaggle.com/datasets/muhammadshahrayar/transactional-retail-dataset-of-electronics-store)_

*  _[dirty_data.csv](https://www.kaggle.com/datasets/muhammadshahrayar/transactional-retail-dataset-of-electronics-store?select=dirty_data.csv)_
*  _[missing_data.csv](https://www.kaggle.com/datasets/muhammadshahrayar/transactional-retail-dataset-of-electronics-store?select=missing_data.csv)_
*  _[warehouses.csv](https://www.kaggle.com/datasets/muhammadshahrayar/transactional-retail-dataset-of-electronics-store?select=warehouses.csv)_


## Install Modules

In [1377]:
# %pip install geopy

Note: you may need to restart the kernel to use updated packages.


## Import Modules
***

In [1378]:
from matplotlib import pyplot as plt
from IPython.display import display 

import pandas as pd
import numpy as np
import seaborn as sns

from datetime import datetime

from geopy.distance import geodesic
from geopy.geocoders import Nominatim

## Set Variables
***

In [1216]:
dirty_data = 'dirty_data.csv'
missing_data = 'missing_data.csv'
warehouses = 'warehouses.csv'

## Functions
***

In [1303]:
def get_missing_nearest_warehouse(lat , Long ):
    ''' use coordinates to find the closest warehouse (clrow flies)'''
    
    # customer coordinates are provided
    coords_customer = lat, Long
    
    # get warehouse latitude and longitide from warehouse table
    coords_Nickolson = df_warehouses['lat'].loc[df_warehouses.index[0]], df_warehouses['lon'].loc[df_warehouses.index[0]]
    coords_Thompson = df_warehouses['lat'].loc[df_warehouses.index[1]], df_warehouses['lon'].loc[df_warehouses.index[1]]
    coords_Bakers = df_warehouses['lat'].loc[df_warehouses.index[2]], df_warehouses['lon'].loc[df_warehouses.index[2]]
    
    # use geodesic to do a km distance comparison return the min value
    Nickolson_cust_dist = geodesic(coords_customer, coords_Nickolson).km
    Thompson_cust_dist = geodesic(coords_customer, coords_Thompson).km
    Bakers_cust_dist = geodesic(coords_customer, coords_Bakers).km
    
    val = (Nickolson_cust_dist ,Thompson_cust_dist ,Bakers_cust_dist)
    
    min_dist = val.index(min(val))
    
    if min_dist == 0:
        warehouse = 'Nickolson'
        distance = Nickolson_cust_dist
        return(warehouse , distance)
    elif min_dist == 1:
        warehouse = 'Thompson'
        distance = Nickolson_cust_dist
        return(warehouse , distance)
    else: 
        warehouse = 'Bakers'
        distance = Nickolson_cust_dist
        return(str(warehouse) , distance)

In [1218]:
def get_missing_order_price(order_total, delivery_charges, coupon_discount):
    ''' calculate order_price from order_total, delivery_charges and coupon_discount '''
    order_price = (order_total - delivery_charges) / (100-coupon_discount) * 100
    return(round(order_price, 2))

In [1219]:
def get_missing_order_total(order_price, delivery_charges, coupon_discount):
    ''' calculate order_total from order_price, delivery_charges and coupon_discount'''
    order_total = order_price - (order_price / 100 * coupon_discount) + delivery_charges
    return(round(order_total, 2))

In [1220]:
def get_missing_season(date):
    '''calculate season from date'''
    # Spring = months 9,10,11
    # summer = months 12,1,2
    # Autumn = months 3,4,5
    # Winter = months 6,7,8

    month = date.month

    if month in [9,10,11]:
        season = 'Spring'
    elif month in [12,1,2]:
        season = 'Summer'
    elif month in [3,4,5]:
        season = 'Autumn'
    else:
        season = 'Winter'
    
    return(season)

## CSV Initial Import for Review
***

In [1221]:
# read the CSV from the local Jupyter Notebook directory 
df_dirty_data = pd.read_csv(dirty_data)
df_missing_data = pd.read_csv(missing_data)
df_warehouses = pd.read_csv(warehouses)

## Initial CSV Review
***

### df_dirty_data dataframe review

In [1222]:
type(df_dirty_data)

pandas.core.frame.DataFrame

In [1223]:
print(df_dirty_data.columns)

Index(['order_id', 'customer_id', 'date', 'nearest_warehouse', 'shopping_cart',
       'order_price', 'delivery_charges', 'customer_lat', 'customer_long',
       'coupon_discount', 'order_total', 'season', 'is_expedited_delivery',
       'distance_to_nearest_warehouse', 'latest_customer_review',
       'is_happy_customer'],
      dtype='object')


In [1224]:
df_dirty_data.head(10)

Unnamed: 0,order_id,customer_id,date,nearest_warehouse,shopping_cart,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,season,is_expedited_delivery,distance_to_nearest_warehouse,latest_customer_review,is_happy_customer
0,ORD182494,ID6197211592,2019-06-22,Thompson,"[('Lucent 330S', 1), ('Thunder line', 2), ('iS...",12200,79.89,-37.815105,144.932843,10,11059.89,Winter,True,1.28,perfect phone and trusted seller. phone itself...,True
1,ORD395518,ID0282825849,2019-12-29,Thompson,"[('Thunder line', 1), ('Universe Note', 2)]",9080,62.71,-37.802736,144.951118,0,9142.71,Summer,False,1.1621,it keeps dropping calls the wifi don't work th...,False
2,ORD494479,ID0579391891,2019-03-02,Nickolson,"[('Thunder line', 1), ('pearTV', 2)]",10670,65.87,-37.821302,144.957581,10,9668.87,Autumn,False,1.0949,five stars this is a great cheap phone.,True
3,ORD019224,ID4544561904,2019-01-12,Nickolson,"[('Universe Note', 1), ('Alcon 10', 2), ('Oliv...",24800,57.61,-37.811416,144.973073,15,21137.61,Summer,False,0.8571,charger did not fit the charger didn't fit.,False
4,ORD104032,ID6231506320,2019-11-28,Nickolson,"[('Universe Note', 1), ('Olivia x460', 1), ('i...",9145,75.54,37.823859,144.969892,25,6934.29,Spring,False,0.5867,four stars good,True
5,ORD146760,ID0311654900,2019-09-16,Bakers,"[('Thunder line', 2), ('Universe Note', 1)]",7810,71.22,37.820245,145.014944,10,7100.22,Spring,False,2.0752,stolen phone sold us a stolen phone so we coul...,False
6,ORD337984,ID3394768956,2019-09-14,Thompson,"[('Candle Inferno', 1), ('Alcon 10', 1), ('Tos...",13700,74.84,-37.807743,144.95157,5,13089.84,Spring,False,0.6767,"love our inferno stick,easy to set up and have...",True
7,ORD072312,ID0774517121,2019-05-23,Thompson,"[('Universe Note', 1), ('Thunder line', 2), ('...",7960,52.28,-37.806337,144.959544,5,10789.79,Autumn,False,1.3043,it sucks mine came with dead pixels,False
8,ORD377837,ID4769265355,2019-10-09,Bakers,"[('Alcon 10', 2), ('Thunder line', 1), ('Candl...",25390,107.58,-37.81081,145.014073,10,22958.58,Spring,True,1.6595,this is how top phone should look like! super ...,True
9,ORD462194,ID5301568579,2019-03-21,Thompson,"[('Universe Note', 1), ('Lucent 330S', 1), ('T...",13320,62.26,-37.808675,144.942337,15,11384.26,winter,True,0.6093,does not live up to its reputation. customer s...,False


In [1225]:
# using display command for better formatting in Notebook , use print usually
display(df_dirty_data.describe())

Unnamed: 0,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,distance_to_nearest_warehouse
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,25522.216,76.6582,-35.835234,144.969494,10.89,39209.67,2.204224
std,86333.729169,14.481465,12.045393,0.02272,8.649134,274194.0,8.812416
min,585.0,46.35,-37.827123,144.924967,0.0,639.29,0.1078
25%,7050.0,65.9825,-37.818222,144.953488,5.0,6454.735,0.751425
50%,12807.5,76.31,-37.812165,144.965357,10.0,11293.96,1.0301
75%,20360.0,82.555,-37.805364,144.983985,15.0,18119.19,1.408625
max,947691.0,114.04,37.826339,145.019837,25.0,5688270.0,94.9734


In [1226]:
print(df_dirty_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   order_id                       500 non-null    object 
 1   customer_id                    500 non-null    object 
 2   date                           500 non-null    object 
 3   nearest_warehouse              500 non-null    object 
 4   shopping_cart                  500 non-null    object 
 5   order_price                    500 non-null    int64  
 6   delivery_charges               500 non-null    float64
 7   customer_lat                   500 non-null    float64
 8   customer_long                  500 non-null    float64
 9   coupon_discount                500 non-null    int64  
 10  order_total                    500 non-null    float64
 11  season                         500 non-null    object 
 12  is_expedited_delivery          500 non-null    boo

In [1227]:
print(df_dirty_data.shape)

(500, 16)


In [1228]:
print(df_dirty_data.isna().any())

order_id                         False
customer_id                      False
date                             False
nearest_warehouse                False
shopping_cart                    False
order_price                      False
delivery_charges                 False
customer_lat                     False
customer_long                    False
coupon_discount                  False
order_total                      False
season                           False
is_expedited_delivery            False
distance_to_nearest_warehouse    False
latest_customer_review           False
is_happy_customer                False
dtype: bool


In [1229]:
print(df_dirty_data.isna().sum())

order_id                         0
customer_id                      0
date                             0
nearest_warehouse                0
shopping_cart                    0
order_price                      0
delivery_charges                 0
customer_lat                     0
customer_long                    0
coupon_discount                  0
order_total                      0
season                           0
is_expedited_delivery            0
distance_to_nearest_warehouse    0
latest_customer_review           0
is_happy_customer                0
dtype: int64


### df_missing_data dataframe review

In [1230]:
type(df_missing_data)

pandas.core.frame.DataFrame

In [1231]:
print(df_missing_data.columns)

Index(['order_id', 'customer_id', 'date', 'nearest_warehouse', 'shopping_cart',
       'order_price', 'delivery_charges', 'customer_lat', 'customer_long',
       'coupon_discount', 'order_total', 'season', 'is_expedited_delivery',
       'distance_to_nearest_warehouse', 'latest_customer_review',
       'is_happy_customer'],
      dtype='object')


In [1232]:
df_missing_data.head(10)

Unnamed: 0,order_id,customer_id,date,nearest_warehouse,shopping_cart,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,season,is_expedited_delivery,distance_to_nearest_warehouse,latest_customer_review,is_happy_customer
0,ORD382112,ID0289597187,2019-03-06,Thompson,"[('pearTV', 1), ('Candle Inferno', 2)]",7170.0,61.72,-37.811782,144.951972,15,6156.22,Autumn,True,0.4425,"to many adds in ui it has adds, sends info to ...",False
1,ORD378488,ID1668523020,2019-05-05,Thompson,"[('Thunder line', 1), ('Lucent 330S', 2), ('Al...",13590.0,65.2,-37.808236,144.942181,10,12296.2,Autumn,False,0.6547,s10e haven't had this phone but a short time b...,True
2,ORD279446,ID0370751503,2019-03-24,Bakers,"[('Candle Inferno', 2), ('Universe Note', 1)]",4310.0,78.75,-37.812941,144.985883,15,3742.25,,True,0.8852,best bang for your buck! great overall android...,True
3,ORD277196,ID0634774947,2019-01-12,Thompson,"[('pearTV', 1), ('Thunder line', 1), ('Olivia ...",10940.0,92.59,-37.811748,144.938192,25,8297.59,Summer,True,0.7875,good it was a gift for my niece. it to her in ...,True
4,ORD116193,ID3313210924,2019-09-02,Nickolson,"[('pearTV', 2), ('Universe Note', 2), ('Thunde...",21700.0,95.62,-37.818393,144.967034,5,20710.62,Spring,True,0.2225,"excellent product excellent product, was recei...",True
5,ORD005004,ID0472236192,2019-05-07,Nickolson,"[('Thunder line', 2), ('Lucent 330S', 2)]",6820.0,84.09,-37.801502,144.966104,0,6904.09,Autumn,True,1.9268,i love it very nice and good product would rec...,True
6,ORD296379,ID0591306178,2019-01-19,Nickolson,"[('Thunder line', 1), ('pearTV', 1)]",8490.0,77.51,-37.8237,144.956314,0,8567.51,Summer,False,1.2953,great phone great phone i am still using that ...,True
7,ORD447851,ID0591430562,2019-02-07,,"[('Alcon 10', 2), ('Toshika 750', 2), ('Univer...",32440.0,67.27,-37.817412,144.96759,5,30885.27,Summer,False,0.2169,"five stars it was a gift, my nephew is loving it.",True
8,ORD078449,ID0030287324,2019-10-20,Nickolson,"[('Olivia x460', 2), ('iAssist Line', 2)]",6900.0,105.5,-37.807081,144.967288,15,5970.5,Spring,True,1.2971,definitively would buy again. has been great t...,True
9,ORD018258,ID0245537598,2019-05-27,Nickolson,"[('Universe Note', 1), ('Lucent 330S', 1), ('i...",7205.0,65.39,-37.810164,144.966819,15,6189.64,Autumn,False,0.9688,my best snapchat phone my snapchats are crysta...,True


In [1233]:
# using display command for better formatting in Notebook , use print usually
display(df_missing_data.describe())

Unnamed: 0,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,distance_to_nearest_warehouse
count,490.0,500.0,490.0,490.0,500.0,490.0,490.0
mean,13217.867347,77.68338,-37.812597,144.966114,11.19,11939.84102,1.077335
std,7552.951453,14.566667,0.007593,0.021054,8.734213,7032.605471,0.509914
min,580.0,46.2,-37.828216,144.921217,0.0,568.64,0.0549
25%,7140.0,66.975,-37.818755,144.951541,5.0,6486.5425,0.723625
50%,12180.0,77.275,-37.813053,144.963409,10.0,10756.615,1.04335
75%,18327.5,85.2175,-37.806714,144.980157,15.0,16315.555,1.389525
max,37300.0,110.99,-37.788265,145.017015,25.0,37362.47,3.1388


In [1234]:
print(df_missing_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   order_id                       500 non-null    object 
 1   customer_id                    500 non-null    object 
 2   date                           500 non-null    object 
 3   nearest_warehouse              490 non-null    object 
 4   shopping_cart                  500 non-null    object 
 5   order_price                    490 non-null    float64
 6   delivery_charges               500 non-null    float64
 7   customer_lat                   490 non-null    float64
 8   customer_long                  490 non-null    float64
 9   coupon_discount                500 non-null    int64  
 10  order_total                    490 non-null    float64
 11  season                         490 non-null    object 
 12  is_expedited_delivery          500 non-null    boo

In [1235]:
print(df_missing_data.shape)

(500, 16)


In [1236]:
print(df_missing_data.isna().any())

order_id                         False
customer_id                      False
date                             False
nearest_warehouse                 True
shopping_cart                    False
order_price                       True
delivery_charges                 False
customer_lat                      True
customer_long                     True
coupon_discount                  False
order_total                       True
season                            True
is_expedited_delivery            False
distance_to_nearest_warehouse     True
latest_customer_review           False
is_happy_customer                 True
dtype: bool


In [1237]:
print(df_missing_data.isna().sum())

order_id                          0
customer_id                       0
date                              0
nearest_warehouse                10
shopping_cart                     0
order_price                      10
delivery_charges                  0
customer_lat                     10
customer_long                    10
coupon_discount                   0
order_total                      10
season                           10
is_expedited_delivery             0
distance_to_nearest_warehouse    10
latest_customer_review            0
is_happy_customer                10
dtype: int64


### df_warehouses dataframe review

In [1238]:
type(df_warehouses)

pandas.core.frame.DataFrame

In [1239]:
print(df_warehouses.columns)

Index(['names', 'lat', 'lon'], dtype='object')


In [1240]:
type(df_warehouses)

pandas.core.frame.DataFrame

In [1241]:
df_warehouses.head()

Unnamed: 0,names,lat,lon
0,Nickolson,-37.818595,144.969551
1,Thompson,-37.812673,144.947069
2,Bakers,-37.809996,144.995232


In [1242]:
# using display command for better formatting in Notebook , use print usually
display(df_warehouses.describe())

Unnamed: 0,lat,lon
count,3.0,3.0
mean,-37.813755,144.970617
std,0.0044,0.024099
min,-37.818595,144.947069
25%,-37.815634,144.95831
50%,-37.812673,144.969551
75%,-37.811335,144.982392
max,-37.809996,144.995232


In [1243]:
print(df_warehouses.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   names   3 non-null      object 
 1   lat     3 non-null      float64
 2   lon     3 non-null      float64
dtypes: float64(2), object(1)
memory usage: 200.0+ bytes
None


In [1244]:
print(df_warehouses.shape)

(3, 3)


In [1245]:
print(df_warehouses.isna().any())

names    False
lat      False
lon      False
dtype: bool


In [1246]:
print(df_warehouses.isna().sum())

names    0
lat      0
lon      0
dtype: int64


## Re-import from CSV
***

In [1247]:
# read the CSV from the local Jupyter Notebook directory 
# parsing dates on import since dates were objects also change datatype on order_price
df_dirty_data = pd.read_csv(dirty_data, parse_dates=['date'], dtype={'order_price': 'float'}) 
df_missing_data = pd.read_csv(missing_data, parse_dates=['date']) 

# cant use dtype={'is_happy_customer': 'bool'} because is_happy_customer contains NaN 

In [1248]:
print(df_dirty_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       500 non-null    object        
 1   customer_id                    500 non-null    object        
 2   date                           500 non-null    datetime64[ns]
 3   nearest_warehouse              500 non-null    object        
 4   shopping_cart                  500 non-null    object        
 5   order_price                    500 non-null    float64       
 6   delivery_charges               500 non-null    float64       
 7   customer_lat                   500 non-null    float64       
 8   customer_long                  500 non-null    float64       
 9   coupon_discount                500 non-null    int64         
 10  order_total                    500 non-null    float64       
 11  season             

In [1249]:
print(df_missing_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       500 non-null    object        
 1   customer_id                    500 non-null    object        
 2   date                           500 non-null    datetime64[ns]
 3   nearest_warehouse              490 non-null    object        
 4   shopping_cart                  500 non-null    object        
 5   order_price                    490 non-null    float64       
 6   delivery_charges               500 non-null    float64       
 7   customer_lat                   490 non-null    float64       
 8   customer_long                  490 non-null    float64       
 9   coupon_discount                500 non-null    int64         
 10  order_total                    490 non-null    float64       
 11  season             

In [1250]:
# check for orderid uniqueness in df_dirty_data
duplicate_dirty_data = df_dirty_data[df_dirty_data.duplicated(['order_id'])]
print(duplicate_dirty_data)

Empty DataFrame
Columns: [order_id, customer_id, date, nearest_warehouse, shopping_cart, order_price, delivery_charges, customer_lat, customer_long, coupon_discount, order_total, season, is_expedited_delivery, distance_to_nearest_warehouse, latest_customer_review, is_happy_customer]
Index: []


In [1251]:
# check for orderid uniqueness in df_missing_data 
duplicate_missing_data = df_missing_data[df_missing_data.duplicated(['order_id'])]
print(duplicate_missing_data)

Empty DataFrame
Columns: [order_id, customer_id, date, nearest_warehouse, shopping_cart, order_price, delivery_charges, customer_lat, customer_long, coupon_discount, order_total, season, is_expedited_delivery, distance_to_nearest_warehouse, latest_customer_review, is_happy_customer]
Index: []


In [1252]:
# create an order_id index on df_dirty_data
df_dirty_data.set_index('order_id',inplace=True)

In [1253]:
# create an order_id index on df_missing_data
df_missing_data.set_index('order_id',inplace=True)

In [1254]:
df_dirty_data.index

Index(['ORD182494', 'ORD395518', 'ORD494479', 'ORD019224', 'ORD104032',
       'ORD146760', 'ORD337984', 'ORD072312', 'ORD377837', 'ORD462194',
       ...
       'ORD182549', 'ORD435271', 'ORD082002', 'ORD034351', 'ORD379700',
       'ORD475510', 'ORD086060', 'ORD079320', 'ORD026546', 'ORD085447'],
      dtype='object', name='order_id', length=500)

In [1255]:
df_missing_data.index

Index(['ORD382112', 'ORD378488', 'ORD279446', 'ORD277196', 'ORD116193',
       'ORD005004', 'ORD296379', 'ORD447851', 'ORD078449', 'ORD018258',
       ...
       'ORD273851', 'ORD329425', 'ORD277938', 'ORD297371', 'ORD218866',
       'ORD289820', 'ORD425999', 'ORD252675', 'ORD215989', 'ORD414852'],
      dtype='object', name='order_id', length=500)

## Detect and fix errors in df_dirty_data
***

In [1256]:
# show missing values - none found
print(df_dirty_data.isna().sum())

customer_id                      0
date                             0
nearest_warehouse                0
shopping_cart                    0
order_price                      0
delivery_charges                 0
customer_lat                     0
customer_long                    0
coupon_discount                  0
order_total                      0
season                           0
is_expedited_delivery            0
distance_to_nearest_warehouse    0
latest_customer_review           0
is_happy_customer                0
dtype: int64


In [1257]:
print(df_dirty_data.info())

<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, ORD182494 to ORD085447
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   customer_id                    500 non-null    object        
 1   date                           500 non-null    datetime64[ns]
 2   nearest_warehouse              500 non-null    object        
 3   shopping_cart                  500 non-null    object        
 4   order_price                    500 non-null    float64       
 5   delivery_charges               500 non-null    float64       
 6   customer_lat                   500 non-null    float64       
 7   customer_long                  500 non-null    float64       
 8   coupon_discount                500 non-null    int64         
 9   order_total                    500 non-null    float64       
 10  season                         500 non-null    object        
 11  is_expedit

In [1258]:
display(df_dirty_data.head())

Unnamed: 0_level_0,customer_id,date,nearest_warehouse,shopping_cart,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,season,is_expedited_delivery,distance_to_nearest_warehouse,latest_customer_review,is_happy_customer
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ORD182494,ID6197211592,2019-06-22,Thompson,"[('Lucent 330S', 1), ('Thunder line', 2), ('iS...",12200.0,79.89,-37.815105,144.932843,10,11059.89,Winter,True,1.28,perfect phone and trusted seller. phone itself...,True
ORD395518,ID0282825849,2019-12-29,Thompson,"[('Thunder line', 1), ('Universe Note', 2)]",9080.0,62.71,-37.802736,144.951118,0,9142.71,Summer,False,1.1621,it keeps dropping calls the wifi don't work th...,False
ORD494479,ID0579391891,2019-03-02,Nickolson,"[('Thunder line', 1), ('pearTV', 2)]",10670.0,65.87,-37.821302,144.957581,10,9668.87,Autumn,False,1.0949,five stars this is a great cheap phone.,True
ORD019224,ID4544561904,2019-01-12,Nickolson,"[('Universe Note', 1), ('Alcon 10', 2), ('Oliv...",24800.0,57.61,-37.811416,144.973073,15,21137.61,Summer,False,0.8571,charger did not fit the charger didn't fit.,False
ORD104032,ID6231506320,2019-11-28,Nickolson,"[('Universe Note', 1), ('Olivia x460', 1), ('i...",9145.0,75.54,37.823859,144.969892,25,6934.29,Spring,False,0.5867,four stars good,True


In [1302]:
columns = df_dirty_data.columns
print(len(columns))

15


### Rename 'nearest_warehouse' values

In [1306]:
# getting the unique values for 'nearest_warehouse'
nearest_warehouse = df_dirty_data['nearest_warehouse'].sort_values().unique()
print(nearest_warehouse)

['Bakers' 'Nickolson' 'Thompson']


In [1305]:
df_dirty_data['nearest_warehouse'] = df_dirty_data['nearest_warehouse'].str.title()

### Rename 'season' values

In [1308]:
# getting the unique values for 'season'
season = df_dirty_data['season'].sort_values().unique()
print(season)

['Autumn' 'Spring' 'Summer' 'Winter']


In [1307]:
df_dirty_data['season'] = df_dirty_data['season'].str.title()

### Check 'order_price' and 'order_total' values dont seem correct

In [1337]:
type(df_dirty_data[['order_price','delivery_charges','coupon_discount','order_total']])

pandas.core.frame.DataFrame

In [1375]:
arr = df_dirty_data[['order_price']]

print("Q2 quantile of order_price : ", np.quantile(arr, .50))
print("Q1 quantile of order_price : ", np.quantile(arr, .25))
print("Q3 quantile of order_price : ", np.quantile(arr, .75))
print("100th quantile of order_price : ", np.quantile(arr, .1))


Q2 quantile of order_price :  12807.5
Q1 quantile of order_price :  7050.0
Q3 quantile of order_price :  20360.0
100th quantile of order_price :  4002.000000000002


In [1365]:
# sorted by 'order_total','order_price'
df_dirty_data[['shopping_cart','order_price','delivery_charges','coupon_discount','order_total']].sort_values(ascending = False, by=['order_total','order_price']).head(40)

Unnamed: 0_level_0,shopping_cart,order_price,delivery_charges,coupon_discount,order_total
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ORD039111,"[('Thunder line', 1), ('Toshika 750', 2)]",10820.0,64.03,0,5688269.6
ORD139508,"[('Toshika 750', 1), ('Candle Inferno', 1), ('Alcon 10', 1), ('Olivia x460', 2)]",16150.0,62.7,5,969651.79
ORD451441,"[('iStream', 1), ('Toshika 750', 2)]",8790.0,85.53,5,967408.88
ORD124395,"[('Alcon 10', 1), ('Universe Note', 1), ('pearTV', 1), ('iStream', 2)]",19010.0,94.75,0,926057.25
ORD015029,"[('Olivia x460', 1), ('Lucent 330S', 1)]",2455.0,77.02,5,909463.94
ORD057375,"[('Thunder line', 1), ('Alcon 10', 1), ('Candle Inferno', 1), ('Universe Note', 2)]",18460.0,93.52,25,665085.66
ORD244594,"[('pearTV', 2), ('Universe Note', 2), ('Thunder line', 1), ('Candle Inferno', 1)]",22130.0,108.92,25,649193.76
ORD450897,"[('Olivia x460', 2), ('pearTV', 2), ('Lucent 330S', 2)]",17530.0,79.05,0,648477.6
ORD481832,"[('Universe Note', 2), ('Candle Inferno', 2), ('pearTV', 1)]",14070.0,73.33,15,620502.3
ORD131352,"[('Candle Inferno', 2), ('Universe Note', 2)]",7760.0,76.34,5,507327.39


In [1363]:
# sorted by 'order_price','order_total'
df_dirty_data[['shopping_cart','order_price','delivery_charges','coupon_discount','order_total']].sort_values(ascending = False, by=['order_price','order_total']).head(40)

Unnamed: 0_level_0,shopping_cart,order_price,delivery_charges,coupon_discount,order_total
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ORD116440,"[('Lucent 330S', 1), ('iAssist Line', 1), ('Candle Inferno', 1), ('pearTV', 1)]",947691.0,90.82,10,9266.32
ORD420200,"[('Alcon 10', 1), ('Olivia x460', 1), ('pearTV', 2)]",865916.0,92.9,15,19468.65
ORD453668,"[('Candle Inferno', 1), ('Toshika 750', 1), ('Lucent 330S', 1), ('Alcon 10', 2)]",672832.0,62.82,25,17972.82
ORD122655,"[('Universe Note', 2), ('Thunder line', 1), ('Candle Inferno', 2), ('Toshika 750', 2)]",655164.0,81.7,10,16803.7
ORD057456,"[('iAssist Line', 2), ('iStream', 1), ('Alcon 10', 2)]",653959.0,66.18,5,21441.18
ORD339649,"[('Alcon 10', 1), ('Thunder line', 2), ('Universe Note', 1), ('Lucent 330S', 2)]",597240.0,79.91,5,18338.91
ORD496181,"[('iAssist Line', 1), ('Alcon 10', 1), ('Olivia x460', 1), ('Candle Inferno', 1)]",461736.0,87.31,25,9709.81
ORD249776,"[('Candle Inferno', 2), ('Lucent 330S', 2), ('pearTV', 2)]",454996.0,79.28,15,13628.28
ORD475510,"[('Alcon 10', 2), ('iAssist Line', 2)]",386776.0,59.7,10,20174.7
ORD130025,"[('Olivia x460', 1), ('Thunder line', 1), ('Lucent 330S', 2), ('Candle Inferno', 1)]",98614.0,98.48,15,5449.23


In [1351]:
# sorted by 'order_total','order_price'
df_dirty_data[['shopping_cart','order_price','delivery_charges','coupon_discount','order_total']].sort_values(ascending = False, by=['order_price','shopping_cart']).head(40)

Unnamed: 0_level_0,shopping_cart,order_price,delivery_charges,coupon_discount,order_total
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ORD116440,"[('Lucent 330S', 1), ('iAssist Line', 1), ('Candle Inferno', 1), ('pearTV', 1)]",947691.0,90.82,10,9266.32
ORD420200,"[('Alcon 10', 1), ('Olivia x460', 1), ('pearTV', 2)]",865916.0,92.9,15,19468.65
ORD453668,"[('Candle Inferno', 1), ('Toshika 750', 1), ('Lucent 330S', 1), ('Alcon 10', 2)]",672832.0,62.82,25,17972.82
ORD122655,"[('Universe Note', 2), ('Thunder line', 1), ('Candle Inferno', 2), ('Toshika 750', 2)]",655164.0,81.7,10,16803.7
ORD057456,"[('iAssist Line', 2), ('iStream', 1), ('Alcon 10', 2)]",653959.0,66.18,5,21441.18
ORD339649,"[('Alcon 10', 1), ('Thunder line', 2), ('Universe Note', 1), ('Lucent 330S', 2)]",597240.0,79.91,5,18338.91
ORD496181,"[('iAssist Line', 1), ('Alcon 10', 1), ('Olivia x460', 1), ('Candle Inferno', 1)]",461736.0,87.31,25,9709.81
ORD249776,"[('Candle Inferno', 2), ('Lucent 330S', 2), ('pearTV', 2)]",454996.0,79.28,15,13628.28
ORD475510,"[('Alcon 10', 2), ('iAssist Line', 2)]",386776.0,59.7,10,20174.7
ORD130025,"[('Olivia x460', 1), ('Thunder line', 1), ('Lucent 330S', 2), ('Candle Inferno', 1)]",98614.0,98.48,15,5449.23


## Fix missing values in df_missing_data
***

In [1376]:
# show missing values
print(df_missing_data.isna().sum())

customer_id                       0
date                              0
nearest_warehouse                 0
shopping_cart                     0
order_price                       0
delivery_charges                  0
customer_lat                     10
customer_long                    10
coupon_discount                   0
order_total                       0
season                            0
is_expedited_delivery             0
distance_to_nearest_warehouse     0
latest_customer_review            0
is_happy_customer                10
dtype: int64


### Fix missing 'nearest_warehouse' values

In [1268]:
# getting rows for the missing values for 'nearest_warehouse'
nearest_warehouse_to_update = df_missing_data[df_missing_data['nearest_warehouse'].isna()]

# using display command for better formatting in Notebook , use print usually
display(nearest_warehouse_to_update)

Unnamed: 0_level_0,customer_id,date,nearest_warehouse,shopping_cart,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,season,is_expedited_delivery,distance_to_nearest_warehouse,latest_customer_review,is_happy_customer
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ORD447851,ID0591430562,2019-02-07,,"[('Alcon 10', 2), ('Toshika 750', 2), ('Univer...",32440.0,67.27,-37.817412,144.96759,5,30885.27,Summer,False,0.2169,"five stars it was a gift, my nephew is loving it.",True
ORD256447,ID1404216319,2019-12-13,,"[('Candle Inferno', 1), ('Olivia x460', 1), ('...",6015.0,77.47,-37.801138,144.988605,25,4588.72,Summer,False,1.1454,i love this phone i love this phone. very fast...,True
ORD159597,ID2632208068,2019-08-18,,"[('Lucent 330S', 2), ('Candle Inferno', 2), ('...",9630.0,80.4,-37.819355,144.989577,10,8747.4,Winter,True,1.1544,quality love it,True
ORD474381,ID2776391626,2019-12-11,,"[('Olivia x460', 2), ('Thunder line', 2), ('Ca...",7670.0,75.06,-37.810971,144.963102,25,5827.56,Summer,False,1.0207,what's not to like? this phone does everything...,True
ORD372144,ID0814520160,2019-12-11,,"[('iAssist Line', 2), ('Olivia x460', 1), ('Al...",18945.0,96.53,-37.812604,144.957172,15,16199.78,Summer,True,0.8885,"it’s good, but it’s not compatible with my car...",True
ORD052805,ID4234925766,2019-05-19,,"[('Universe Note', 2), ('Candle Inferno', 2), ...",9940.0,68.65,-37.804575,144.967468,15,8517.65,Autumn,True,1.5714,accessories not working properly. i have used ...,False
ORD222346,ID0368376953,2019-10-22,,"[('Olivia x460', 2), ('Universe Note', 1), ('L...",8360.0,81.02,-37.802945,144.981699,0,8441.02,Spring,False,1.4257,i liked thank you but don't have headphones 🤔 ...,True
ORD229269,ID0582156569,2019-12-31,,"[('Alcon 10', 1), ('pearTV', 1), ('iAssist Lin...",21805.0,97.96,-37.788265,144.929202,25,16451.71,Summer,False,3.1388,great product. great product came as described...,True
ORD433678,ID0176736472,2019-09-04,,"[('Olivia x460', 1), ('iAssist Line', 1), ('pe...",16070.0,97.81,-37.81412,144.968115,0,16167.81,Spring,True,0.5139,amazing!,True
ORD075737,ID2702095693,2019-10-22,,"[('iAssist Line', 2), ('pearTV', 2), ('Toshika...",25710.0,73.29,-37.810628,144.990476,15,21926.79,Spring,False,0.4241,then this phone is great for drawing on the go...,True


In [1269]:
# getting index for the the missing values for 'nearest_warehouse'
nearest_warehouse_indexto_update = df_missing_data[df_missing_data['nearest_warehouse'].isna()].index
print(nearest_warehouse_indexto_update)

Index(['ORD447851', 'ORD256447', 'ORD159597', 'ORD474381', 'ORD372144',
       'ORD052805', 'ORD222346', 'ORD229269', 'ORD433678', 'ORD075737'],
      dtype='object', name='order_id')


In [1270]:
# updating missing values for 'nearest_warehouse'
for i, row in nearest_warehouse_to_update.iterrows():
    df_missing_data.at[i,'nearest_warehouse'] = get_missing_nearest_warehouse(df_missing_data.at[i,'customer_lat'],df_missing_data.at[i,'customer_long'])[0]

In [1271]:
# no more missing values for 'nearest_warehouse'
print(df_missing_data[df_missing_data['nearest_warehouse'].isna()])

Empty DataFrame
Columns: [customer_id, date, nearest_warehouse, shopping_cart, order_price, delivery_charges, customer_lat, customer_long, coupon_discount, order_total, season, is_expedited_delivery, distance_to_nearest_warehouse, latest_customer_review, is_happy_customer]
Index: []


In [1272]:
#check the indexes for the update 'nearest_warehouse' --  all done
display(df_missing_data[['nearest_warehouse']].loc[nearest_warehouse_indexto_update])

Unnamed: 0_level_0,nearest_warehouse
order_id,Unnamed: 1_level_1
ORD447851,Nickolson
ORD256447,Bakers
ORD159597,Bakers
ORD474381,Nickolson
ORD372144,Thompson
ORD052805,Nickolson
ORD222346,Bakers
ORD229269,Thompson
ORD433678,Nickolson
ORD075737,Bakers


### Fix missing 'order_price' values

In [1273]:
# getting rows for the missing values for 'order_price'
order_price_to_update = df_missing_data[df_missing_data['order_price'].isna()]

# using display command for better formatting in Notebook , use print usually
display(order_price_to_update)

Unnamed: 0_level_0,customer_id,date,nearest_warehouse,shopping_cart,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,season,is_expedited_delivery,distance_to_nearest_warehouse,latest_customer_review,is_happy_customer
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ORD309901,ID1889198159,2019-06-18,Thompson,"[('Olivia x460', 1), ('Alcon 10', 1), ('Univer...",,73.95,-37.81317,144.95144,25,10292.7,Winter,True,0.3884,spectacular but .... beautiful and very fast. ...,True
ORD375694,ID2975216733,2019-03-14,Thompson,"[('iAssist Line', 1), ('Thunder line', 2), ('A...",,68.99,-37.799322,144.959031,0,31453.99,Autumn,True,1.8209,unlock failed i can't unlock for mobile device...,False
ORD276249,ID4735909076,2019-07-07,Thompson,"[('Olivia x460', 2), ('Candle Inferno', 2), ('...",,78.24,-37.804139,144.949835,0,21288.24,Winter,True,0.9806,beautiful just like it,True
ORD052574,ID0385026713,2019-03-14,Bakers,"[('Alcon 10', 1), ('Universe Note', 2), ('Cand...",,65.54,-37.800461,144.995272,25,12598.04,Autumn,False,1.0614,great for use without data plan used it on at&...,True
ORD413785,ID3306876672,2019-11-25,Thompson,"[('Olivia x460', 2), ('pearTV', 2), ('Thunder ...",,75.91,-37.809703,144.955503,10,15600.91,Spring,False,0.8121,good. packaged well. came promptly. works as a...,True
ORD077311,ID0591400647,2019-11-30,Nickolson,"[('Universe Note', 1), ('Lucent 330S', 1)]",,99.99,-37.819764,144.961629,15,4077.99,Spring,True,0.7087,verizon? i just purchased this phone and i'm h...,True
ORD428743,ID2948774567,2019-12-23,Nickolson,"[('iAssist Line', 2), ('pearTV', 2), ('Toshika...",,76.77,-37.808345,144.971555,0,34736.77,Summer,False,1.1546,good phone good phone,True
ORD403951,ID0370750441,2019-12-06,Bakers,"[('Alcon 10', 1), ('iAssist Line', 2), ('Lucen...",,77.56,-37.814721,144.990386,10,13244.56,Summer,True,0.677,a nearly non-functional piece of junk. memory ...,False
ORD271310,ID6167344502,2019-11-09,Nickolson,"[('iAssist Line', 1), ('Lucent 330S', 2)]",,105.27,-37.80711,144.965996,5,4556.02,Spring,True,1.3162,awesome renewed this phone is great,True
ORD042295,ID0579512331,2019-09-18,Thompson,"[('Alcon 10', 2), ('Toshika 750', 2), ('Olivia...",,96.66,-37.813594,144.947873,0,32311.66,Spring,True,0.1246,more bang for the buck love the phone i think ...,True


In [1274]:
# getting index for the the missing values for 'order_price'
order_price_indexto_update = df_missing_data[df_missing_data['order_price'].isna()].index
print(order_price_indexto_update)

Index(['ORD309901', 'ORD375694', 'ORD276249', 'ORD052574', 'ORD413785',
       'ORD077311', 'ORD428743', 'ORD403951', 'ORD271310', 'ORD042295'],
      dtype='object', name='order_id')


In [1275]:
# updating missing values for 'order_price'
for i, row in order_price_to_update.iterrows():
    df_missing_data.at[i,'order_price'] = get_missing_order_price(df_missing_data.at[i,'order_total'],df_missing_data.at[i,'delivery_charges'],df_missing_data.at[i,'coupon_discount'])

In [1276]:
# no more missing values for 'order_price'
print(df_missing_data[df_missing_data['order_price'].isna()])

Empty DataFrame
Columns: [customer_id, date, nearest_warehouse, shopping_cart, order_price, delivery_charges, customer_lat, customer_long, coupon_discount, order_total, season, is_expedited_delivery, distance_to_nearest_warehouse, latest_customer_review, is_happy_customer]
Index: []


In [1277]:
#check the indexes for the update 'order_price' --  all done
display(df_missing_data[['order_price']].loc[order_price_indexto_update])

Unnamed: 0_level_0,order_price
order_id,Unnamed: 1_level_1
ORD309901,13625.0
ORD375694,31385.0
ORD276249,21210.0
ORD052574,16710.0
ORD413785,17250.0
ORD077311,4680.0
ORD428743,34660.0
ORD403951,14630.0
ORD271310,4685.0
ORD042295,32215.0


### Fix missing 'order_total' values

In [1278]:
# getting rows for the missing values for 'order_total'
order_total_to_update = df_missing_data[df_missing_data['order_total'].isna()]

# using display command for better formatting in Notebook , use print usually
display(order_total_to_update)

Unnamed: 0_level_0,customer_id,date,nearest_warehouse,shopping_cart,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,season,is_expedited_delivery,distance_to_nearest_warehouse,latest_customer_review,is_happy_customer
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ORD177154,ID1463547097,2019-06-19,Bakers,"[('Alcon 10', 1), ('iAssist Line', 1)]",11175.0,52.87,-37.818942,145.002326,15,,Winter,False,1.1751,sucks! i have had so many problems and issues ...,False
ORD467869,ID0529205664,2019-10-20,Thompson,"[('Thunder line', 1), ('Toshika 750', 2), ('Ol...",13270.0,83.84,-37.8243,144.936977,0,,Spring,False,1.5693,quality excellent product,True
ORD386228,ID4305737105,2019-10-02,Bakers,"[('Candle Inferno', 2), ('Universe Note', 2), ...",17625.0,104.54,-37.812767,145.01272,25,,Spring,True,1.5685,like it. still working discovery. really too s...,True
ORD323368,ID0746912820,2019-10-16,Nickolson,"[('Lucent 330S', 1), ('pearTV', 2), ('Olivia x...",15075.0,107.82,-37.825823,144.98481,25,,Spring,True,1.5645,five stars great phone and amazing saller jpmo...,True
ORD481503,ID0443274304,2019-12-27,Bakers,"[('Lucent 330S', 1), ('Alcon 10', 1)]",10180.0,81.29,-37.822042,145.003239,10,,Summer,False,1.5146,worth it came sooner than expected and amazing...,True
ORD355999,ID0702373553,2019-02-13,Thompson,"[('pearTV', 1), ('Universe Note', 2), ('Thunde...",15390.0,93.13,-37.8064,144.941489,5,,Summer,True,0.8535,"great, very worthy of purchase. this was a gre...",True
ORD246570,ID2686224593,2019-11-21,Bakers,"[('pearTV', 1), ('Olivia x460', 1), ('Candle I...",11415.0,73.11,-37.81135,145.000987,5,,Spring,False,0.5281,good i just bought this phone a few weeks ago ...,True
ORD109486,ID0443310353,2019-05-07,Bakers,"[('Lucent 330S', 2), ('iAssist Line', 1)]",4685.0,67.05,-37.801873,144.980179,0,,Autumn,False,1.6033,five stars great phone for a replacement and i...,True
ORD354308,ID1668523258,2019-02-18,Nickolson,"[('iStream', 2), ('Olivia x460', 1), ('Univers...",4975.0,72.85,-37.820758,144.961897,15,,Summer,False,0.7148,almost perfect. almost perfect experience. the...,True
ORD489113,ID0702365654,2019-11-05,Thompson,"[('Alcon 10', 2), ('Toshika 750', 1), ('Univer...",29120.0,99.84,-37.806367,144.941655,0,,Spring,True,0.8482,responsible 100% responsible 100%,True


In [1279]:
# getting index for the the missing values for 'order_total'
order_total_indexto_update = df_missing_data[df_missing_data['order_total'].isna()].index
print(order_total_indexto_update)

Index(['ORD177154', 'ORD467869', 'ORD386228', 'ORD323368', 'ORD481503',
       'ORD355999', 'ORD246570', 'ORD109486', 'ORD354308', 'ORD489113'],
      dtype='object', name='order_id')


In [1280]:
# updating missing values for 'order_total'
for i, row in order_total_to_update.iterrows():
    df_missing_data.at[i,'order_total'] = get_missing_order_price(df_missing_data.at[i,'order_price'],df_missing_data.at[i,'delivery_charges'],df_missing_data.at[i,'coupon_discount'])

In [1281]:
# no more missing values for 'order_total'
print(df_missing_data[df_missing_data['order_total'].isna()])

Empty DataFrame
Columns: [customer_id, date, nearest_warehouse, shopping_cart, order_price, delivery_charges, customer_lat, customer_long, coupon_discount, order_total, season, is_expedited_delivery, distance_to_nearest_warehouse, latest_customer_review, is_happy_customer]
Index: []


In [1282]:
#check the indexes for the update 'order_total' --  all done
display(df_missing_data[['order_total']].loc[order_total_indexto_update])

Unnamed: 0_level_0,order_total
order_id,Unnamed: 1_level_1
ORD177154,13084.86
ORD467869,13186.16
ORD386228,23360.61
ORD323368,19956.24
ORD481503,11220.79
ORD355999,16101.97
ORD246570,11938.83
ORD109486,4617.95
ORD354308,5767.24
ORD489113,29020.16


### Fix missing 'season' values

In [1283]:
# getting rows for the missing values for 'season'
season_to_update = df_missing_data[df_missing_data['season'].isna()]

# using display command for better formatting in Notebook , use print usually
display(season_to_update)

Unnamed: 0_level_0,customer_id,date,nearest_warehouse,shopping_cart,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,season,is_expedited_delivery,distance_to_nearest_warehouse,latest_customer_review,is_happy_customer
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ORD279446,ID0370751503,2019-03-24,Bakers,"[('Candle Inferno', 2), ('Universe Note', 1)]",4310.0,78.75,-37.812941,144.985883,15,3742.25,,True,0.8852,best bang for your buck! great overall android...,True
ORD083901,ID5457962366,2019-09-14,Thompson,"[('Toshika 750', 1), ('pearTV', 1), ('iStream'...",10780.0,78.41,-37.822479,144.939978,15,9241.41,,False,1.2571,excelent product excellent relationship qualit...,True
ORD250242,ID0287630803,2019-10-13,Thompson,"[('Olivia x460', 2), ('iStream', 2)]",2750.0,100.85,-37.806362,144.946628,5,2713.35,,True,0.7036,wtc as of right now it's more of a paper weigh...,True
ORD078069,ID1458418088,2019-01-12,Bakers,"[('Lucent 330S', 1), ('Toshika 750', 2), ('pea...",16180.0,101.81,-37.81967,145.010019,0,16281.81,,True,1.6884,excelente el producto llego en perfectas condi...,True
ORD016552,ID0283334264,2019-10-20,Thompson,"[('iAssist Line', 1), ('Universe Note', 1), ('...",10035.0,83.29,-37.80134,144.95612,25,7609.54,,False,1.4917,saves you money and trustworthy. i love my pho...,True
ORD429692,ID0575539547,2019-10-27,Nickolson,"[('iAssist Line', 1), ('Alcon 10', 2), ('pearT...",32745.0,79.86,-37.823686,144.982575,15,27913.11,,False,1.2778,overall good phone fingerprint not so good,True
ORD434639,ID1449297341,2019-06-03,Nickolson,"[('Thunder line', 1), ('iStream', 2), ('Univer...",9380.0,51.44,-37.815768,144.980178,25,7086.44,,False,0.986,doesn't work arrived with completely dead batt...,False
ORD032506,ID0628177290,2019-06-28,Thompson,"[('iStream', 2), ('Alcon 10', 2), ('Candle Inf...",18630.0,61.78,-37.811359,144.947461,5,17760.28,,False,0.1503,photo quality is amazing. the phone is fast an...,True
ORD027479,ID0493051199,2019-06-19,Bakers,"[('Candle Inferno', 2), ('Alcon 10', 1), ('iSt...",9960.0,63.76,-37.807695,144.989743,5,9525.76,,False,0.5465,perfect starter phone fantastic phone for y 11...,True
ORD437147,ID0052450505,2019-10-31,Thompson,"[('iAssist Line', 2), ('Alcon 10', 2)]",22350.0,85.96,-37.795479,144.936073,15,19083.46,,False,2.1445,this was a gift for a family member they reall...,True


In [1284]:
# getting index for the the missing values for 'order_total'
season_indexto_update = df_missing_data[df_missing_data['season'].isna()].index
print(season_indexto_update)

Index(['ORD279446', 'ORD083901', 'ORD250242', 'ORD078069', 'ORD016552',
       'ORD429692', 'ORD434639', 'ORD032506', 'ORD027479', 'ORD437147'],
      dtype='object', name='order_id')


In [1285]:
# getting the unique values for 'season'
seasons = df_missing_data['season'].sort_values().unique()
print(seasons)

['Autumn' 'Spring' 'Summer' 'Winter' nan]


In [1286]:
# get the season months from the grouping of the orders..obviously Southern hemisphere - a random lat long shows its Melbourne, Australia area
season_months = df_missing_data.groupby([pd.Grouper(key='date', axis=0, freq='M'),'season']).sum().filter(items=['date','season'])
(season_months)
# summer = months 12,1,2
# Autumn = months 3,4,5
# Winter = months 6,7,8
# Spring = months 9,10,11

date,season
2019-01-31,Summer
2019-02-28,Summer
2019-03-31,Autumn
2019-04-30,Autumn
2019-05-31,Autumn
2019-06-30,Winter
2019-07-31,Winter
2019-08-31,Winter
2019-09-30,Spring
2019-10-31,Spring


In [1287]:
# updating missing values for 'season'
for i, row in season_to_update.iterrows():
    df_missing_data.at[i,'season'] = get_missing_season(df_missing_data.at[i,'date'])

In [1288]:
# no more missing values for 'season'
print(df_missing_data[df_missing_data['season'].isna()])

Empty DataFrame
Columns: [customer_id, date, nearest_warehouse, shopping_cart, order_price, delivery_charges, customer_lat, customer_long, coupon_discount, order_total, season, is_expedited_delivery, distance_to_nearest_warehouse, latest_customer_review, is_happy_customer]
Index: []


In [1289]:
#check the indexes for the update 'season' --  all done
display(df_missing_data[['season']].loc[order_total_indexto_update])

Unnamed: 0_level_0,season
order_id,Unnamed: 1_level_1
ORD177154,Winter
ORD467869,Spring
ORD386228,Spring
ORD323368,Spring
ORD481503,Summer
ORD355999,Summer
ORD246570,Spring
ORD109486,Autumn
ORD354308,Summer
ORD489113,Spring


### Fix missing 'distance_to_nearest_warehouse' values

In [1290]:
# getting rows for the missing values for 'distance_to_nearest_warehouse'
distance_to_nearest_warehouse_to_update = df_missing_data[df_missing_data['distance_to_nearest_warehouse'].isna()]

# using display command for better formatting in Notebook , use print usually
display(distance_to_nearest_warehouse_to_update)

Unnamed: 0_level_0,customer_id,date,nearest_warehouse,shopping_cart,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,season,is_expedited_delivery,distance_to_nearest_warehouse,latest_customer_review,is_happy_customer
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ORD265586,ID2189485869,2019-03-21,Nickolson,"[('iAssist Line', 1), ('Olivia x460', 1)]",3450.0,66.56,-37.81115,144.972957,5,3344.06,Autumn,False,,everything i expected overall i love the phone...,True
ORD224296,ID0441586985,2019-01-04,Thompson,"[('iAssist Line', 2), ('Olivia x460', 2), ('Un...",22440.0,74.73,-37.813423,144.938133,0,22514.73,Summer,False,,refurbished or used?? a good phone for the pri...,True
ORD247311,ID3146808067,2019-05-10,Bakers,"[('Toshika 750', 1), ('pearTV', 2), ('iAssist ...",21345.0,77.04,-37.814703,144.990221,0,21422.04,Autumn,True,,telephone definitely was worth the money.i qou...,True
ORD465260,ID0248747000,2019-03-11,Bakers,"[('pearTV', 1), ('Candle Inferno', 2), ('iAssi...",9395.0,66.56,-37.820683,144.993264,25,7112.81,Autumn,False,,"nice phone nice product, looks like new. batte...",True
ORD160120,ID1497170573,2019-05-25,Bakers,"[('Toshika 750', 2), ('iAssist Line', 2)]",13090.0,53.95,-37.823615,144.990227,25,9871.45,Autumn,False,,hot trash! this phone is horrible do not buy! ...,False
ORD174660,ID6207067487,2019-03-10,Nickolson,"[('Toshika 750', 1), ('pearTV', 1), ('Universe...",14080.0,69.4,-37.802712,144.965156,10,12741.4,Autumn,False,,came in mint condition maybe one scratch on it...,True
ORD048679,ID0575524447,2019-07-27,Nickolson,"[('iStream', 1), ('Toshika 750', 2)]",8790.0,67.67,-37.821312,144.983293,5,8418.17,Winter,False,,good,True
ORD392649,ID0579988733,2019-05-11,Thompson,"[('Alcon 10', 1), ('Candle Inferno', 1), ('Tos...",18320.0,51.34,-37.807869,144.94198,25,13791.34,Autumn,False,,one star i got light blub instead of a phone a...,False
ORD137128,ID1224804764,2019-02-19,Thompson,"[('iAssist Line', 2), ('Toshika 750', 2), ('Th...",16130.0,57.79,-37.813324,144.938427,5,15381.29,Summer,False,,"not good it was a used phone , scratch , apps ...",False
ORD132417,ID0579498247,2019-11-29,Thompson,"[('Candle Inferno', 2), ('pearTV', 2), ('Unive...",25880.0,98.32,-37.814614,144.949895,10,23390.32,Spring,True,,"solid phone for the $$$ great phone, can't bel...",True


In [1291]:
# getting index for the the missing values for 'distance_to_nearest_warehouse'
distance_to_nearest_warehouse_indexto_update = df_missing_data[df_missing_data['distance_to_nearest_warehouse'].isna()].index
print(distance_to_nearest_warehouse_indexto_update)

Index(['ORD265586', 'ORD224296', 'ORD247311', 'ORD465260', 'ORD160120',
       'ORD174660', 'ORD048679', 'ORD392649', 'ORD137128', 'ORD132417'],
      dtype='object', name='order_id')


In [1292]:
# updating  missing values for 'nearest_warehouse'
# for i in distance_to_nearest_warehouse_indexto_update:
# df_missing_data.at[i,'distance_to_nearest_warehouse'] = get_missing_nearest_warehouse(df_missing_data.at[i,'customer_lat'],df_missing_data.at[i,'customer_long'])[1]

# updating missing values for 'distance_to_nearest_warehouse'
for i, row in distance_to_nearest_warehouse_to_update.iterrows():
    df_missing_data.at[i,'distance_to_nearest_warehouse'] = get_missing_nearest_warehouse(df_missing_data.at[i,'customer_lat'],df_missing_data.at[i,'customer_long'])[1]

In [1293]:
# no more missing values for 'distance_to_nearest_warehouse'
print(df_missing_data[df_missing_data['distance_to_nearest_warehouse'].isna()])

Empty DataFrame
Columns: [customer_id, date, nearest_warehouse, shopping_cart, order_price, delivery_charges, customer_lat, customer_long, coupon_discount, order_total, season, is_expedited_delivery, distance_to_nearest_warehouse, latest_customer_review, is_happy_customer]
Index: []


In [1294]:
#check the indexes for the update 'season' --  all done
display(df_missing_data[['distance_to_nearest_warehouse']].loc[order_total_indexto_update])

Unnamed: 0_level_0,distance_to_nearest_warehouse
order_id,Unnamed: 1_level_1
ORD177154,1.1751
ORD467869,1.5693
ORD386228,1.5685
ORD323368,1.5645
ORD481503,1.5146
ORD355999,0.8535
ORD246570,0.5281
ORD109486,1.6033
ORD354308,0.7148
ORD489113,0.8482


### Fix missing 'customer_lat' values

In [1295]:
# show missing 'customer_lat' rows
df_missing_data[df_missing_data['customer_lat'].isna()]

Unnamed: 0_level_0,customer_id,date,nearest_warehouse,shopping_cart,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,season,is_expedited_delivery,distance_to_nearest_warehouse,latest_customer_review,is_happy_customer
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ORD006145,ID0373644746,2019-05-14,Thompson,"[('Alcon 10', 2), ('Candle Inferno', 2), ('Tos...",23230.0,71.31,,144.921217,0,23301.31,Autumn,True,2.3245,had to buy it myself and i got know money back...,False
ORD225147,ID0660546021,2019-07-04,Bakers,"[('pearTV', 1), ('Universe Note', 2), ('Lucent...",18760.0,59.27,,144.990883,15,16005.27,Winter,True,0.443,deceptive ad wrong items,False
ORD387776,ID0289602641,2019-07-06,Thompson,"[('Lucent 330S', 1), ('pearTV', 1)]",7540.0,62.35,,144.950976,10,6848.35,Winter,False,0.4297,good quality product. clear display and easy t...,True
ORD008298,ID2383215099,2019-01-01,Thompson,"[('Olivia x460', 1), ('Alcon 10', 1)]",10175.0,78.57,,144.958015,25,7709.82,Summer,False,1.1923,five stars great product and fast shipping!,True
ORD417492,ID4544035096,2019-12-16,Nickolson,"[('Alcon 10', 2), ('Thunder line', 2), ('Toshi...",31330.0,73.47,,144.963861,15,26703.97,Summer,False,0.9011,really great quality! getting another for myself!,True
ORD296102,ID0114607245,2019-03-14,Nickolson,"[('Thunder line', 2), ('iStream', 1), ('Alcon ...",17910.0,63.16,,144.965071,5,17077.66,Autumn,False,0.4306,great phone but still not super fast this is o...,True
ORD452316,ID0268533994,2019-03-07,Nickolson,"[('Olivia x460', 2), ('Universe Note', 2), ('p...",21970.0,64.95,,144.978023,10,19837.95,Autumn,False,1.1201,five stars good phone. i have had no issues wi...,True
ORD285564,ID0576824418,2019-11-19,Nickolson,"[('Toshika 750', 2), ('iAssist Line', 2)]",13090.0,78.7,,144.980934,0,13168.7,Spring,False,1.0095,"easy to use, work's perfect with the usa netwo...",True
ORD297989,ID0441996141,2019-11-03,Bakers,"[('Toshika 750', 2), ('Universe Note', 1), ('C...",15410.0,103.63,,145.001089,15,13202.13,Spring,True,1.2009,five stars phone worked great no problems.,True
ORD157248,ID0634784116,2019-12-09,Thompson,"[('Alcon 10', 1), ('Thunder line', 1)]",11130.0,73.4,,144.937599,0,11203.4,Summer,False,0.8391,"five stars works great! easy to use, perfect f...",True


In [1296]:
# getting the index for the missing values for customer_lat
customer_lat_indexto_update = df_missing_data[df_missing_data['customer_lat'].isna()].index
customer_lat_indexto_update

Index(['ORD006145', 'ORD225147', 'ORD387776', 'ORD008298', 'ORD417492',
       'ORD296102', 'ORD452316', 'ORD285564', 'ORD297989', 'ORD157248'],
      dtype='object', name='order_id')

In [1297]:
df_missing_data[df_missing_data.duplicated(['customer_id'])]

Unnamed: 0_level_0,customer_id,date,nearest_warehouse,shopping_cart,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,season,is_expedited_delivery,distance_to_nearest_warehouse,latest_customer_review,is_happy_customer
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ORD329262,ID1492175313,2019-03-17,Thompson,"[('Thunder line', 2), ('Candle Inferno', 1)]",4790.0,68.21,-37.802653,144.963491,5,4618.71,Autumn,True,1.8248,one star im in the process of returning it bec...,False
ORD129280,ID0660546021,2019-07-12,Bakers,"[('Toshika 750', 2), ('iStream', 2)]",8940.0,60.33,-37.812003,144.990883,25,6765.33,Winter,True,0.443,selling lost/stolen items i received this toda...,False
ORD029359,ID0305909619,2019-04-28,Bakers,"[('Alcon 10', 1), ('Thunder line', 2)]",13310.0,70.42,-37.802017,145.006551,5,12714.92,Autumn,False,1.3342,works great good price work great,True
ORD380695,ID0052450505,2019-09-06,Thompson,"[('Lucent 330S', 1), ('Olivia x460', 2)]",3680.0,86.92,-37.795479,144.936073,0,3766.92,Spring,False,2.1445,perfectly as described perfectly as described,True
ORD273851,ID0844490198,2019-01-13,Thompson,"[('Universe Note', 1), ('iStream', 2), ('Candl...",13130.0,94.86,-37.822222,144.949747,15,11255.36,Summer,True,1.0887,concern about cosmetic condition of phone was ...,True


### Fix missing 'customer_long' values

In [1298]:
# show missing 'customer_long' rows
df_missing_data[df_missing_data['customer_long'].isna()]

Unnamed: 0_level_0,customer_id,date,nearest_warehouse,shopping_cart,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,season,is_expedited_delivery,distance_to_nearest_warehouse,latest_customer_review,is_happy_customer
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ORD131598,ID2759826600,2019-06-04,Nickolson,"[('Thunder line', 1), ('Olivia x460', 2), ('Lu...",7950.0,63.85,-37.810558,,0,8013.85,Winter,True,1.1134,the phone arrived locked. after all the hassle...,False
ORD287765,ID0257505383,2019-11-06,Thompson,"[('Olivia x460', 1), ('Thunder line', 2), ('Al...",23485.0,96.76,-37.799204,,25,17710.51,Spring,True,2.06,this phone sucks the wifi and the bluetooth di...,False
ORD159527,ID4283908181,2019-04-17,Bakers,"[('Universe Note', 1), ('iStream', 2), ('pearT...",16370.0,81.92,-37.811419,,15,13996.42,Autumn,True,1.2881,"great phone great phone still, but i'm a nerd.",True
ORD052599,ID4520380532,2019-01-16,Thompson,"[('pearTV', 2), ('Universe Note', 2)]",19520.0,93.58,-37.815708,,25,14733.58,Summer,True,1.0381,ideal for landscapers this is perfect for anyo...,True
ORD015960,ID0247024616,2019-05-19,Nickolson,"[('Thunder line', 1), ('iStream', 1), ('Lucent...",11690.0,63.78,-37.815455,,5,11169.28,Autumn,False,0.3585,nice cell phone... great value! so far so good...,True
ORD466918,ID0049489089,2019-08-27,Nickolson,"[('Thunder line', 2), ('Alcon 10', 1), ('iStre...",13610.0,63.43,-37.820885,,10,12312.43,Winter,False,0.7075,i couldn't be more pleased. i've only had this...,True
ORD125582,ID4315827380,2019-04-05,Bakers,"[('Thunder line', 1), ('Candle Inferno', 2), (...",4270.0,52.19,-37.822651,,0,4322.19,Autumn,False,1.4366,this phone would never update fully. constantl...,False
ORD305340,ID3810637333,2019-06-05,Thompson,"[('Alcon 10', 1), ('Lucent 330S', 1)]",10180.0,67.34,-37.820684,,10,9229.34,Winter,False,1.023,omg i'm in love with this phone omg i'm in lov...,True
ORD166029,ID0373635383,2019-04-09,Thompson,"[('iStream', 2), ('Alcon 10', 2), ('Olivia x46...",23110.0,67.12,-37.822575,,5,22021.62,Autumn,False,1.2819,good so far so good,True
ORD487171,ID0595594352,2019-10-26,Nickolson,"[('iAssist Line', 2), ('pearTV', 1)]",10760.0,102.05,-37.810906,,25,8172.05,Spring,True,0.8814,purchased for my granddaughter and it was a gr...,True


In [1299]:
# getting the index for the missing values for customer_long
customer_long_indexto_update = df_missing_data[df_missing_data['customer_long'].isna()].index
customer_long_indexto_update

Index(['ORD131598', 'ORD287765', 'ORD159527', 'ORD052599', 'ORD015960',
       'ORD466918', 'ORD125582', 'ORD305340', 'ORD166029', 'ORD487171'],
      dtype='object', name='order_id')

In [1300]:
# show missing values
print(df_missing_data.isna().sum())

customer_id                       0
date                              0
nearest_warehouse                 0
shopping_cart                     0
order_price                       0
delivery_charges                  0
customer_lat                     10
customer_long                    10
coupon_discount                   0
order_total                       0
season                            0
is_expedited_delivery             0
distance_to_nearest_warehouse     0
latest_customer_review            0
is_happy_customer                10
dtype: int64


In [None]:
## Merge Dataframes

In [None]:
df_missing_dirty = df_missing_data.merge(df_dirty_data, on=)

## Analyse
***

In [1301]:
print(df_missing_data.columns)

Index(['customer_id', 'date', 'nearest_warehouse', 'shopping_cart',
       'order_price', 'delivery_charges', 'customer_lat', 'customer_long',
       'coupon_discount', 'order_total', 'season', 'is_expedited_delivery',
       'distance_to_nearest_warehouse', 'latest_customer_review',
       'is_happy_customer'],
      dtype='object')


## Visualise
***