# Kieran Molloy UCDPA Project 
### Course CIDAB 2022-01-18
***

### Environment Information
<table align="left">
<tr>
    <th>Environment Type</th>
    <th>Anaconda Version</th>
    <th>Anaconda Build Channel</th>
    <th>Python Version</th>
</tr>
<tr>
    <td>Anaconda </td>
    <td>2021.11 </td>
    <td>py39_0 </td>
    <td>3.9.7 </td>
</tr>
</table>

## Links to Kaggle Datasource: 
***

#### _[Transactional Retail Dataset of Electronics Store](https://www.kaggle.com/datasets/muhammadshahrayar/transactional-retail-dataset-of-electronics-store)_

*  _[dirty_data.csv](https://www.kaggle.com/datasets/muhammadshahrayar/transactional-retail-dataset-of-electronics-store?select=dirty_data.csv)_
*  _[missing_data.csv](https://www.kaggle.com/datasets/muhammadshahrayar/transactional-retail-dataset-of-electronics-store?select=missing_data.csv)_
*  _[warehouses.csv](https://www.kaggle.com/datasets/muhammadshahrayar/transactional-retail-dataset-of-electronics-store?select=warehouses.csv)_


## Install Modules

In [910]:
# %pip install geopy

## Import Modules
***

In [911]:
from matplotlib import pyplot as plt
from IPython.display import display 

import pandas as pd
import numpy as np
import seaborn as sns

from datetime import datetime

from geopy.distance import geodesic

## Set Variables
***

In [912]:
dirty_data = 'dirty_data.csv'
missing_data = 'missing_data.csv'
warehouses = 'warehouses.csv'

## Functions
***

In [913]:
def get_missing_nearest_warehouse(lat , Long ):
    ''' use coordinates to find the closest warehouse (clrow flies)'''
    
    # customer coordinates are provided
    coords_customer = lat, Long
    
    # get warehouse latitude and longitide from warehouse table
    coords_Nickolson = df_warehouses['lat'].loc[df_warehouses.index[0]], df_warehouses['lon'].loc[df_warehouses.index[0]]
    coords_Thompson = df_warehouses['lat'].loc[df_warehouses.index[1]], df_warehouses['lon'].loc[df_warehouses.index[1]]
    coords_Bakers = df_warehouses['lat'].loc[df_warehouses.index[2]], df_warehouses['lon'].loc[df_warehouses.index[2]]
    
    # use geodesic to do a km distance comparison
    Nickolson_cust_dist = geodesic(coords_customer, coords_Nickolson).km
    Thompson_cust_dist = geodesic(coords_customer, coords_Thompson).km
    Bakers_cust_dist = geodesic(coords_customer, coords_Bakers).km
    
    val = (Nickolson_cust_dist ,Thompson_cust_dist ,Bakers_cust_dist)
    
    x = val.index(min(val))
    
    if x == 0:
        warehouse = 'Nickolson'
        distance = Nickolson_cust_dist
        return(warehouse , distance)
    elif x == 1:
        warehouse = 'Thompson'
        distance = Nickolson_cust_dist
        return(warehouse , distance)
    else: 
        warehouse = 'Bakers'
        distance = Nickolson_cust_dist
        return(str(warehouse) , distance)

In [914]:
def get_missing_order_price(order_total, delivery_charges, coupon_discount):
    ''' calculate order_price from order_total, delivery_charges and coupon_discount '''
    order_price = (order_total - delivery_charges) / (100-coupon_discount) * 100
    return(round(order_price, 2))

In [915]:
def get_missing_order_total(order_price, delivery_charges, coupon_discount):
    ''' calculate order_total from order_price, delivery_charges and coupon_discount'''
    order_total = order_price - (order_price / 100 * coupon_discount) + delivery_charges
    return(round(order_total, 2))

In [916]:
def get_missing_season(date):
    '''calculate season from date'''
    # Spring = months 9,10,11
    # summer = months 12,1,2
    # Autumn = months 3,4,5
    # Winter = months 6,7,8

    month = date.month

    if month in [9,10,11]:
        season = 'Spring'
    elif month in [12,1,2]:
        season = 'Summer'
    elif month in [3,4,5]:
        season = 'Autumn'
    else:
        season = 'Winter'
    
    return(season)

## CSV import 
***

In [917]:
# read the CSV from the local Jupyter Notebook directory 
df_dirty_data = pd.read_csv(dirty_data)
df_missing_data = pd.read_csv(missing_data)
df_warehouses = pd.read_csv(warehouses)

## Initial CSV Review
***

### df_dirty_data dataframe review

In [918]:
type(df_dirty_data)

pandas.core.frame.DataFrame

In [919]:
print(df_dirty_data.columns)

Index(['order_id', 'customer_id', 'date', 'nearest_warehouse', 'shopping_cart',
       'order_price', 'delivery_charges', 'customer_lat', 'customer_long',
       'coupon_discount', 'order_total', 'season', 'is_expedited_delivery',
       'distance_to_nearest_warehouse', 'latest_customer_review',
       'is_happy_customer'],
      dtype='object')


In [920]:
df_dirty_data.head(10)

Unnamed: 0,order_id,customer_id,date,nearest_warehouse,shopping_cart,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,season,is_expedited_delivery,distance_to_nearest_warehouse,latest_customer_review,is_happy_customer
0,ORD182494,ID6197211592,2019-06-22,Thompson,"[('Lucent 330S', 1), ('Thunder line', 2), ('iS...",12200,79.89,-37.815105,144.932843,10,11059.89,Winter,True,1.28,perfect phone and trusted seller. phone itself...,True
1,ORD395518,ID0282825849,2019-12-29,Thompson,"[('Thunder line', 1), ('Universe Note', 2)]",9080,62.71,-37.802736,144.951118,0,9142.71,Summer,False,1.1621,it keeps dropping calls the wifi don't work th...,False
2,ORD494479,ID0579391891,2019-03-02,Nickolson,"[('Thunder line', 1), ('pearTV', 2)]",10670,65.87,-37.821302,144.957581,10,9668.87,Autumn,False,1.0949,five stars this is a great cheap phone.,True
3,ORD019224,ID4544561904,2019-01-12,Nickolson,"[('Universe Note', 1), ('Alcon 10', 2), ('Oliv...",24800,57.61,-37.811416,144.973073,15,21137.61,Summer,False,0.8571,charger did not fit the charger didn't fit.,False
4,ORD104032,ID6231506320,2019-11-28,Nickolson,"[('Universe Note', 1), ('Olivia x460', 1), ('i...",9145,75.54,37.823859,144.969892,25,6934.29,Spring,False,0.5867,four stars good,True
5,ORD146760,ID0311654900,2019-09-16,Bakers,"[('Thunder line', 2), ('Universe Note', 1)]",7810,71.22,37.820245,145.014944,10,7100.22,Spring,False,2.0752,stolen phone sold us a stolen phone so we coul...,False
6,ORD337984,ID3394768956,2019-09-14,Thompson,"[('Candle Inferno', 1), ('Alcon 10', 1), ('Tos...",13700,74.84,-37.807743,144.95157,5,13089.84,Spring,False,0.6767,"love our inferno stick,easy to set up and have...",True
7,ORD072312,ID0774517121,2019-05-23,Thompson,"[('Universe Note', 1), ('Thunder line', 2), ('...",7960,52.28,-37.806337,144.959544,5,10789.79,Autumn,False,1.3043,it sucks mine came with dead pixels,False
8,ORD377837,ID4769265355,2019-10-09,Bakers,"[('Alcon 10', 2), ('Thunder line', 1), ('Candl...",25390,107.58,-37.81081,145.014073,10,22958.58,Spring,True,1.6595,this is how top phone should look like! super ...,True
9,ORD462194,ID5301568579,2019-03-21,Thompson,"[('Universe Note', 1), ('Lucent 330S', 1), ('T...",13320,62.26,-37.808675,144.942337,15,11384.26,winter,True,0.6093,does not live up to its reputation. customer s...,False


In [921]:
# using display for better formatting in Notebook , use print usually
display(df_dirty_data.describe())

Unnamed: 0,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,distance_to_nearest_warehouse
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,25522.216,76.6582,-35.835234,144.969494,10.89,39209.67,2.204224
std,86333.729169,14.481465,12.045393,0.02272,8.649134,274194.0,8.812416
min,585.0,46.35,-37.827123,144.924967,0.0,639.29,0.1078
25%,7050.0,65.9825,-37.818222,144.953488,5.0,6454.735,0.751425
50%,12807.5,76.31,-37.812165,144.965357,10.0,11293.96,1.0301
75%,20360.0,82.555,-37.805364,144.983985,15.0,18119.19,1.408625
max,947691.0,114.04,37.826339,145.019837,25.0,5688270.0,94.9734


In [922]:
print(df_dirty_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   order_id                       500 non-null    object 
 1   customer_id                    500 non-null    object 
 2   date                           500 non-null    object 
 3   nearest_warehouse              500 non-null    object 
 4   shopping_cart                  500 non-null    object 
 5   order_price                    500 non-null    int64  
 6   delivery_charges               500 non-null    float64
 7   customer_lat                   500 non-null    float64
 8   customer_long                  500 non-null    float64
 9   coupon_discount                500 non-null    int64  
 10  order_total                    500 non-null    float64
 11  season                         500 non-null    object 
 12  is_expedited_delivery          500 non-null    boo

In [923]:
print(df_dirty_data.shape)

(500, 16)


In [924]:
print(df_dirty_data.isna().any())

order_id                         False
customer_id                      False
date                             False
nearest_warehouse                False
shopping_cart                    False
order_price                      False
delivery_charges                 False
customer_lat                     False
customer_long                    False
coupon_discount                  False
order_total                      False
season                           False
is_expedited_delivery            False
distance_to_nearest_warehouse    False
latest_customer_review           False
is_happy_customer                False
dtype: bool


In [925]:
print(df_dirty_data.isnull().sum())

order_id                         0
customer_id                      0
date                             0
nearest_warehouse                0
shopping_cart                    0
order_price                      0
delivery_charges                 0
customer_lat                     0
customer_long                    0
coupon_discount                  0
order_total                      0
season                           0
is_expedited_delivery            0
distance_to_nearest_warehouse    0
latest_customer_review           0
is_happy_customer                0
dtype: int64


### df_missing_data dataframe review

In [926]:
type(df_missing_data)

pandas.core.frame.DataFrame

In [927]:
print(df_missing_data.columns)

Index(['order_id', 'customer_id', 'date', 'nearest_warehouse', 'shopping_cart',
       'order_price', 'delivery_charges', 'customer_lat', 'customer_long',
       'coupon_discount', 'order_total', 'season', 'is_expedited_delivery',
       'distance_to_nearest_warehouse', 'latest_customer_review',
       'is_happy_customer'],
      dtype='object')


In [928]:
df_missing_data.head(10)

Unnamed: 0,order_id,customer_id,date,nearest_warehouse,shopping_cart,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,season,is_expedited_delivery,distance_to_nearest_warehouse,latest_customer_review,is_happy_customer
0,ORD382112,ID0289597187,2019-03-06,Thompson,"[('pearTV', 1), ('Candle Inferno', 2)]",7170.0,61.72,-37.811782,144.951972,15,6156.22,Autumn,True,0.4425,"to many adds in ui it has adds, sends info to ...",False
1,ORD378488,ID1668523020,2019-05-05,Thompson,"[('Thunder line', 1), ('Lucent 330S', 2), ('Al...",13590.0,65.2,-37.808236,144.942181,10,12296.2,Autumn,False,0.6547,s10e haven't had this phone but a short time b...,True
2,ORD279446,ID0370751503,2019-03-24,Bakers,"[('Candle Inferno', 2), ('Universe Note', 1)]",4310.0,78.75,-37.812941,144.985883,15,3742.25,,True,0.8852,best bang for your buck! great overall android...,True
3,ORD277196,ID0634774947,2019-01-12,Thompson,"[('pearTV', 1), ('Thunder line', 1), ('Olivia ...",10940.0,92.59,-37.811748,144.938192,25,8297.59,Summer,True,0.7875,good it was a gift for my niece. it to her in ...,True
4,ORD116193,ID3313210924,2019-09-02,Nickolson,"[('pearTV', 2), ('Universe Note', 2), ('Thunde...",21700.0,95.62,-37.818393,144.967034,5,20710.62,Spring,True,0.2225,"excellent product excellent product, was recei...",True
5,ORD005004,ID0472236192,2019-05-07,Nickolson,"[('Thunder line', 2), ('Lucent 330S', 2)]",6820.0,84.09,-37.801502,144.966104,0,6904.09,Autumn,True,1.9268,i love it very nice and good product would rec...,True
6,ORD296379,ID0591306178,2019-01-19,Nickolson,"[('Thunder line', 1), ('pearTV', 1)]",8490.0,77.51,-37.8237,144.956314,0,8567.51,Summer,False,1.2953,great phone great phone i am still using that ...,True
7,ORD447851,ID0591430562,2019-02-07,,"[('Alcon 10', 2), ('Toshika 750', 2), ('Univer...",32440.0,67.27,-37.817412,144.96759,5,30885.27,Summer,False,0.2169,"five stars it was a gift, my nephew is loving it.",True
8,ORD078449,ID0030287324,2019-10-20,Nickolson,"[('Olivia x460', 2), ('iAssist Line', 2)]",6900.0,105.5,-37.807081,144.967288,15,5970.5,Spring,True,1.2971,definitively would buy again. has been great t...,True
9,ORD018258,ID0245537598,2019-05-27,Nickolson,"[('Universe Note', 1), ('Lucent 330S', 1), ('i...",7205.0,65.39,-37.810164,144.966819,15,6189.64,Autumn,False,0.9688,my best snapchat phone my snapchats are crysta...,True


In [929]:
# using display for better formatting in Notebook , use print usually
display(df_missing_data.describe())

Unnamed: 0,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,distance_to_nearest_warehouse
count,490.0,500.0,490.0,490.0,500.0,490.0,490.0
mean,13217.867347,77.68338,-37.812597,144.966114,11.19,11939.84102,1.077335
std,7552.951453,14.566667,0.007593,0.021054,8.734213,7032.605471,0.509914
min,580.0,46.2,-37.828216,144.921217,0.0,568.64,0.0549
25%,7140.0,66.975,-37.818755,144.951541,5.0,6486.5425,0.723625
50%,12180.0,77.275,-37.813053,144.963409,10.0,10756.615,1.04335
75%,18327.5,85.2175,-37.806714,144.980157,15.0,16315.555,1.389525
max,37300.0,110.99,-37.788265,145.017015,25.0,37362.47,3.1388


In [930]:
print(df_missing_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   order_id                       500 non-null    object 
 1   customer_id                    500 non-null    object 
 2   date                           500 non-null    object 
 3   nearest_warehouse              490 non-null    object 
 4   shopping_cart                  500 non-null    object 
 5   order_price                    490 non-null    float64
 6   delivery_charges               500 non-null    float64
 7   customer_lat                   490 non-null    float64
 8   customer_long                  490 non-null    float64
 9   coupon_discount                500 non-null    int64  
 10  order_total                    490 non-null    float64
 11  season                         490 non-null    object 
 12  is_expedited_delivery          500 non-null    boo

In [931]:
print(df_missing_data.shape)

(500, 16)


In [932]:
print(df_missing_data.isna().any())

order_id                         False
customer_id                      False
date                             False
nearest_warehouse                 True
shopping_cart                    False
order_price                       True
delivery_charges                 False
customer_lat                      True
customer_long                     True
coupon_discount                  False
order_total                       True
season                            True
is_expedited_delivery            False
distance_to_nearest_warehouse     True
latest_customer_review           False
is_happy_customer                 True
dtype: bool


In [933]:
print(df_missing_data.isnull().sum())

order_id                          0
customer_id                       0
date                              0
nearest_warehouse                10
shopping_cart                     0
order_price                      10
delivery_charges                  0
customer_lat                     10
customer_long                    10
coupon_discount                   0
order_total                      10
season                           10
is_expedited_delivery             0
distance_to_nearest_warehouse    10
latest_customer_review            0
is_happy_customer                10
dtype: int64


### df_warehouses dataframe review

In [934]:
type(df_warehouses)

pandas.core.frame.DataFrame

In [935]:
print(df_warehouses.columns)

Index(['names', 'lat', 'lon'], dtype='object')


In [936]:
type(df_warehouses)

pandas.core.frame.DataFrame

In [937]:
df_warehouses.head()

Unnamed: 0,names,lat,lon
0,Nickolson,-37.818595,144.969551
1,Thompson,-37.812673,144.947069
2,Bakers,-37.809996,144.995232


In [938]:
# using display for better formatting in Notebook , use print usually
display(df_warehouses.describe())

Unnamed: 0,lat,lon
count,3.0,3.0
mean,-37.813755,144.970617
std,0.0044,0.024099
min,-37.818595,144.947069
25%,-37.815634,144.95831
50%,-37.812673,144.969551
75%,-37.811335,144.982392
max,-37.809996,144.995232


In [939]:
print(df_warehouses.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   names   3 non-null      object 
 1   lat     3 non-null      float64
 2   lon     3 non-null      float64
dtypes: float64(2), object(1)
memory usage: 200.0+ bytes
None


In [940]:
print(df_warehouses.shape)

(3, 3)


In [941]:
print(df_warehouses.isna().any())

names    False
lat      False
lon      False
dtype: bool


In [942]:
print(df_warehouses.isnull().sum())

names    0
lat      0
lon      0
dtype: int64


## Re-import from CSV 
***

In [943]:
# read the CSV from the local Jupyter Notebook directory 
# parsing dates on import since dates were objects change datatype on order_price
df_dirty_data = pd.read_csv(dirty_data, parse_dates=['date'], dtype={'order_price': 'float'}) 
df_missing_data = pd.read_csv(missing_data, parse_dates=['date']) 

#  cant use dtype={'is_happy_customer': 'bool'} because is_happy_customer contains NaN 

In [944]:
print(df_dirty_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       500 non-null    object        
 1   customer_id                    500 non-null    object        
 2   date                           500 non-null    datetime64[ns]
 3   nearest_warehouse              500 non-null    object        
 4   shopping_cart                  500 non-null    object        
 5   order_price                    500 non-null    float64       
 6   delivery_charges               500 non-null    float64       
 7   customer_lat                   500 non-null    float64       
 8   customer_long                  500 non-null    float64       
 9   coupon_discount                500 non-null    int64         
 10  order_total                    500 non-null    float64       
 11  season             

In [945]:
print(df_missing_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       500 non-null    object        
 1   customer_id                    500 non-null    object        
 2   date                           500 non-null    datetime64[ns]
 3   nearest_warehouse              490 non-null    object        
 4   shopping_cart                  500 non-null    object        
 5   order_price                    490 non-null    float64       
 6   delivery_charges               500 non-null    float64       
 7   customer_lat                   490 non-null    float64       
 8   customer_long                  490 non-null    float64       
 9   coupon_discount                500 non-null    int64         
 10  order_total                    490 non-null    float64       
 11  season             

In [946]:
# check for orderid uniqueness in df_dirty_data
duplicate_dirty_data = df_dirty_data[df_dirty_data.duplicated(['order_id'])]
print(duplicate_dirty_data)

Empty DataFrame
Columns: [order_id, customer_id, date, nearest_warehouse, shopping_cart, order_price, delivery_charges, customer_lat, customer_long, coupon_discount, order_total, season, is_expedited_delivery, distance_to_nearest_warehouse, latest_customer_review, is_happy_customer]
Index: []


In [947]:
# check for orderid uniqueness in df_missing_data
duplicate_missing_data = df_missing_data[df_missing_data.duplicated(['order_id'])]
print(duplicate_missing_data)

Empty DataFrame
Columns: [order_id, customer_id, date, nearest_warehouse, shopping_cart, order_price, delivery_charges, customer_lat, customer_long, coupon_discount, order_total, season, is_expedited_delivery, distance_to_nearest_warehouse, latest_customer_review, is_happy_customer]
Index: []


In [948]:
# create an order_id index on df_dirty_data
df_dirty_data.set_index('order_id',inplace=True)

In [949]:
# create an order_id index on df_missing_data
df_missing_data.set_index('order_id',inplace=True)

In [950]:
df_dirty_data.index

Index(['ORD182494', 'ORD395518', 'ORD494479', 'ORD019224', 'ORD104032',
       'ORD146760', 'ORD337984', 'ORD072312', 'ORD377837', 'ORD462194',
       ...
       'ORD182549', 'ORD435271', 'ORD082002', 'ORD034351', 'ORD379700',
       'ORD475510', 'ORD086060', 'ORD079320', 'ORD026546', 'ORD085447'],
      dtype='object', name='order_id', length=500)

In [951]:
df_missing_data.index

Index(['ORD382112', 'ORD378488', 'ORD279446', 'ORD277196', 'ORD116193',
       'ORD005004', 'ORD296379', 'ORD447851', 'ORD078449', 'ORD018258',
       ...
       'ORD273851', 'ORD329425', 'ORD277938', 'ORD297371', 'ORD218866',
       'ORD289820', 'ORD425999', 'ORD252675', 'ORD215989', 'ORD414852'],
      dtype='object', name='order_id', length=500)

## Detect and fix errors in dirty_data
***

## Fix missing values in missing_data
***

In [907]:
# show missing values
print(df_missing_data.isnull().sum())

customer_id                       0
date                              0
nearest_warehouse                 0
shopping_cart                     0
order_price                      10
delivery_charges                  0
customer_lat                     10
customer_long                    10
coupon_discount                   0
order_total                      10
season                           10
is_expedited_delivery             0
distance_to_nearest_warehouse    10
latest_customer_review            0
is_happy_customer                10
dtype: int64


### missing 'nearest_warehouse'

In [952]:
# getting rows for the missing values for nearest_warehouse
nearest_warehouse_to_update = df_missing_data[df_missing_data['nearest_warehouse'].isnull()]

# using display command for better formatting in Notebook , use print usually
display(nearest_warehouse_to_update)

Unnamed: 0_level_0,customer_id,date,nearest_warehouse,shopping_cart,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,season,is_expedited_delivery,distance_to_nearest_warehouse,latest_customer_review,is_happy_customer
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ORD447851,ID0591430562,2019-02-07,,"[('Alcon 10', 2), ('Toshika 750', 2), ('Univer...",32440.0,67.27,-37.817412,144.96759,5,30885.27,Summer,False,0.2169,"five stars it was a gift, my nephew is loving it.",True
ORD256447,ID1404216319,2019-12-13,,"[('Candle Inferno', 1), ('Olivia x460', 1), ('...",6015.0,77.47,-37.801138,144.988605,25,4588.72,Summer,False,1.1454,i love this phone i love this phone. very fast...,True
ORD159597,ID2632208068,2019-08-18,,"[('Lucent 330S', 2), ('Candle Inferno', 2), ('...",9630.0,80.4,-37.819355,144.989577,10,8747.4,Winter,True,1.1544,quality love it,True
ORD474381,ID2776391626,2019-12-11,,"[('Olivia x460', 2), ('Thunder line', 2), ('Ca...",7670.0,75.06,-37.810971,144.963102,25,5827.56,Summer,False,1.0207,what's not to like? this phone does everything...,True
ORD372144,ID0814520160,2019-12-11,,"[('iAssist Line', 2), ('Olivia x460', 1), ('Al...",18945.0,96.53,-37.812604,144.957172,15,16199.78,Summer,True,0.8885,"it’s good, but it’s not compatible with my car...",True
ORD052805,ID4234925766,2019-05-19,,"[('Universe Note', 2), ('Candle Inferno', 2), ...",9940.0,68.65,-37.804575,144.967468,15,8517.65,Autumn,True,1.5714,accessories not working properly. i have used ...,False
ORD222346,ID0368376953,2019-10-22,,"[('Olivia x460', 2), ('Universe Note', 1), ('L...",8360.0,81.02,-37.802945,144.981699,0,8441.02,Spring,False,1.4257,i liked thank you but don't have headphones 🤔 ...,True
ORD229269,ID0582156569,2019-12-31,,"[('Alcon 10', 1), ('pearTV', 1), ('iAssist Lin...",21805.0,97.96,-37.788265,144.929202,25,16451.71,Summer,False,3.1388,great product. great product came as described...,True
ORD433678,ID0176736472,2019-09-04,,"[('Olivia x460', 1), ('iAssist Line', 1), ('pe...",16070.0,97.81,-37.81412,144.968115,0,16167.81,Spring,True,0.5139,amazing!,True
ORD075737,ID2702095693,2019-10-22,,"[('iAssist Line', 2), ('pearTV', 2), ('Toshika...",25710.0,73.29,-37.810628,144.990476,15,21926.79,Spring,False,0.4241,then this phone is great for drawing on the go...,True


In [953]:
# checking the missing values for nearest_warehouse
nearest_warehouse_indexto_update = df_missing_data[df_missing_data['nearest_warehouse'].isnull()].index

print(nearest_warehouse_indexto_update)

Index(['ORD447851', 'ORD256447', 'ORD159597', 'ORD474381', 'ORD372144',
       'ORD052805', 'ORD222346', 'ORD229269', 'ORD433678', 'ORD075737'],
      dtype='object', name='order_id')


In [958]:
# updating missing values for season
for i, row in nearest_warehouse_to_update.iterrows():
    df_missing_data.at[i,'nearest_warehouse'] = get_missing_nearest_warehouse(df_missing_data.at[i,'customer_lat'],df_missing_data.at[i,'customer_long'])[0]

In [959]:
# no more missing values for season
print(df_missing_data[df_missing_data['nearest_warehouse'].isnull()])

Empty DataFrame
Columns: [customer_id, date, nearest_warehouse, shopping_cart, order_price, delivery_charges, customer_lat, customer_long, coupon_discount, order_total, season, is_expedited_delivery, distance_to_nearest_warehouse, latest_customer_review, is_happy_customer]
Index: []


In [960]:
#check the indexes for the update 'nearest_warehouse' --  all done
df_missing_data.loc[nearest_warehouse_indexto_update]

Unnamed: 0_level_0,customer_id,date,nearest_warehouse,shopping_cart,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,season,is_expedited_delivery,distance_to_nearest_warehouse,latest_customer_review,is_happy_customer
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ORD447851,ID0591430562,2019-02-07,Nickolson,"[('Alcon 10', 2), ('Toshika 750', 2), ('Univer...",32440.0,67.27,-37.817412,144.96759,5,30885.27,Summer,False,0.2169,"five stars it was a gift, my nephew is loving it.",True
ORD256447,ID1404216319,2019-12-13,Bakers,"[('Candle Inferno', 1), ('Olivia x460', 1), ('...",6015.0,77.47,-37.801138,144.988605,25,4588.72,Summer,False,1.1454,i love this phone i love this phone. very fast...,True
ORD159597,ID2632208068,2019-08-18,Bakers,"[('Lucent 330S', 2), ('Candle Inferno', 2), ('...",9630.0,80.4,-37.819355,144.989577,10,8747.4,Winter,True,1.1544,quality love it,True
ORD474381,ID2776391626,2019-12-11,Nickolson,"[('Olivia x460', 2), ('Thunder line', 2), ('Ca...",7670.0,75.06,-37.810971,144.963102,25,5827.56,Summer,False,1.0207,what's not to like? this phone does everything...,True
ORD372144,ID0814520160,2019-12-11,Thompson,"[('iAssist Line', 2), ('Olivia x460', 1), ('Al...",18945.0,96.53,-37.812604,144.957172,15,16199.78,Summer,True,0.8885,"it’s good, but it’s not compatible with my car...",True
ORD052805,ID4234925766,2019-05-19,Nickolson,"[('Universe Note', 2), ('Candle Inferno', 2), ...",9940.0,68.65,-37.804575,144.967468,15,8517.65,Autumn,True,1.5714,accessories not working properly. i have used ...,False
ORD222346,ID0368376953,2019-10-22,Bakers,"[('Olivia x460', 2), ('Universe Note', 1), ('L...",8360.0,81.02,-37.802945,144.981699,0,8441.02,Spring,False,1.4257,i liked thank you but don't have headphones 🤔 ...,True
ORD229269,ID0582156569,2019-12-31,Thompson,"[('Alcon 10', 1), ('pearTV', 1), ('iAssist Lin...",21805.0,97.96,-37.788265,144.929202,25,16451.71,Summer,False,3.1388,great product. great product came as described...,True
ORD433678,ID0176736472,2019-09-04,Nickolson,"[('Olivia x460', 1), ('iAssist Line', 1), ('pe...",16070.0,97.81,-37.81412,144.968115,0,16167.81,Spring,True,0.5139,amazing!,True
ORD075737,ID2702095693,2019-10-22,Bakers,"[('iAssist Line', 2), ('pearTV', 2), ('Toshika...",25710.0,73.29,-37.810628,144.990476,15,21926.79,Spring,False,0.4241,then this phone is great for drawing on the go...,True


Unnamed: 0_level_0,customer_id,date,nearest_warehouse,shopping_cart,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,season,is_expedited_delivery,distance_to_nearest_warehouse,latest_customer_review,is_happy_customer
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1


### missing 'order_price'

In [None]:
# show missing 'order_price' rows
df_missing_data[df_missing_data['order_price'].isnull()]

In [None]:
# getting the index for the missing values for order_price
order_price_indexto_update = df_missing_data[df_missing_data['order_price'].isnull()].index

In [None]:
# updating the missing values for order_price
for i in order_price_indexto_update:
    df_missing_data.at[i,'order_price'] = get_missing_order_price(df_missing_data.at[i,'order_total'],df_missing_data.at[i,'delivery_charges'],df_missing_data.at[i,'coupon_discount'])

In [None]:
# no more missing values for order_price
df_missing_data[df_missing_data['order_price'].isnull()]

In [None]:
# check the indexes 'order_price' --  all done
for i in order_price_indexto_update:
    print(df_missing_data.at[i,'order_price'])

In [None]:
df_missing_data.filter(items=["order_price"])

### missing 'order_total'

In [None]:
# show missing 'order_total' rows
df_missing_data[df_missing_data['order_total'].isnull()]

In [None]:
# getting the index for the missing values for order_total
order_total_indexto_update = df_missing_data[df_missing_data['order_total'].isnull()].index

In [None]:
# updating the missing values for order_total
for i in order_total_indexto_update:
    df_missing_data.at[i,'order_total'] = get_missing_order_price(df_missing_data.at[i,'order_price'],df_missing_data.at[i,'delivery_charges'],df_missing_data.at[i,'coupon_discount'])

In [None]:
# no more missing values for order_total
df_missing_data[df_missing_data['order_total'].isnull()]

In [None]:
# check the indexes 'order_total' --  all done
for i in order_total_indexto_update:
    print(df_missing_data.at[i,'order_total'])

In [None]:
df_missing_data.filter(items=["order_total"])

### missing 'season'

In [871]:
# show missing 'season' rows
df_missing_data[df_missing_data['season'].isnull()]

Unnamed: 0_level_0,customer_id,date,nearest_warehouse,shopping_cart,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,season,is_expedited_delivery,distance_to_nearest_warehouse,latest_customer_review,is_happy_customer
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ORD279446,ID0370751503,2019-03-24,Bakers,"[('Candle Inferno', 2), ('Universe Note', 1)]",4310.0,78.75,-37.812941,144.985883,15,3742.25,,True,0.8852,best bang for your buck! great overall android...,True
ORD083901,ID5457962366,2019-09-14,Thompson,"[('Toshika 750', 1), ('pearTV', 1), ('iStream'...",10780.0,78.41,-37.822479,144.939978,15,9241.41,,False,1.2571,excelent product excellent relationship qualit...,True
ORD250242,ID0287630803,2019-10-13,Thompson,"[('Olivia x460', 2), ('iStream', 2)]",2750.0,100.85,-37.806362,144.946628,5,2713.35,,True,0.7036,wtc as of right now it's more of a paper weigh...,True
ORD078069,ID1458418088,2019-01-12,Bakers,"[('Lucent 330S', 1), ('Toshika 750', 2), ('pea...",16180.0,101.81,-37.81967,145.010019,0,16281.81,,True,1.6884,excelente el producto llego en perfectas condi...,True
ORD016552,ID0283334264,2019-10-20,Thompson,"[('iAssist Line', 1), ('Universe Note', 1), ('...",10035.0,83.29,-37.80134,144.95612,25,7609.54,,False,1.4917,saves you money and trustworthy. i love my pho...,True
ORD429692,ID0575539547,2019-10-27,Nickolson,"[('iAssist Line', 1), ('Alcon 10', 2), ('pearT...",32745.0,79.86,-37.823686,144.982575,15,27913.11,,False,1.2778,overall good phone fingerprint not so good,True
ORD434639,ID1449297341,2019-06-03,Nickolson,"[('Thunder line', 1), ('iStream', 2), ('Univer...",9380.0,51.44,-37.815768,144.980178,25,7086.44,,False,0.986,doesn't work arrived with completely dead batt...,False
ORD032506,ID0628177290,2019-06-28,Thompson,"[('iStream', 2), ('Alcon 10', 2), ('Candle Inf...",18630.0,61.78,-37.811359,144.947461,5,17760.28,,False,0.1503,photo quality is amazing. the phone is fast an...,True
ORD027479,ID0493051199,2019-06-19,Bakers,"[('Candle Inferno', 2), ('Alcon 10', 1), ('iSt...",9960.0,63.76,-37.807695,144.989743,5,9525.76,,False,0.5465,perfect starter phone fantastic phone for y 11...,True
ORD437147,ID0052450505,2019-10-31,Thompson,"[('iAssist Line', 2), ('Alcon 10', 2)]",22350.0,85.96,-37.795479,144.936073,15,19083.46,,False,2.1445,this was a gift for a family member they reall...,True


In [872]:
# getting the distinct values for season
print(df_missing_data['season'].sort_values().unique())

['Autumn' 'Spring' 'Summer' 'Winter' nan]


In [873]:
# get the seasons months from the grouping of the orders..obviously Southern hemisphere - a random lat long shows its Melbourne, Australia area
df_missing_data.groupby([pd.Grouper(key='date', axis=0, freq='M'),'season']).sum().filter(items=['date','season'])
# summer = months 12,1,2
# Autumn = months 3,4,5
# Winter = months 6,7,8
# Spring = months 9,10,11

date,season
2019-01-31,Summer
2019-02-28,Summer
2019-03-31,Autumn
2019-04-30,Autumn
2019-05-31,Autumn
2019-06-30,Winter
2019-07-31,Winter
2019-08-31,Winter
2019-09-30,Spring
2019-10-31,Spring


In [874]:
# getting rows for the missing values for season
season_to_update = df_missing_data[df_missing_data['season'].isnull()]

In [875]:
# getting the index for the missing values for season
season_to_update_index = df_missing_data[df_missing_data['season'].isnull()].index

In [887]:
# updating missing values for season
for i, row in season_indexto_update.iterrows():
    df_missing_data.at[i,'season'] = get_missing_season(row['date'])

In [888]:
# no more missing values for season
df_missing_data[df_missing_data['season'].isnull()]

Unnamed: 0_level_0,customer_id,date,nearest_warehouse,shopping_cart,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,season,is_expedited_delivery,distance_to_nearest_warehouse,latest_customer_review,is_happy_customer
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ORD279446,ID0370751503,2019-03-24,Bakers,"[('Candle Inferno', 2), ('Universe Note', 1)]",4310.0,78.75,-37.812941,144.985883,15,3742.25,,True,0.8852,best bang for your buck! great overall android...,True
ORD083901,ID5457962366,2019-09-14,Thompson,"[('Toshika 750', 1), ('pearTV', 1), ('iStream'...",10780.0,78.41,-37.822479,144.939978,15,9241.41,,False,1.2571,excelent product excellent relationship qualit...,True
ORD250242,ID0287630803,2019-10-13,Thompson,"[('Olivia x460', 2), ('iStream', 2)]",2750.0,100.85,-37.806362,144.946628,5,2713.35,,True,0.7036,wtc as of right now it's more of a paper weigh...,True
ORD078069,ID1458418088,2019-01-12,Bakers,"[('Lucent 330S', 1), ('Toshika 750', 2), ('pea...",16180.0,101.81,-37.81967,145.010019,0,16281.81,,True,1.6884,excelente el producto llego en perfectas condi...,True
ORD016552,ID0283334264,2019-10-20,Thompson,"[('iAssist Line', 1), ('Universe Note', 1), ('...",10035.0,83.29,-37.80134,144.95612,25,7609.54,,False,1.4917,saves you money and trustworthy. i love my pho...,True
ORD429692,ID0575539547,2019-10-27,Nickolson,"[('iAssist Line', 1), ('Alcon 10', 2), ('pearT...",32745.0,79.86,-37.823686,144.982575,15,27913.11,,False,1.2778,overall good phone fingerprint not so good,True
ORD434639,ID1449297341,2019-06-03,Nickolson,"[('Thunder line', 1), ('iStream', 2), ('Univer...",9380.0,51.44,-37.815768,144.980178,25,7086.44,,False,0.986,doesn't work arrived with completely dead batt...,False
ORD032506,ID0628177290,2019-06-28,Thompson,"[('iStream', 2), ('Alcon 10', 2), ('Candle Inf...",18630.0,61.78,-37.811359,144.947461,5,17760.28,,False,0.1503,photo quality is amazing. the phone is fast an...,True
ORD027479,ID0493051199,2019-06-19,Bakers,"[('Candle Inferno', 2), ('Alcon 10', 1), ('iSt...",9960.0,63.76,-37.807695,144.989743,5,9525.76,,False,0.5465,perfect starter phone fantastic phone for y 11...,True
ORD437147,ID0052450505,2019-10-31,Thompson,"[('iAssist Line', 2), ('Alcon 10', 2)]",22350.0,85.96,-37.795479,144.936073,15,19083.46,,False,2.1445,this was a gift for a family member they reall...,True


In [None]:
#check the indexes for the update 'season' --  all done
for row in season_to_update_index.iterrows():
    row['season']

### missing 'distance_to_nearest_warehouse'

In [None]:
# show missing 'distance_to_nearest_warehouse' rows
df_missing_data[df_missing_data['distance_to_nearest_warehouse'].isnull()]

In [None]:
# getting the index for the missing values for nearest_warehouse
distance_to_nearest_warehouse_indexto_update = df_missing_data[df_missing_data['distance_to_nearest_warehouse'].isnull()].index

In [None]:
# updating  missing values for nearest_warehouse
for i in distance_to_nearest_warehouse_indexto_update:
    df_missing_data.at[i,'distance_to_nearest_warehouse'] = get_missing_nearest_warehouse(df_missing_data.at[i,'customer_lat'],df_missing_data.at[i,'customer_long'])[1]

In [None]:
# no more missing values for nearest_warehouse
df_missing_data[df_missing_data['distance_to_nearest_warehouse'].isnull()].index

In [None]:
# check the indexes 'nearest_warehouse' --  all done
for i in nearest_warehouse_indexto_update:
    print(df_missing_data.at[i,'distance_to_nearest_warehouse'])

In [None]:
df_missing_data.filter(items=["distance_to_nearest_warehouse"])

### missing 'customer_lat'

In [None]:
# show missing 'customer_lat' rows
df_missing_data[df_missing_data['customer_lat'].isnull()]

In [None]:
# getting the index for the missing values for customer_lat
customer_lat_indexto_update = df_missing_data[df_missing_data['customer_lat'].isnull()].index
customer_lat_indexto_update

In [None]:
df_missing_data[df_missing_data.duplicated(['customer_id'])]

### missing 'customer_long'

In [None]:
# show missing 'customer_long' rows
df_missing_data[df_missing_data['customer_long'].isnull()]

In [None]:
# getting the index for the missing values for customer_long
customer_long_indexto_update = df_missing_data[df_missing_data['customer_long'].isnull()].index
customer_long_indexto_update

## Analyse
***

In [None]:
print(df_missing_data.columns)

## Visualise
***