# Kieran Molloy UCDPA Project 
### Course CIDAB 2022-01-18
***

### Environment Information
<table align="left">
<tr>
    <th>Environment Type</th>
    <th>Anaconda Version</th>
    <th>Anaconda Build Channel</th>
    <th>Python Version</th>
</tr>
<tr>
    <td>Anaconda </td>
    <td>2021.11 </td>
    <td>py39_0 </td>
    <td>3.9.7 </td>
</tr>
</table>

## Links to Kaggle Datasource: 
***

#### _[Transactional Retail Dataset of Electronics Store](https://www.kaggle.com/datasets/muhammadshahrayar/transactional-retail-dataset-of-electronics-store)_

*  _[dirty_data.csv](https://www.kaggle.com/datasets/muhammadshahrayar/transactional-retail-dataset-of-electronics-store?select=dirty_data.csv)_
*  _[missing_data.csv](https://www.kaggle.com/datasets/muhammadshahrayar/transactional-retail-dataset-of-electronics-store?select=missing_data.csv)_
*  _[warehouses.csv](https://www.kaggle.com/datasets/muhammadshahrayar/transactional-retail-dataset-of-electronics-store?select=warehouses.csv)_


## Install Modules

In [None]:
# %pip install geopy

## Import Modules
***

In [None]:
from matplotlib import pyplot as plt
from IPython.display import display 

import pandas as pd
import numpy as np
import seaborn as sns

from geopy.distance import geodesic

## Set Variables
***

In [None]:
dirty_data = 'dirty_data.csv'
missing_data = 'missing_data.csv'
warehouses = 'warehouses.csv'

## Functions
***

In [None]:
def get_missing_nearest_warehouse(lat , Long ):
    ''' use coordinates to find the closest warehouse '''
    
    # customer coordinates are provided
    coords_customer = lat, Long
    
    # get warehouse latitude and longitide from warehouse table
    coords_Nickolson = df_warehouses['lat'].loc[df_warehouses.index[0]], df_warehouses['lon'].loc[df_warehouses.index[0]]
    coords_Thompson = df_warehouses['lat'].loc[df_warehouses.index[1]], df_warehouses['lon'].loc[df_warehouses.index[1]]
    coords_Bakers = df_warehouses['lat'].loc[df_warehouses.index[2]], df_warehouses['lon'].loc[df_warehouses.index[2]]
    
    # use geodesic to do a km distance comparison
    Nickolson_cust_dist = geodesic(coords_customer, coords_Nickolson).km
    Thompson_cust_dist = geodesic(coords_customer, coords_Thompson).km
    Bakers_cust_dist = geodesic(coords_customer, coords_Bakers).km
    
    val = (Nickolson_cust_dist ,Thompson_cust_dist ,Bakers_cust_dist)
    
    x = val.index(min(val))
    
    if x == 0:
        warehouse = 'Nickolson'
        distance = Nickolson_cust_dist
        return(warehouse , distance)
    elif x == 1:
        warehouse = 'Thompson'
        distance = Nickolson_cust_dist
        return(warehouse , distance)
    else: 
        warehouse = 'Bakers'
        distance = Nickolson_cust_dist
        return(warehouse , distance)

In [None]:
def get_missing_order_price(order_total, delivery_charges, coupon_discount):
    ''' calculate order_price from order_total, delivery_charges and coupon_discount '''
    order_price = (order_total - delivery_charges) / (100-coupon_discount) * 100
    return(round(order_price, 2))

In [None]:
def get_missing_order_total(order_price, delivery_charges, coupon_discount):
    ''' calculate order_total from order_price, delivery_charges and coupon_discount'''
    order_total = order_price - (order_price / 100 * coupon_discount) + delivery_charges
    return(round(order_total, 2))

In [None]:
get_missing_order_total(9080,62.71,0)

## CSV import 
***

In [None]:
# read the CSV from the local Jupyter Notebook directory 
df_dirty_data = pd.read_csv(dirty_data)
df_missing_data = pd.read_csv(missing_data)
df_warehouses = pd.read_csv(warehouses)

## Initial CSV Review
***

### df_dirty_data dataframe

In [None]:
type(df_dirty_data)

In [None]:
print(df_dirty_data.columns)

In [None]:
df_dirty_data.head(10)

In [None]:
# using display for better formatting in Notebook , use print usually
display(df_dirty_data.describe())

In [None]:
print(df_dirty_data.info())

In [None]:
print(df_dirty_data.shape)

In [None]:
print(df_dirty_data.isna().any())

In [None]:
print(df_dirty_data.isnull().sum())

### df_missing_data dataframe

In [None]:
type(df_missing_data)

In [None]:
print(df_missing_data.columns)

In [None]:
df_missing_data.head(10)

In [None]:
# using display for better formatting in Notebook , use print usually
display(df_missing_data.describe())

In [None]:
print(df_missing_data.info())

In [None]:
print(df_missing_data.shape)

In [None]:
print(df_missing_data.isna().any())

In [321]:
print(df_missing_data.isnull().sum())

order_id                          0
customer_id                       0
date                              0
nearest_warehouse                 0
shopping_cart                     0
order_price                       0
delivery_charges                  0
customer_lat                     10
customer_long                    10
coupon_discount                   0
order_total                       0
season                           10
is_expedited_delivery             0
distance_to_nearest_warehouse     0
latest_customer_review            0
is_happy_customer                10
dtype: int64


### df_warehouses dataframe

In [None]:
type(df_warehouses)

In [None]:
print(df_warehouses.columns)

In [None]:
type(df_warehouses)

In [None]:
df_warehouses.head()

In [None]:
# using display for better formatting in Notebook , use print usually
display(df_warehouses.describe())

In [None]:
print(df_warehouses.info())

In [None]:
print(df_warehouses.shape)

In [None]:
print(df_warehouses.isna().any())

In [None]:
print(df_warehouses.isnull().sum())

## Re-import from CSV 
***

In [None]:
# read the CSV from the local Jupyter Notebook directory 
# parsing dates on import since dates were objects
df_dirty_data = pd.read_csv(dirty_data, parse_dates=['date'], dtype={'order_price': 'float'}) 
df_missing_data = pd.read_csv(missing_data, parse_dates=['date']) 
#  cant use dtype={'is_happy_customer': 'bool'} because is_happy_customer contains NaN 

In [None]:
print(df_dirty_data.info())

In [None]:
print(df_missing_data.info())

## Detect and fix errors in dirty_data
***

## Fix missing values in missing_data
***

### missing 'nearest_warehouse'

In [None]:
# checking the missing values for nearest_warehouse
nearest_warehouse_indexto_update = df_missing_data[df_missing_data['nearest_warehouse'].isnull()].index

for i in nearest_warehouse_indexto_update:
    df_missing_data.at[i,'nearest_warehouse'] = get_missing_nearest_warehouse(df_missing_data.at[i,'customer_lat'],df_missing_data.at[i,'customer_long'])[0]

In [None]:
# no more missing values for nearest_warehouse
df_missing_data[df_missing_data['nearest_warehouse'].isnull()].index

In [None]:
# check the indexes 'nearest_warehouse' --  all done
for i in nearest_warehouse_indexto_update:
    print(df_missing_data.at[i,'nearest_warehouse'])

In [None]:
df_missing_data.filter(items=["nearest_warehouse"])

### missing 'order_price'

In [None]:
# checking the missing values for order_price
order_price_indexto_update = df_missing_data[df_missing_data['order_price'].isnull()].index

for i in order_price_indexto_update:
    df_missing_data.at[i,'order_price'] = get_missing_order_price(df_missing_data.at[i,'order_total'],df_missing_data.at[i,'delivery_charges'],df_missing_data.at[i,'coupon_discount'])

In [None]:
# no more missing values for order_price
df_missing_data[df_missing_data['order_price'].isnull()]

In [None]:
# check the indexes 'nearest_warehouse' --  all done
for i in order_price_indexto_update:
    print(df_missing_data.at[i,'order_price'])

In [None]:
df_missing_data.filter(items=["order_price"])

### missing 'order_total'

In [None]:
# checking the missing values for order_total
order_total_indexto_update = df_missing_data[df_missing_data['order_total'].isnull()].index

for i in order_total_indexto_update:
    df_missing_data.at[i,'order_total'] = get_missing_order_price(df_missing_data.at[i,'order_price'],df_missing_data.at[i,'delivery_charges'],df_missing_data.at[i,'coupon_discount'])

In [None]:
# no more missing values for order_price
df_missing_data[df_missing_data['order_total'].isnull()]

In [None]:
# check the indexes 'nearest_warehouse' --  all done
for i in order_total_indexto_update:
    print(df_missing_data.at[i,'order_total'])

In [None]:
df_missing_data.filter(items=["order_total"])

### missing 'season'

In [None]:
# checking the missing values for nearest_warehouse
df_missing_data[df_missing_data['season'].isnull()]

In [327]:
print(df_missing_data['season'].sort_values().unique())

['Autumn' 'Spring' 'Summer' 'Winter' nan]


In [344]:
df_missing_data.groupby([pd.Grouper(key='date', axis=0, freq='M'),'season']).sum()
# i can get the seasons months from the grouping of the orders
# summer = months 12,1,2
# Autumn = months 3,4,5
# Winter = months 6,7,8
# Spring = months 9,10,11

Unnamed: 0_level_0,Unnamed: 1_level_0,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,is_expedited_delivery,distance_to_nearest_warehouse
date,season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-01-31,Summer,495455.0,3283.48,-1399.086287,5363.466155,495,438423.98,26,46.467505
2019-02-28,Summer,478370.0,3003.04,-1399.112954,5363.818217,420,428813.77,17,41.350006
2019-03-31,Autumn,511585.0,2601.76,-1323.372207,5363.803675,400,460778.01,18,43.782307
2019-04-30,Autumn,447310.0,2358.29,-1285.562329,4493.944657,310,416252.79,18,37.2884
2019-05-31,Autumn,651095.0,3026.07,-1625.952979,6233.78913,420,586820.97,21,52.880411
2019-06-30,Winter,432455.0,2100.05,-1172.216819,4203.865735,365,386955.04,15,29.8413
2019-07-31,Winter,444170.0,2685.33,-1436.784366,5798.578496,435,396865.83,18,42.722201
2019-08-31,Winter,662775.0,3570.97,-1928.486378,7248.339875,530,591239.22,24,58.9299
2019-09-30,Spring,518470.0,3002.75,-1323.447961,5073.708553,370,471345.25,16,33.2314
2019-10-31,Spring,581710.0,4058.65,-1701.720205,6378.557649,640,520021.21,27,44.3109


In [None]:
df_missing_data['season']

### missing 'distance_to_nearest_warehouse'

In [None]:
# checking the missing values for nearest_warehouse
distance_to_nearest_warehouse_indexto_update = df_missing_data[df_missing_data['distance_to_nearest_warehouse'].isnull()].index

for i in distance_to_nearest_warehouse_indexto_update:
    df_missing_data.at[i,'distance_to_nearest_warehouse'] = get_missing_nearest_warehouse(df_missing_data.at[i,'customer_lat'],df_missing_data.at[i,'customer_long'])[1]

In [None]:
# no more missing values for nearest_warehouse
df_missing_data[df_missing_data['distance_to_nearest_warehouse'].isnull()].index

In [None]:
# check the indexes 'nearest_warehouse' --  all done
for i in nearest_warehouse_indexto_update:
    print(df_missing_data.at[i,'distance_to_nearest_warehouse'])

In [None]:
df_missing_data.filter(items=["distance_to_nearest_warehouse"])

## Analyse
***

In [None]:
print(df_missing_data.columns)

## Visualise
***