# Kieran Molloy UCDPA Project 
### Course CIDAB 2022-01-18
***

### Environment Information
<table align="left">
<tr>
    <th>Environment Type</th>
    <th>Anaconda Version</th>
    <th>Anaconda Build Channel</th>
    <th>Python Version</th>
</tr>
<tr>
    <td>Anaconda </td>
    <td>2021.11 </td>
    <td>py39_0 </td>
    <td>3.9.7 </td>
</tr>
</table>

## Links to Kaggle Datasource: 
***

#### _[Transactional Retail Dataset of Electronics Store](https://www.kaggle.com/datasets/muhammadshahrayar/transactional-retail-dataset-of-electronics-store)_

*  _[dirty_data.csv](https://www.kaggle.com/datasets/muhammadshahrayar/transactional-retail-dataset-of-electronics-store?select=dirty_data.csv)_
*  _[missing_data.csv](https://www.kaggle.com/datasets/muhammadshahrayar/transactional-retail-dataset-of-electronics-store?select=missing_data.csv)_
*  _[warehouses.csv](https://www.kaggle.com/datasets/muhammadshahrayar/transactional-retail-dataset-of-electronics-store?select=warehouses.csv)_


## Install Modules

In [None]:
# %pip install geopy

## Import Modules
***

In [None]:
from matplotlib import pyplot as plt
from IPython.display import display 

import pandas as pd
import numpy as np
import seaborn as sns

from datetime import datetime

from geopy.distance import geodesic

## Set Variables
***

In [None]:
# file names of the three Kaggle CSVs, read from the notebook's working directory
dirty_data, missing_data, warehouses = (
    'dirty_data.csv',
    'missing_data.csv',
    'warehouses.csv',
)

## Functions
***

In [None]:
def get_missing_nearest_warehouse(lat , Long ):
    '''Use coordinates to find the closest warehouse (as the crow flies).

    Reads the 'lat' and 'lon' columns of the first three rows of the
    module-level df_warehouses dataframe and compares the geodesic distance
    from the customer to each of them.

    Parameters:
        lat:  customer latitude
        Long: customer longitude

    Returns:
        (warehouse, distance) - the name of the nearest warehouse and the
        distance to it in kilometres.
    '''

    # customer coordinates are provided
    coords_customer = (lat, Long)

    # Names are matched to df_warehouses rows by position.
    # NOTE(review): assumes the rows of warehouses.csv are ordered
    # Nickolson, Thompson, Bakers - confirm against the file.
    warehouse_names = ('Nickolson', 'Thompson', 'Bakers')

    # use geodesic to get the km distance from the customer to each warehouse
    distances = []
    for position in range(len(warehouse_names)):
        coords_warehouse = (
            df_warehouses['lat'].iloc[position],
            df_warehouses['lon'].iloc[position],
        )
        distances.append(geodesic(coords_customer, coords_warehouse).km)

    # first minimum wins on a tie, same as tuple.index(min(...))
    nearest = distances.index(min(distances))

    # return the distance that belongs to the chosen warehouse
    # (previously the Nickolson distance was returned for every warehouse)
    return (warehouse_names[nearest], distances[nearest])

In [None]:
def get_missing_order_price(order_total, delivery_charges, coupon_discount):
    '''Back out order_price from order_total, delivery_charges and coupon_discount.

    coupon_discount is a percentage (0-100). The result is rounded to 2 decimals.
    '''
    # what the customer paid for the goods after the coupon was applied
    discounted_price = order_total - delivery_charges
    # share of the original price (in percent) that remains after the coupon
    percent_paid = 100 - coupon_discount
    return round(discounted_price / percent_paid * 100, 2)

In [None]:
def get_missing_order_total(order_price, delivery_charges, coupon_discount):
    '''Derive order_total from order_price, delivery_charges and coupon_discount.

    coupon_discount is a percentage taken off order_price before the delivery
    charge is added. The result is rounded to 2 decimals.
    '''
    # amount taken off the order price by the coupon
    discount_amount = order_price / 100 * coupon_discount
    return round(order_price - discount_amount + delivery_charges, 2)

In [None]:
def get_missing_season(date):
    '''Return the season name for a date, based on its month.

    Spring = months 9,10,11
    Summer = months 12,1,2
    Autumn = months 3,4,5
    Winter = any other month (6,7,8)
    '''
    season_by_month = {
        9: 'Spring', 10: 'Spring', 11: 'Spring',
        12: 'Summer', 1: 'Summer', 2: 'Summer',
        3: 'Autumn', 4: 'Autumn', 5: 'Autumn',
    }
    # anything not listed above falls through to Winter
    return season_by_month.get(date.month, 'Winter')

## CSV import 
***

In [None]:
# load the three CSV files from the local Jupyter Notebook directory,
# one dataframe per file, with pandas' default parsing
df_dirty_data, df_missing_data, df_warehouses = (
    pd.read_csv(csv_name) for csv_name in (dirty_data, missing_data, warehouses)
)

## Initial CSV Review
***

### df_dirty_data dataframe

In [None]:
type(df_dirty_data)

In [None]:
print(df_dirty_data.columns)

In [None]:
df_dirty_data.head(10)

In [None]:
# using display for better formatting in Notebook , use print usually
display(df_dirty_data.describe())

In [None]:
print(df_dirty_data.info())

In [None]:
print(df_dirty_data.shape)

In [None]:
print(df_dirty_data.isna().any())

In [None]:
print(df_dirty_data.isnull().sum())

### df_missing_data dataframe

In [None]:
type(df_missing_data)

In [None]:
print(df_missing_data.columns)

In [None]:
df_missing_data.head(10)

In [None]:
# using display for better formatting in Notebook , use print usually
display(df_missing_data.describe())

In [None]:
print(df_missing_data.info())

In [None]:
print(df_missing_data.shape)

In [None]:
print(df_missing_data.isna().any())

In [None]:
print(df_missing_data.isnull().sum())

### df_warehouses dataframe

In [None]:
type(df_warehouses)

In [None]:
print(df_warehouses.columns)

In [None]:
type(df_warehouses)

In [None]:
df_warehouses.head()

In [None]:
# using display for better formatting in Notebook , use print usually
display(df_warehouses.describe())

In [None]:
print(df_warehouses.info())

In [None]:
print(df_warehouses.shape)

In [None]:
print(df_warehouses.isna().any())

In [None]:
print(df_warehouses.isnull().sum())

## Re-import from CSV 
***

In [None]:
# re-read the two order files from the local Jupyter Notebook directory,
# this time parsing 'date' on import since the dates were loaded as objects
df_dirty_data = pd.read_csv(
    dirty_data,
    parse_dates=['date'],
    dtype={'order_price': 'float'},
)
df_missing_data = pd.read_csv(
    missing_data,
    parse_dates=['date'],
)
# dtype={'is_happy_customer': 'bool'} cannot be used because is_happy_customer contains NaN

In [None]:
print(df_dirty_data.info())

In [None]:
print(df_missing_data.info())

In [None]:
# order_id uniqueness check on df_dirty_data:
# keep every row whose order_id repeats an earlier row (empty frame = all unique)
dirty_repeat_mask = df_dirty_data.duplicated(subset=['order_id'])
duplicate_dirty_data = df_dirty_data[dirty_repeat_mask]
print(duplicate_dirty_data)

In [None]:
# order_id uniqueness check on df_missing_data:
# keep every row whose order_id repeats an earlier row (empty frame = all unique)
missing_repeat_mask = df_missing_data.duplicated(subset=['order_id'])
duplicate_missing_data = df_missing_data[missing_repeat_mask]
print(duplicate_missing_data)

In [None]:
# create an order_id index on df_dirty_data
df_dirty_data.set_index('order_id',inplace=True)

In [None]:
# create an order_id index on df_missing_data
df_missing_data.set_index('order_id',inplace=True)

In [None]:
df_dirty_data.index

In [None]:
df_missing_data.index

## Detect and fix errors in dirty_data
***

## Fix missing values in missing_data
***

In [None]:
# show missing values
print(df_missing_data.isnull().sum())

### missing 'nearest_warehouse'

In [None]:
# show missing 'nearest_warehouse' rows
df_missing_data[df_missing_data['nearest_warehouse'].isnull()]

In [None]:
# getting the index (order_id) of the rows with a missing nearest_warehouse
nearest_warehouse_indexto_update = df_missing_data.index[df_missing_data['nearest_warehouse'].isnull()]

In [None]:
# fill each missing nearest_warehouse with the name of the warehouse
# closest to the customer's coordinates
for i in nearest_warehouse_indexto_update:
    cust_lat = df_missing_data.at[i,'customer_lat']
    cust_long = df_missing_data.at[i,'customer_long']
    # element [0] of the result is the warehouse name
    df_missing_data.at[i,'nearest_warehouse'] = get_missing_nearest_warehouse(cust_lat, cust_long)[0]

In [None]:
# no more missing values for nearest_warehouse
df_missing_data[df_missing_data['nearest_warehouse'].isnull()].index

In [None]:
# check the indexes 'nearest_warehouse' --  all done
for i in nearest_warehouse_indexto_update:
    print(df_missing_data.at[i,'nearest_warehouse'])

In [None]:
df_missing_data.filter(items=["nearest_warehouse"])

### missing 'order_price'

In [None]:
# show missing 'order_price' rows
df_missing_data[df_missing_data['order_price'].isnull()]

In [None]:
# getting the index for the missing values for order_price
order_price_indexto_update = df_missing_data[df_missing_data['order_price'].isnull()].index

In [None]:
# fill each missing order_price from the row's order_total,
# delivery_charges and coupon_discount
for i in order_price_indexto_update:
    row_total = df_missing_data.at[i,'order_total']
    row_delivery = df_missing_data.at[i,'delivery_charges']
    row_coupon = df_missing_data.at[i,'coupon_discount']
    df_missing_data.at[i,'order_price'] = get_missing_order_price(row_total, row_delivery, row_coupon)

In [None]:
# no more missing values for order_price
df_missing_data[df_missing_data['order_price'].isnull()]

In [None]:
# check the indexes 'order_price' --  all done
for i in order_price_indexto_update:
    print(df_missing_data.at[i,'order_price'])

In [None]:
df_missing_data.filter(items=["order_price"])

### missing 'order_total'

In [None]:
# show missing 'order_total' rows
df_missing_data[df_missing_data['order_total'].isnull()]

In [None]:
# getting the index for the missing values for order_total
order_total_indexto_update = df_missing_data[df_missing_data['order_total'].isnull()].index

In [None]:
# updating the missing values for order_total
# order_total is derived from order_price, so get_missing_order_total must be used here
# (get_missing_order_price applies the inverse formula and gives the wrong amount)
for i in order_total_indexto_update:
    df_missing_data.at[i,'order_total'] = get_missing_order_total(df_missing_data.at[i,'order_price'],df_missing_data.at[i,'delivery_charges'],df_missing_data.at[i,'coupon_discount'])

In [None]:
# no more missing values for order_total
df_missing_data[df_missing_data['order_total'].isnull()]

In [None]:
# check the indexes 'order_total' --  all done
for i in order_total_indexto_update:
    print(df_missing_data.at[i,'order_total'])

In [None]:
df_missing_data.filter(items=["order_total"])

### missing 'season'

In [None]:
# show missing 'season' rows
df_missing_data[df_missing_data['season'].isnull()]

In [None]:
# getting the distinct values for season
print(df_missing_data['season'].sort_values().unique())

In [None]:
# get the season months from the grouping of the orders: count orders per (month, season).
# Obviously Southern hemisphere - a random lat long shows it is the Melbourne, Australia area.
# size() is used because 'date' and 'season' become index levels after the groupby,
# so there are no columns left to select and nothing worth summing.
# NOTE(review): freq='M' (month end) matches the pandas shipped with the Anaconda build listed
# above; newer pandas releases spell this alias 'ME' - confirm if the environment is upgraded.
df_missing_data.groupby([pd.Grouper(key='date', freq='M'), 'season']).size()
# Summer = months 12,1,2
# Autumn = months 3,4,5
# Winter = months 6,7,8
# Spring = months 9,10,11

In [None]:
# getting the index and rows for the missing values for season
season_to_update = df_missing_data[df_missing_data['season'].isnull()]
season_to_update_index = df_missing_data[df_missing_data['season'].isnull()].index

In [None]:
# updating missing values for season
# iterate over the order_id index of the rows with no season and write the result back
# into df_missing_data with .at (assigning to an iterrows() row would not change the dataframe)
for i in season_to_update_index:
    df_missing_data.at[i,'season'] = get_missing_season(df_missing_data.at[i,'date'])

In [None]:
# no more missing values for season
df_missing_data[df_missing_data['season'].isnull()]

In [None]:
# check the indexes for the update 'season' --  all done
# season_to_update_index is an Index of order_ids, so iterate it directly and print each value
for i in season_to_update_index:
    print(df_missing_data.at[i,'season'])

### missing 'distance_to_nearest_warehouse'

In [None]:
# show missing 'distance_to_nearest_warehouse' rows
df_missing_data[df_missing_data['distance_to_nearest_warehouse'].isnull()]

In [None]:
# getting the index (order_id) of the rows with a missing distance_to_nearest_warehouse
distance_to_nearest_warehouse_indexto_update = df_missing_data.index[df_missing_data['distance_to_nearest_warehouse'].isnull()]

In [None]:
# fill each missing distance_to_nearest_warehouse with the distance from the
# customer's coordinates to the closest warehouse
for i in distance_to_nearest_warehouse_indexto_update:
    cust_lat = df_missing_data.at[i,'customer_lat']
    cust_long = df_missing_data.at[i,'customer_long']
    # element [1] of the result is the distance
    df_missing_data.at[i,'distance_to_nearest_warehouse'] = get_missing_nearest_warehouse(cust_lat, cust_long)[1]

In [None]:
# no more missing values for distance_to_nearest_warehouse (expect an empty index)
df_missing_data.index[df_missing_data['distance_to_nearest_warehouse'].isnull()]

In [None]:
# check the indexes 'distance_to_nearest_warehouse' --  all done
# iterate the rows whose distance was filled in (not the nearest_warehouse rows)
for i in distance_to_nearest_warehouse_indexto_update:
    print(df_missing_data.at[i,'distance_to_nearest_warehouse'])

In [None]:
df_missing_data.filter(items=["distance_to_nearest_warehouse"])

### missing 'customer_lat'

In [None]:
# show missing 'customer_lat' rows
df_missing_data[df_missing_data['customer_lat'].isnull()]

In [None]:
# getting the index for the missing values for customer_lat
customer_lat_indexto_update = df_missing_data[df_missing_data['customer_lat'].isnull()].index
customer_lat_indexto_update

In [None]:
df_missing_data[df_missing_data.duplicated(['customer_id'])]

### missing 'customer_long'

In [None]:
# show missing 'customer_long' rows
df_missing_data[df_missing_data['customer_long'].isnull()]

In [None]:
# getting the index for the missing values for customer_long
customer_long_indexto_update = df_missing_data[df_missing_data['customer_long'].isnull()].index
customer_long_indexto_update

## Analyse
***

In [None]:
print(df_missing_data.columns)

## Visualise
***