# Data Preparation and cleaning

## Libraries

In [13]:
import numpy as np
import pandas as pd
import datetime as dt

import os
import matplotlib.pyplot as plt

%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

## Preparing the dataset

In [16]:
# Load the raw orders export. The file is pipe-delimited and has no header
# row, so columns are addressed by position until they are renamed below.
orders_path = '../orders.csv'
parse_dates = [1, 5]  # positions that become the 'DoB' and 'Order date' columns

# The deprecated `date_parser` / `infer_datetime_format` arguments are not
# passed: `parse_dates` alone already converts the columns with pandas'
# default datetime parser.
df_orders = pd.read_csv(
    orders_path,
    header=None,
    parse_dates=parse_dates,
    sep='|',
    index_col=None,
    encoding='latin1',
    dtype={0: object,
           2: 'category',
           3: object,
           4: object,
           6: object,
           7: 'category',
           8: 'category',
           9: np.int64,
           10: np.float64,
           11: np.int64,
           12: np.int64,
           13: object,
           14: np.int64,
           15: np.float64})

# Drop the last parsed column; the remaining 15 columns are named below.
# NOTE(review): presumably an empty column caused by a trailing '|' on each
# line - confirm against the raw file.
df_orders = df_orders.drop(columns=df_orders.columns[-1])

# The former bare `df_orders.reset_index()` call was removed: its result was
# discarded, so it never changed `df_orders`.

df_orders.columns = ['Customer number', 'DoB', 'Gender', 'PoR', 'Order number',
           'Order date', 'Product number', 'Sub category', 'Category',
           'Count', 'Price', 'EDT', 'ADT', 'RoR', 'Rating']

KeyboardInterrupt: 

DoB - date of birth;

PoR - customer place of residence;

EDT - expected delivery time;

ADT - actual delivery time;

RoR - reason of return

In [12]:
# Render both date columns as day-month-year strings.
for date_column in ("DoB", "Order date"):
    df_orders[date_column] = df_orders[date_column].dt.strftime("%d-%m-%Y")

AttributeError: Can only use .dt accessor with datetimelike values

## Null values
    -> The only null values are found in the column for Reason of return
    -> We will deal with this matter in the Feature engineering section

In [None]:
# Keep only the columns that contain at least one null, then count the
# nulls in each of them (the last expression is displayed by the notebook).
has_nulls = df_orders.isnull().any()
null_columns = df_orders.columns[has_nulls]
df_orders.loc[:, null_columns].isnull().sum()

In [None]:
# Number of non-null values in every column.
df_orders.notna().sum()

## Feature engineering

### "Returned"
    -> binary value
    -> is the product returned or not

In [None]:
# A row counts as returned exactly when a reason of return was recorded.
# The flag is built in one vectorised step so the column is always an
# integer 0/1, instead of a dtype that depends on which masks matched rows.
df_orders['Returned'] = df_orders['RoR'].notnull().astype(int)

# The free-text reason is no longer needed once the flag exists.
df_orders = df_orders.drop(columns='RoR')

### "Price per product"
    -> what is the price of the product ordered

In [None]:
# Unit price: the row's total price divided by the number of items ordered.
df_orders['Price per Product'] = df_orders['Price'].div(df_orders['Count'])

### "Man" / "Woman"
    -> boolean values
    -> is the gender man or woman
    -> the method is called One-Hot-Encoding. It is used instead of replacing the genders with 0 and 1 in the original column, so that future models can differentiate them as categories, not as numbers

In [None]:
# One-hot encode the gender into two indicator columns. Rows whose gender is
# neither "Man" nor "Woman" are not assigned a value in either column.
is_man = df_orders['Gender'] == "Man"
is_woman = df_orders['Gender'] == "Woman"

for row_mask, flag_column, flag_value in (
    (is_man, 'Man', 1),
    (is_woman, 'Man', 0),
    (is_man, 'Woman', 0),
    (is_woman, 'Woman', 1),
):
    df_orders.loc[row_mask, flag_column] = flag_value

### "Age" 
    -> based on the date of birth

In [None]:
# The datetime module is imported as `dt` at the top of the notebook, so the
# class is reached through that alias (a bare `datetime` is undefined here).
today = dt.datetime.today()

df_orders["DoB"] = pd.to_datetime(df_orders["DoB"], format="%d-%m-%Y")
birth_month = df_orders["DoB"].dt.month
birth_day = df_orders["DoB"].dt.day

# The birthday is still ahead this year when its month is later than the
# current month, or it falls later in the current month. Comparing month and
# day independently would miss e.g. today = 20 March vs. birthday = 10 June,
# and would wrongly subtract a year on the birthday itself.
birthday_pending = (today.month < birth_month) | (
    (today.month == birth_month) & (today.day < birth_day)
)

# Age in whole years: year difference, minus one if the birthday is pending.
df_orders["Age"] = today.year - df_orders["DoB"].dt.year - birthday_pending.astype(int)

### "Order month" and "Order year"
    -> separate the month and the year of an order for easier analysis

In [None]:
# Parse the order date once, then derive the calendar month and year from it.
order_dates = pd.to_datetime(df_orders["Order date"], format="%d-%m-%Y")
df_orders["Order date"] = order_dates
df_orders["Order month"] = order_dates.dt.month
df_orders["Order year"] = order_dates.dt.year

### "DeltaT"
    -> numeric representation of the order date
    -> day 0 is 01-01-2013 and every following day adds 1

In [None]:
# Reference day (day 0) for the numeric date representation.
first_order = pd.to_datetime("01-01-2013", format="%d-%m-%Y")

# Whole days elapsed between the reference day and each order date.
elapsed_since_first = df_orders["Order date"] - first_order
df_orders["DeltaT"] = elapsed_since_first.dt.days

## Current information about the dataset

In [None]:
# Print pandas' summary of df_orders as it stands at this point in the notebook.
df_orders.info()

## Export the cleaned dataset

In [None]:
# Write df_orders to ../orders_cleaned.csv; index=False leaves the row index
# out of the file.
df_orders.to_csv("../orders_cleaned.csv", index=False)

# Exploratory Data Analysis (EDA)

## Gender distribution

In [None]:
# One row per distinct (customer, gender) pair, so that customers with many
# order rows are not counted repeatedly.
unique_customer_df = df_orders[['Customer number', 'Gender']].drop_duplicates()

# Split the de-duplicated customers by gender.
customer_gender = unique_customer_df['Gender']
male_unique_customer_df = unique_customer_df[customer_gender == 'Man']
female_unique_customer_df = unique_customer_df[customer_gender == 'Woman']

In [None]:
# Pie chart of the number of unique male and female customers.
labels = ('Male', 'Female')
sizes = [
    group.shape[0]
    for group in (male_unique_customer_df, female_unique_customer_df)
]

fig1, ax1 = plt.subplots()
ax1.pie(x=sizes, labels=labels, startangle=90, autopct='%1.1f%%')
ax1.axis('equal')  # equal axis scaling so the pie is drawn as a circle

plt.show()

## Age distribution

In [None]:
# One row per distinct (customer, age) pair.
customers_age = df_orders[['Customer number', 'Age']].drop_duplicates()

In [None]:
# Plot the distribution of customer ages on a 10x6 figure.
sns.set_style('darkgrid')
plt.subplots(figsize=(10, 6))

x = customers_age['Age'].rename("Ages")
sns.distplot(x)
plt.title("Distribution of ages")

# NOTE(review): presumably here so the cell does not echo the return value
# of the last plotting call - confirm.
print()

## Orders distribution

In [None]:
# Number of distinct orders placed by each customer, as a DataFrame with one
# row per customer: 'Customer number' holds the customer id and
# 'Order number' holds that customer's distinct-order count.
# (Selecting several columns with a bare tuple on a GroupBy is rejected by
# current pandas, and without an aggregation the result was only a GroupBy
# object rather than per-customer counts.)
orders_per_customer = (
    df_orders.groupby('Customer number')['Order number']
    .nunique()
    .reset_index()
)

In [None]:
# Plot the distribution of the per-customer 'Order number' values.
sns.set_style('darkgrid')
x = pd.Series(orders_per_customer['Order number'], name="Order count")

# figsize must be a (width, height) pair; the former `(10.6)` was a single
# float, not a tuple.
plt.subplots(figsize=(10, 6))
sns.distplot(x)
plt.title("Distribution of orders")

# NOTE(review): presumably here so the cell does not echo the return value
# of the last plotting call - confirm.
print()

## Categories and subcategories

In [None]:
# Group the orders by (category, sub category) and print, for every group,
# the number of unique values in those two columns - in effect a listing of
# the category / sub-category pairs.
category_levels = ['Category', 'Sub category']
df_ordered_categories = df_orders.groupby(category_levels)

unique_per_group = df_ordered_categories[category_levels].nunique()
print(unique_per_group.to_string())

## Money earned per category

In [None]:
# Ask NumPy to print floats without scientific notation.
np.set_printoptions(suppress=True)

# Sum of the 'Price' column within each category.
price_by_category = df_orders.groupby(['Category'])['Price']
money_per_category = price_by_category.sum()
print(money_per_category)

# Churn analysis