# Preparing and cleaning the data

In [1]:
import numpy as np
import pandas as pd
import datetime as dt

In [2]:
orders_path = '../orders.csv'
# Positional indices of the fields that later become 'DoB' and 'Order date'.
parse_dates = [1, 5]

# Explicit dtype for every non-date field, keyed by position because the file
# has no header row. Field 15 is typed as well even though it is dropped below.
column_dtypes = {
    0: object,
    2: 'category',
    3: object,
    4: object,
    6: object,
    7: 'category',
    8: 'category',
    9: np.int64,
    10: np.float64,
    11: np.int64,
    12: np.int64,
    13: object,
    14: np.int64,
    15: object,
}

column_names = [
    'Customer number', 'DoB', 'Gender', 'PoR', 'Order number',
    'Order date', 'Product number', 'Sub category', 'Category',
    'Count', 'Price', 'EDT', 'ADT', 'RoR', 'Rating',
]

# Pipe-delimited, header-less, latin1-encoded file. `parse_dates` alone is
# enough: pandas parses those fields with `pd.to_datetime` by default, so the
# `date_parser` / `infer_datetime_format` arguments (both deprecated since
# pandas 2.0) are not passed.
df_orders = pd.read_csv(
    orders_path,
    header=None,
    sep='|',
    index_col=None,
    encoding='latin1',
    parse_dates=parse_dates,
    dtype=column_dtypes,
)

# Discard the trailing 16th field so that the 15 names below line up.
# NOTE(review): presumably an empty field produced by a trailing '|' -- confirm.
# The frame already carries a fresh default index from `read_csv`, so no
# `reset_index()` call is needed (the previous one discarded its result).
df_orders = df_orders.drop(columns=df_orders.columns[-1])
df_orders.columns = column_names

In [3]:
# Replace the datetime values in both date columns with day-month-year strings.
# Note: this cell is not re-runnable on its own -- after one run the columns
# hold strings and `.dt` raises; re-run the loading cell first.
for date_col in ("DoB", "Order date"):
    df_orders[date_col] = df_orders[date_col].dt.strftime("%d-%m-%Y")

## Null values
- The only null values are in the `RoR` (reason of return) column.
- They are dealt with later, in the Feature engineering section.

In [4]:
# Find the columns that contain at least one missing value, then count the
# missing entries in each of them.
has_missing = df_orders.isna().any()
null_columns = df_orders.columns[has_missing]
df_orders.loc[:, null_columns].isna().sum()

RoR    4303432
dtype: int64

In [5]:
# Number of non-null entries per column; any column whose count is below the
# row total contains missing values.
df_orders.count(axis="index")

Customer number    4523276
DoB                4523276
Gender             4523276
PoR                4523276
Order number       4523276
Order date         4523276
Product number     4523276
Sub category       4523276
Category           4523276
Count              4523276
Price              4523276
EDT                4523276
ADT                4523276
RoR                 219844
Rating             4523276
dtype: int64

# Exporting the clean data

In [6]:
# Write the frame to disk as CSV, leaving out the row index.
cleaned_path = "../orders_cleaned.csv"
df_orders.to_csv(cleaned_path, index=False)