## Importing the needed libraries

In [1]:
import numpy as np
import pandas as pd
import datetime

## Importing the *orders* data set

In [2]:
# Location of the raw orders CSV, relative to the notebook's working directory
orders_path = "../orders_edited.csv"

DoB - date of birth;

PoR - customer place of residence;

EDT - expected delivery time;

ADT - actual delivery time; 

RoR - reason for return

In [3]:
# Load the orders file with an explicit dtype per column.
# 'DoB' and 'Order date' are deliberately NOT passed to parse_dates here
# (that was way too slow); they are read as plain strings and converted
# further down with pd.to_datetime and an explicit format.
df_orders = pd.read_csv(
    orders_path,
    dtype={
        # identifiers
        'Customer number': 'int64',
        'Order number': 'int64',
        'Product number': 'object',
        # customer attributes
        'Gender': 'object',
        'PoR': 'object',
        # product attributes
        'Sub category': 'object',
        'Category': 'object',
        # order line values
        'Count': 'int16',
        'Price': 'float64',
        # delivery / return / feedback
        'EDT': 'int16',
        'ADT': 'int16',
        'RoR': 'object',
        'Rating': 'int8',
    },
)

In [4]:
# Preview the first 10 rows of the freshly loaded orders data
df_orders.head(10)

Unnamed: 0,Customer number,DoB,Gender,PoR,Order number,Order date,Product number,Sub category,Category,Count,Price,EDT,ADT,RoR,Rating
0,1063418,15-01-1944,Woman,Bloemendaal,1251137,20-12-2014,5146308036084,Garden chairs,Garden furniture,1,1090.82,2,2,,-1
1,1062078,17-01-1934,Man,De Bilt,1658762,03-11-2017,8119187109467,Torches,Garden heating,6,385.92,4,4,,-1
2,1007060,14-01-1947,Man,Doetinchem,106526,08-06-2013,6110364066490,Leaf blowers,Garden reamers,2,173.78,3,-1,No reason given,-1
3,1063418,15-01-1944,Woman,Bloemendaal,1251137,20-12-2014,8172375031575,Hoes,Garden hand tools,4,45.64,5,-1,Article is defect,-1
4,1063290,08-01-1971,Woman,Franekeradeel,439149,06-01-2015,8111132296154,Insects and vermin,Control,1,114.24,1,1,,-1
5,1016474,16-01-1937,Man,Hollands Kroon,294578,19-08-2014,8188604007365,Garden chairs,Garden furniture,1,940.43,4,4,,-1
6,1016152,10-01-1961,Man,Ameland,342406,17-11-2014,8121894263936,Sprinklers,Watering,5,315.2,2,2,,-1
7,1021820,10-01-1963,Man,Vlagtwedde,1590282,11-11-2016,5181402017768,Pruning shears,Pruning,1,103.16,1,1,,4
8,1018918,07-01-1974,Woman,Coevorden,1202222,19-09-2014,3193523049599,Garden sets,Garden furniture,1,1079.03,3,-1,No reason given,4
9,1028820,06-01-1979,Man,Houten,1433754,29-12-2015,7146777181382,Sunshades,Sun protection,5,126.35,3,3,,-1


In [5]:
# Flag returned order lines: 1.0 when a reason of return (RoR) is present,
# 0.0 when RoR is missing. Kept as float to match the rest of the pipeline.
df_orders['Returned'] = df_orders['RoR'].notnull().astype('float64')

In [6]:
# Unit price: the order line's total price divided by the number of items in it
df_orders['Price per Product'] = df_orders['Price'].div(df_orders['Count'])

In [7]:
# Add Man and Woman indicator columns (1.0 / 0.0).
# Any Gender value other than "Man" or "Woman" is left as NaN in both columns.
df_orders['Man'] = df_orders['Gender'].map({"Man": 1.0, "Woman": 0.0})
df_orders['Woman'] = df_orders['Gender'].map({"Man": 0.0, "Woman": 1.0})

In [8]:
# List the distinct return reasons: the frame holds only the 'RoR' column, so
# after grouping on it there are no value columns left to sum and the result
# is an empty frame whose index is the set of reasons (missing RoR is dropped).
df_orders[['RoR']].groupby('RoR').sum()

Article is defect
Article seems different than online
Damaged package
Delivery took to long
Disapointing quality
Doesn't meet expectations
Don't like the article
Manufacturing error
No reason given
Ordered the wrong article by accident
Wrong or missing article


In [9]:
# Calculate age in whole years as of the day the notebook is run.
# NOTE: the result depends on the run date, so re-running later shifts ages.
today = datetime.datetime.today()
df_orders["DoB"] = pd.to_datetime(df_orders["DoB"], format="%d-%m-%Y")

dob_month = df_orders["DoB"].dt.month
dob_day = df_orders["DoB"].dt.day

# True where this year's birthday is still ahead of today: either the birth
# month is later than the current month, or it is the current month and the
# birth day is later than today's day. A birthday that falls today counts as
# already reached. (The previous test, month <= AND day <=, missed later-month
# birthdays with a smaller day number and wrongly subtracted a year on the
# birthday itself.)
birthday_pending = (today.month < dob_month) | (
    (today.month == dob_month) & (today.day < dob_day)
)

# Subtracting the boolean mask removes one year (True == 1) where the birthday
# has not happened yet this year.
df_orders["Age"] = today.year - df_orders["DoB"].dt.year - birthday_pending

In [10]:
# Parse the day-month-year order date strings once, then expose the month and
# year of each order as separate columns.
order_date = pd.to_datetime(df_orders["Order date"], format="%d-%m-%Y")
df_orders["Order date"] = order_date
df_orders["Order month"] = order_date.dt.month
df_orders["Order year"] = order_date.dt.year

In [11]:
# DeltaT: whole days elapsed between the reference date (1 January 2013) and
# each order's date.
first_order = pd.to_datetime("01-01-2013", format="%d-%m-%Y")
days_since_first = df_orders["Order date"].sub(first_order)
df_orders["DeltaT"] = days_since_first.dt.days

In [12]:
# Preview the first 10 rows again to inspect the newly derived columns
df_orders.head(10)

Unnamed: 0,Customer number,DoB,Gender,PoR,Order number,Order date,Product number,Sub category,Category,Count,...,RoR,Rating,Returned,Price per Product,Man,Woman,Age,Order month,Order year,DeltaT
0,1063418,1944-01-15,Woman,Bloemendaal,1251137,2014-12-20,5146308036084,Garden chairs,Garden furniture,1,...,,-1,0.0,1090.82,0.0,1.0,74,12,2014,718
1,1062078,1934-01-17,Man,De Bilt,1658762,2017-11-03,8119187109467,Torches,Garden heating,6,...,,-1,0.0,64.32,1.0,0.0,84,11,2017,1767
2,1007060,1947-01-14,Man,Doetinchem,106526,2013-06-08,6110364066490,Leaf blowers,Garden reamers,2,...,No reason given,-1,1.0,86.89,1.0,0.0,71,6,2013,158
3,1063418,1944-01-15,Woman,Bloemendaal,1251137,2014-12-20,8172375031575,Hoes,Garden hand tools,4,...,Article is defect,-1,1.0,11.41,0.0,1.0,74,12,2014,718
4,1063290,1971-01-08,Woman,Franekeradeel,439149,2015-01-06,8111132296154,Insects and vermin,Control,1,...,,-1,0.0,114.24,0.0,1.0,47,1,2015,735
5,1016474,1937-01-16,Man,Hollands Kroon,294578,2014-08-19,8188604007365,Garden chairs,Garden furniture,1,...,,-1,0.0,940.43,1.0,0.0,81,8,2014,595
6,1016152,1961-01-10,Man,Ameland,342406,2014-11-17,8121894263936,Sprinklers,Watering,5,...,,-1,0.0,63.04,1.0,0.0,57,11,2014,685
7,1021820,1963-01-10,Man,Vlagtwedde,1590282,2016-11-11,5181402017768,Pruning shears,Pruning,1,...,,4,0.0,103.16,1.0,0.0,55,11,2016,1410
8,1018918,1974-01-07,Woman,Coevorden,1202222,2014-09-19,3193523049599,Garden sets,Garden furniture,1,...,No reason given,4,1.0,1079.03,0.0,1.0,44,9,2014,626
9,1028820,1979-01-06,Man,Houten,1433754,2015-12-29,7146777181382,Sunshades,Sun protection,5,...,,-1,0.0,25.27,1.0,0.0,39,12,2015,1092


In [13]:
# Summarise column dtypes and memory usage of the enriched frame
df_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4523276 entries, 0 to 4523275
Data columns (total 23 columns):
Customer number      int64
DoB                  datetime64[ns]
Gender               object
PoR                  object
Order number         int64
Order date           datetime64[ns]
Product number       object
Sub category         object
Category             object
Count                int16
Price                float64
EDT                  int16
ADT                  int16
RoR                  object
Rating               int8
Returned             float64
Price per Product    float64
Man                  float64
Woman                float64
Age                  int64
Order month          int64
Order year           int64
DeltaT               int64
dtypes: datetime64[ns](2), float64(5), int16(3), int64(6), int8(1), object(6)
memory usage: 685.9+ MB


In [14]:
# Persist the enriched frame next to the input file; index=False keeps the
# RangeIndex out of the CSV so no extra unnamed column appears on reload.
df_orders.to_csv("../orders_after_cleaning.csv", index=False)