In [108]:
import datetime
import statistics

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.display import display
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression


%matplotlib inline
plt.rcParams["figure.figsize"] = (12,6)

In [109]:
df_og = pd.read_csv('data/BADS_WS2021_known.csv')
# df_uk = pd.read_csv('data/BADS_WS2021_unknown.csv')

X_train, X_test, y_train, y_test = train_test_split(df_og['return'], df_og.drop('return', axis=1), test_size=0.001)
y_test.insert(len(y_test.columns), 'return', X_test)
df = y_test

In [110]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 62630 to 49795
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   order_item_id  100 non-null    int64  
 1   order_date     100 non-null    object 
 2   delivery_date  87 non-null     object 
 3   item_id        100 non-null    int64  
 4   item_size      100 non-null    object 
 5   item_color     100 non-null    object 
 6   brand_id       100 non-null    int64  
 7   item_price     100 non-null    float64
 8   user_id        100 non-null    int64  
 9   user_title     100 non-null    object 
 10  user_dob       91 non-null     object 
 11  user_state     100 non-null    object 
 12  user_reg_date  100 non-null    object 
 13  return         100 non-null    int64  
dtypes: float64(1), int64(5), object(8)
memory usage: 11.7+ KB
None


In [111]:
df.item_size.value_counts()
df.item_color.value_counts()
df.user_title.value_counts()
df.user_state.value_counts()

North Rhine-Westphalia           30
Bavaria                          20
Lower Saxony                     14
Hesse                             6
Saxony                            4
Rhineland-Palatinate              4
Berlin                            3
Baden-Wuerttemberg                3
Hamburg                           3
Thuringia                         2
Schleswig-Holstein                2
Saxony-Anhalt                     2
Bremen                            2
Mecklenburg-Western Pomerania     2
Brandenburg                       2
Saarland                          1
Name: user_state, dtype: int64

In [112]:
df.user_title = df.user_title.astype('category')
df.item_size = df.item_size.astype('category')
df.item_color = df.item_color.astype('category')
df.user_state = df.user_state.astype('category')


df['order_date'] = pd.to_datetime(df['order_date'])
df['delivery_date'] = pd.to_datetime(df['delivery_date'])
df['user_reg_date'] = pd.to_datetime(df['user_reg_date'])
df['user_dob'] = pd.to_datetime(df['user_dob'])

df['return'] = df['return'].astype('bool')
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 62630 to 49795
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   order_item_id  100 non-null    int64         
 1   order_date     100 non-null    datetime64[ns]
 2   delivery_date  87 non-null     datetime64[ns]
 3   item_id        100 non-null    int64         
 4   item_size      100 non-null    category      
 5   item_color     100 non-null    category      
 6   brand_id       100 non-null    int64         
 7   item_price     100 non-null    float64       
 8   user_id        100 non-null    int64         
 9   user_title     100 non-null    category      
 10  user_dob       91 non-null     datetime64[ns]
 11  user_state     100 non-null    category      
 12  user_reg_date  100 non-null    datetime64[ns]
 13  return         100 non-null    bool          
dtypes: bool(1), category(4), datetime64[ns](4), float64(1), int64(4)
mem

In [113]:
# finding missing values

print(df.isnull().sum())
# sns.heatmap(df.isnull(), cbar=False)  # quick visualization of the missing values in our data set

order_item_id     0
order_date        0
delivery_date    13
item_id           0
item_size         0
item_color        0
brand_id          0
item_price        0
user_id           0
user_title        0
user_dob          9
user_state        0
user_reg_date     0
return            0
dtype: int64


In [114]:
# check relation between null values in delivery_date and return value
# assumption: delivery_date=null --> item not delivered --> item was returned before it was delivered

df.loc[df.delivery_date.isnull(), 'return'].value_counts()
# all orders where delivery_date is null (missing), were not returned, assumption was wrong.
# I assume that all those orders were delivered


False    13
Name: return, dtype: int64

In [115]:
print('amount of orders delivered on 1994-12-31: {}'.format(df_og.loc[df_og.delivery_date=='1994-12-31', 'delivery_date'].count()))
print('[delivery_date, amount] of orders where the delivery_date is before the order_date: {}'.format(df_og.loc[df_og.delivery_date<df_og.order_date, 'delivery_date'].value_counts()))

# all delivery_date where the delivery date is 1994-12-31 need to be replaced changed together with the missing values

amount of orders delivered on 1994-12-31: 1072
[delivery_date, amount] of orders where the delivery_date is before the order_date: 1994-12-31    1072
Name: delivery_date, dtype: int64


In [116]:
df.head()

Unnamed: 0,order_item_id,order_date,delivery_date,item_id,item_size,item_color,brand_id,item_price,user_id,user_title,user_dob,user_state,user_reg_date,return
62630,62631,2016-08-10,2016-08-13,250,36,black,7,12.9,15545,Mrs,1901-09-25,Lower Saxony,2015-02-17,False
49177,49178,2016-08-02,2016-09-17,1750,l,olive,30,34.95,39059,Mr,NaT,North Rhine-Westphalia,2016-08-03,False
6466,6467,2016-06-27,2016-06-28,245,38,black,1,59.9,4146,Mrs,1947-04-13,North Rhine-Westphalia,2015-02-17,False
9198,9199,2016-06-27,2016-07-23,137,39,blue,38,54.9,32429,Mr,NaT,North Rhine-Westphalia,2016-06-28,False
1989,1990,2016-06-23,2016-06-26,666,10,brown,19,69.9,31245,Mrs,1962-12-20,Bavaria,2015-02-17,True


In [117]:
# change dates
# calculate delivery_times

In [118]:
# df.insert(3, 'delivery_time', [0]*len(df))

In [119]:
# replace delivery_times with None where delivery_date=='1994-12-31' (negative delivery_time) in order to replace the value with mean
df.loc[df.delivery_date=='1994-12-31', 'delivery_date'] = None

#create new column delivery_time, for null values in delivery_date, the mean delivery_time will be placed
df['delivery_time'] = [(d_date - o_date).days for d_date, o_date in zip(df.delivery_date, df.order_date)]

delivery_time_mean = round(df.delivery_time.mean(skipna=True))
df.delivery_time.fillna(delivery_time_mean, inplace=True)


In [120]:
# check if there is a relation between missing dob and user_title (i.e. companies dont have birth dates)
df['user_title'][df.user_dob.isnull()].value_counts()

# no pattern recognizable

Mrs    7
Mr     2
Name: user_title, dtype: int64

In [121]:
df['user_age'] = [(o_date - dob).days / 365 for o_date, dob in zip(df.order_date, df.user_dob)]
user_age_mean = round(df.user_age.mean(skipna=True))
df.user_age.fillna(user_age_mean)
df.user_age = df.user_age.round()
df.head()


Unnamed: 0,order_item_id,order_date,delivery_date,item_id,item_size,item_color,brand_id,item_price,user_id,user_title,user_dob,user_state,user_reg_date,return,delivery_time,user_age
62630,62631,2016-08-10,2016-08-13,250,36,black,7,12.9,15545,Mrs,1901-09-25,Lower Saxony,2015-02-17,False,3.0,115.0
49177,49178,2016-08-02,2016-09-17,1750,l,olive,30,34.95,39059,Mr,NaT,North Rhine-Westphalia,2016-08-03,False,46.0,
6466,6467,2016-06-27,2016-06-28,245,38,black,1,59.9,4146,Mrs,1947-04-13,North Rhine-Westphalia,2015-02-17,False,1.0,69.0
9198,9199,2016-06-27,2016-07-23,137,39,blue,38,54.9,32429,Mr,NaT,North Rhine-Westphalia,2016-06-28,False,26.0,
1989,1990,2016-06-23,2016-06-26,666,10,brown,19,69.9,31245,Mrs,1962-12-20,Bavaria,2015-02-17,True,3.0,54.0


In [122]:
# check if there are negative price_values
len(df.loc[df.item_price < 0, 'item_price'])

0

In [123]:
# y = df['return']
# X = df.drop(['order_item_id', 'order_date', 'delivery_date', 'user_id', 'user_dob','return'], axis=1)
#
# X_train, X_test, y_train, y_test = train_test_split(X, y)

In [124]:

# estimator = LogisticRegression()
# parameter = {"penalty":["none", "l2"]}
# logit = GridSearchCV(estimator, parameter)
# logit.fit(X_train, y_train)

In [125]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 62630 to 49795
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   order_item_id  100 non-null    int64         
 1   order_date     100 non-null    datetime64[ns]
 2   delivery_date  86 non-null     datetime64[ns]
 3   item_id        100 non-null    int64         
 4   item_size      100 non-null    category      
 5   item_color     100 non-null    category      
 6   brand_id       100 non-null    int64         
 7   item_price     100 non-null    float64       
 8   user_id        100 non-null    int64         
 9   user_title     100 non-null    category      
 10  user_dob       91 non-null     datetime64[ns]
 11  user_state     100 non-null    category      
 12  user_reg_date  100 non-null    datetime64[ns]
 13  return         100 non-null    bool          
 14  delivery_time  100 non-null    float64       
 15  user_age       91

In [139]:
res = pd.crosstab(df_og.item_size, df_og['return'])
res['odds_ratio'] = res[0] / res[1]
res.sort_values('odds_ratio', inplace=True)
display(res.to_stata)


'return        0     1  odds_ratio\nitem_size                        \n4034          0     1    0.000000\n3132          0     1    0.000000\n2+            0     2    0.000000\n80            0     2    0.000000\n84            0     1    0.000000\n12+           0     1    0.000000\n44+           3     7    0.428571\n22           49    80    0.612500\n13           20    31    0.645161\n58            2     3    0.666667\n4032          4     6    0.666667\n36+           9    13    0.692308\n40+          80   107    0.747664\n14            3     4    0.750000\n3632          7     9    0.777778\n48          428   550    0.778182\n46          582   743    0.783311\n23           46    57    0.807018\n6+          199   237    0.839662\n54           11    13    0.846154\n6           203   239    0.849372\n50           68    80    0.850000\n3+            6     7    0.857143\n10          128   149    0.859060\n8           232   270    0.859259\n100           7     8    0.875000\n42+          58    

In [133]:
remap_dict = {
    '4034': '4034, 3132, 2+, 80, 84, 12+',
    '3132': '4034, 3132, 2+, 80, 84, 12+',
    '2+': '4034, 3132, 2+, 80, 84, 12+',
    '80': '4034, 3132, 2+, 80, 84, 12+',
    '84': '4034, 3132, 2+, 80, 84, 12+',
    '12+': '4034, 3132, 2+, 80, 84, 12+',

}

return      0  1  odds_ratio
item_size                   
4034        0  1         0.0
3132        0  1         0.0
2+          0  2         0.0
80          0  2         0.0
84          0  1         0.0
...        .. ..         ...
49          1  0         inf
3432       13  0         inf
105         1  0         inf
3634        1  0         inf
3834        1  0         inf

[102 rows x 3 columns]


l       12347
xl      10979
m       10190
xxl      8966
40       7693
        ...  
3834        1
84          1
4034        1
3634        1
3132        1
Name: item_size, Length: 102, dtype: int64