In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

### Load data

In [3]:
# Read the semicolon-delimited train and test splits, then report each frame's (rows, columns).
df_train, df_test = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in ("data/train.csv", "data/test.csv")
)
print(df_train.shape, df_test.shape, sep="\n")

(22400, 5)
(9600, 5)


In [4]:
# Inspect the column dtypes of the raw training frame; `basket` and `customerType`
# are reported as `object` and are handled as text by the feature-engineering cell.
df_train.dtypes

transactionId      int64
basket            object
customerType      object
totalAmount      float64
returnLabel        int64
dtype: object

In [5]:
# Preview the first rows; `basket` is a bracketed, comma-separated list of item ids stored as text.
df_train.head()

Unnamed: 0,transactionId,basket,customerType,totalAmount,returnLabel
0,9534310106,"[4, 3, 4]",new,252.0,1
1,7202594767,"[4, 2, 0, 2, 5]",existing,70.0,0
2,2737331698,[5],existing,84.0,0
3,4868011733,"[1, 4, 2, 4]",existing,116.0,0
4,7622406570,"[2, 5, 3, 2, 3, 2, 0]",existing,378.0,0


In [68]:
# NOTE(review): this repeats the earlier `df_train.dtypes` check, and its execution
# count (68) is out of sequence with the surrounding cells. Consider removing it so the
# notebook reads, and re-runs, strictly top to bottom.
df_train.dtypes

transactionId      int64
basket            object
customerType      object
totalAmount      float64
returnLabel        int64
dtype: object

### Drop rows with missing data

In [8]:
# Discard every row that contains at least one missing value (rows are dropped, not imputed).
# The name `df_train` is rebound, so the raw frame is no longer reachable after this cell.
df_train = df_train.dropna()

In [9]:
# Sanity check: per-column count of remaining missing values (expected to be all zeros after dropna).
df_train.isna().sum()

transactionId    0
basket           0
customerType     0
totalAmount      0
returnLabel      0
dtype: int64

### Transform features

In [12]:
# One-hot encode the customer type: one indicator column per category present in the training data.
one_hot_ct = pd.get_dummies(df_train["customerType"], prefix="customerType")
# `basket` stores the item list as text (e.g. "[4, 2, 0, 2, 5]"). Split it into whole-number
# tokens first so each item id is matched exactly; counting raw substrings would also match
# digits inside multi-digit ids (an item "10" would be counted as both a "1" and a "0").
basket_tokens = df_train["basket"].str.findall(r"\d+")
# Build c_0 .. c_5: how many times each item id (0-5) occurs in the basket.
# `key` is bound as a default argument so each lambda keeps its own item id.
item_counts = pd.DataFrame(
    {
        f"c_{item_id}": basket_tokens.map(lambda tokens, key=str(item_id): tokens.count(key))
        for item_id in range(6)
    },
    index=df_train.index,
)
# Original columns first, then the customer-type indicators, then the per-item counts.
df_train_all = pd.concat([df_train, one_hot_ct, item_counts], axis=1)
df_train_all.head(3)

Unnamed: 0,transactionId,basket,customerType,totalAmount,returnLabel,customerType_existing,customerType_new,c_0,c_1,c_2,c_3,c_4,c_5
0,9534310106,"[4, 3, 4]",new,252.0,1,0,1,0,0,0,1,2,0
1,7202594767,"[4, 2, 0, 2, 5]",existing,70.0,0,1,0,1,0,2,0,1,1
2,2737331698,[5],existing,84.0,0,1,0,0,0,0,0,0,1


In [13]:
# Model inputs: the order amount, the two customer-type indicators and the six item counts c_0..c_5.
X_train = df_train_all[
    [
        "totalAmount",
        "customerType_existing",
        "customerType_new",
        *(f"c_{item_id}" for item_id in range(6)),
    ]
]
# Target column: `returnLabel`.
y_train = df_train_all["returnLabel"]

### Model training

In [15]:
# Gradient-boosted trees; only `random_state` is set (fixed to 0 so reruns use the same seed).
gbt = GradientBoostingClassifier(random_state=0)
# Fit on the training features; as the cell's last expression, the estimator repr is displayed.
gbt.fit(X_train, y_train)

GradientBoostingClassifier(random_state=0)

In [17]:
# Logistic regression using the liblinear solver, allowing up to 1000 iterations.
logReg = LogisticRegression(max_iter=1000, solver="liblinear")
# Fit on the training features; as the cell's last expression, the estimator repr is displayed.
logReg.fit(X_train, y_train)

LogisticRegression(max_iter=1000, solver='liblinear')

In [18]:
# Random forest; only `random_state` is set (fixed to 0 so reruns use the same seed).
rf = RandomForestClassifier(random_state=0)
# Fit on the training features; as the cell's last expression, the estimator repr is displayed.
rf.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

### Evaluation

In [24]:
# Apply the same preparation as for the training split: drop incomplete rows first.
df_test = df_test.dropna()
# One-hot encode the customer type, then pin the result to the two indicator columns the
# feature matrix selects. Encoding the test split on its own only yields columns for the
# categories that happen to occur in it, so a missing category would otherwise surface
# later as a KeyError; absent indicators are filled with 0.
one_hot_ct = pd.get_dummies(df_test["customerType"], prefix="customerType").reindex(
    columns=["customerType_existing", "customerType_new"], fill_value=0
)
# `basket` stores the item list as text (e.g. "[3, 4, 4, 3, 5]"). Split it into whole-number
# tokens first so each item id is matched exactly; counting raw substrings would also match
# digits inside multi-digit ids (an item "10" would be counted as both a "1" and a "0").
basket_tokens = df_test["basket"].str.findall(r"\d+")
# Build c_0 .. c_5: how many times each item id (0-5) occurs in the basket.
# `key` is bound as a default argument so each lambda keeps its own item id.
item_counts = pd.DataFrame(
    {
        f"c_{item_id}": basket_tokens.map(lambda tokens, key=str(item_id): tokens.count(key))
        for item_id in range(6)
    },
    index=df_test.index,
)
# Original columns first, then the customer-type indicators, then the per-item counts.
df_test_all = pd.concat([df_test, one_hot_ct, item_counts], axis=1)
df_test_all.head(3)

Unnamed: 0,transactionId,basket,customerType,totalAmount,returnLabel,customerType_existing,customerType_new,c_0,c_1,c_2,c_3,c_4,c_5
0,4132523932,"[4, 3, 4, 3, 2, 3]",existing,366.0,1,1,0,0,0,1,3,2,0
1,8998574539,"[3, 4, 4, 3, 5]",existing,85.0,0,1,0,0,0,0,2,2,1
2,9346688547,"[1, 1, 2, 2, 4, 4, 3, 1, 1, 0, 3]",existing,275.0,0,1,0,1,4,2,2,2,0


In [25]:
# Model inputs for the held-out split, using the same columns and order as the training features.
X_test = df_test_all[
    [
        "totalAmount",
        "customerType_existing",
        "customerType_new",
        *(f"c_{item_id}" for item_id in range(6)),
    ]
]
# Ground-truth labels used for scoring.
y_test = df_test_all["returnLabel"]

In [30]:
def evaluate(model, X, y):
    """Print accuracy, recall and precision of ``model`` on ``(X, y)``.
    Parameters
    ----------
    model : fitted estimator exposing ``predict``; its repr is used in the report header.
    X : feature matrix passed to ``model.predict``.
    y : true labels that the predictions are scored against.
    Returns
    -------
    None. The report is written to stdout only.
    """
    y_pred = model.predict(X)
    print(f" == Performance {model} == ")
    # (label, scorer) pairs, reported in this order; each score is printed as soon as it is computed.
    report_rows = (
        ("Accuracy score is:", accuracy_score),
        ("Recall is:", recall_score),
        ("Precision is:", precision_score),
    )
    for label, scorer in report_rows:
        print(label + str(scorer(y, y_pred)))
    # Blank spacer between consecutive model reports.
    print("\n")

In [31]:
# Held-out performance of each fitted model, reported in the same order as before.
for fitted_model in (logReg, gbt, rf):
    evaluate(fitted_model, X_test, y_test)

 == Performance LogisticRegression(max_iter=1000, solver='liblinear') == 
Accuracy score is:0.8729676999783221
Recall is:0.7400145243282498
Precision is:0.8171611868484362


 == Performance GradientBoostingClassifier(random_state=0) == 
Accuracy score is:0.8717754172989378
Recall is:0.7549019607843137
Precision is:0.803633552377271


 == Performance RandomForestClassifier(random_state=0) == 
Accuracy score is:0.85020593973553
Recall is:0.7389251997095134
Precision is:0.7542624166048925




In [32]:
# Training-set performance of each fitted model, for comparison with the held-out scores
# (a large gap between the two indicates overfitting).
for fitted_model in (logReg, gbt, rf):
    evaluate(fitted_model, X_train, y_train)

 == Performance LogisticRegression(max_iter=1000, solver='liblinear') == 
Accuracy score is:0.8791883842144452
Recall is:0.7560632138945392
Precision is:0.8233089112284887


 == Performance GradientBoostingClassifier(random_state=0) == 
Accuracy score is:0.8832836932241251
Recall is:0.7768737286809576
Precision is:0.8210682983297503


 == Performance RandomForestClassifier(random_state=0) == 
Accuracy score is:0.9631887565152644
Recall is:0.9244249726177437
Precision is:0.9504504504504504


