### Train and save prediction model

In [2]:
import glob
import os

import pandas as pd

# Load every file under ./data/return-data/ (JSON Lines: one record per line)
# and stack them into a single frame with a fresh 0..n-1 index.
#
# The paths are sorted on purpose: glob.glob returns matches in arbitrary,
# filesystem-dependent order, and the train/test split further down is
# positional (shuffle=False). Without a stable file order, the rows that end
# up in train vs. test could differ between machines or runs.
data_files = sorted(glob.glob("./data/return-data/*"))
if not data_files:
    # Fail early with an actionable message instead of pandas'
    # "No objects to concatenate" error.
    raise FileNotFoundError("no input files found under ./data/return-data/")

dfList = [pd.read_json(file, lines=True) for file in data_files]
df = pd.concat(dfList).reset_index(drop=True)

In [3]:
# Add six count features, c_0 .. c_5: c_k is the number of times the value k
# occurs in the row's "basket" (computed with the basket's own .count method).
# NOTE(review): assumes each basket is a list-like of integer ids -- confirm.
for item_id in range(6):
    column_name = f"c_{item_id}"
    df[column_name] = df["basket"].map(
        lambda basket, wanted=item_id: basket.count(wanted)
    )

In [4]:
# One-hot encode zipCode against a fixed category list (100..999). Declaring the
# categories up front means get_dummies emits one indicator column per listed
# code, whether or not that code occurs in the loaded data.
zip_categories = list(range(100, 1000))
df['zipCode'] = pd.Categorical(df['zipCode'], categories=zip_categories)
dummies = pd.get_dummies(df['zipCode'])

# Append the indicator columns next to the existing ones, then remove the raw
# columns that are not passed on to the model as-is.
df2 = pd.concat([df, dummies], axis=1)
df3 = df2.drop(columns=["basket", "zipCode", "transactionId"])

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing. shuffle=False keeps the rows in their
# current order, so the split is positional rather than random.
train, test = train_test_split(
    df3,
    test_size=0.3,
    shuffle=False,
    random_state=0,
)

In [6]:
# Separate the target column ("returnLabel") from the feature columns for both
# the training and the test partition.
X_train, y_train = train.drop(columns="returnLabel"), train["returnLabel"]
X_test, y_test = test.drop(columns="returnLabel"), test["returnLabel"]

In [7]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier on the training features/labels, with the
# solver's iteration cap set to 1000. The fit call is left as the cell's last
# expression so the fitted estimator is displayed.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=1000)

In [8]:
import joblib
# Persist the fitted classifier to model/model.pkl.
# The target directory is created first (no-op if it already exists) so the
# dump does not fail on a checkout where "model/" is not present yet.
os.makedirs("model", exist_ok=True)
joblib.dump(logreg, "model/model.pkl")

['model/model.pkl']

### Learn and save imputer

In [9]:
# Preview the first five training rows to see which feature columns exist.
X_train.head(n=5)

Unnamed: 0,totalAmount,c_0,c_1,c_2,c_3,c_4,c_5,100,101,102,...,990,991,992,993,994,995,996,997,998,999
0,120,3,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,440,0,3,2,2,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,954,2,2,1,1,1,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,624,0,0,1,3,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,168,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Mapping from raw input field name to the value stored for it in the cells
# below; it is saved to disk at the end of the notebook.
imputer = dict()

In [11]:
# Store the mean totalAmount of the training rows, then echo the stored value.
imputer.update(totalAmount=X_train["totalAmount"].mean())
print(imputer["totalAmount"])

238.28907834101383


In [12]:
# Store the most frequent zip code, fitted on the training rows only. This
# matches how the totalAmount value is derived (from X_train) and keeps the
# held-out test rows from influencing the stored statistic.
# `train` was split off a frame that shares df's row index, so train.index
# selects exactly the training rows in df.
train_zip_codes = df.loc[train.index, "zipCode"]
imputer["zipCode"] = train_zip_codes.value_counts().idxmax()
print(imputer["zipCode"])

163


In [13]:
# Store an empty list for "basket".
# NOTE(review): presumably the fallback for a record without a basket -- confirm
# against the code that loads imputer.pkl.
imputer["basket"] = list()

In [14]:
# Persist the imputation values to model/imputer.pkl.
# The target directory is created first (no-op if it already exists) so this
# cell also works when "model/" is not present yet.
os.makedirs("model", exist_ok=True)
joblib.dump(imputer, "model/imputer.pkl")

['model/imputer.pkl']