In [130]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [131]:
df = pd.read_csv("train.csv", parse_dates=["date"])
df.drop("id", axis=1, inplace=True)

In [132]:
df = df[df["num_sold"].notna()]
df = pd.get_dummies(df, dtype=int)
df["date"] = (df["date"] - df["date"].min()).dt.days
df["date"] = (df["date"] - df["date"].mean())/df["date"].std()

In [133]:
from sklearn.preprocessing import StandardScaler
original_num_sold = df["num_sold"].copy()
df["num_sold"] = np.log(df["num_sold"])

y_scaler = StandardScaler()
df["num_sold"] = y_scaler.fit_transform(df[["num_sold"]])

In [134]:
# train test split
train = df.sample(frac=0.8, random_state=42)
test = df.drop(train.index)
X_train, y_train = train.drop('num_sold', axis=1), train['num_sold']
X_test, y_test = test.drop('num_sold', axis=1), test['num_sold']

In [135]:
df["num_sold"].describe()

count    2.212590e+05
mean     4.495907e-17
std      1.000002e+00
min     -2.800837e+00
25%     -3.481272e-01
50%      3.112851e-01
75%      7.074451e-01
max      1.793480e+00
Name: num_sold, dtype: float64

In [139]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

random_grid = {
    'bootstrap': [True, False]
    }

model = RandomForestRegressor(max_depth=40, min_samples_split=10)
random_search = RandomizedSearchCV(estimator=model, param_distributions=random_grid, n_iter=100, cv=3, verbose=2,
                                    random_state=42, scoring="neg_mean_absolute_percentage_error")
random_search.fit(X_train, y_train)
#model.fit(X_train, y_train)



Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] END .....................................bootstrap=True; total time=  17.7s
[CV] END .....................................bootstrap=True; total time=  17.6s
[CV] END .....................................bootstrap=True; total time=  17.6s
[CV] END ....................................bootstrap=False; total time=  27.3s
[CV] END ....................................bootstrap=False; total time=  27.3s
[CV] END ....................................bootstrap=False; total time=  27.4s


In [140]:
random_search.best_params_

{'bootstrap': True}

In [141]:
from sklearn.metrics import mean_absolute_percentage_error as mape
def reverse_transforms(data, scaler, func):
    return func(scaler.inverse_transform(data.reshape(-1, 1)))

y_pred = random_search.predict(X_test)
y_pred = reverse_transforms(y_pred, y_scaler, np.exp)
y_pred[:5]

array([[1830.23271624],
       [2189.37885953],
       [ 922.49519422],
       [ 813.61390991],
       [ 469.40488554]])

In [144]:
y_test = y_test.to_numpy()
y_test = reverse_transforms(y_test, y_scaler, np.exp)
y_test[:5]

array([[1837.],
       [2212.],
       [ 926.],
       [ 774.],
       [ 450.]])

In [145]:
mape(y_test, y_pred)

0.07801647329602823

In [146]:
np.mean((y_pred - y_test)/y_test) * 100

0.47934824015148525

In [114]:
y_test[:5]

array([[61835168.09795655],
       [82329880.80401753],
       [21517404.56601407],
       [16322697.65587727],
       [ 7076882.21254428]])

In [110]:
y_pred[:5]

array([[1847.39085984],
       [2200.96354967],
       [ 938.05238898],
       [ 820.92087542],
       [ 468.75955999]])

In [147]:
test_df = pd.read_csv('test.csv', parse_dates=["date"])
test_df.head()

Unnamed: 0,id,date,country,store,product
0,230130,2017-01-01,Canada,Discount Stickers,Holographic Goose
1,230131,2017-01-01,Canada,Discount Stickers,Kaggle
2,230132,2017-01-01,Canada,Discount Stickers,Kaggle Tiers
3,230133,2017-01-01,Canada,Discount Stickers,Kerneler
4,230134,2017-01-01,Canada,Discount Stickers,Kerneler Dark Mode


In [148]:
test_df = pd.get_dummies(test_df, dtype=int)
test_df["date"] = (test_df["date"] - test_df["date"].min()).dt.days
test_df["date"] = (test_df["date"] - test_df["date"].mean())/test_df["date"].std()

In [149]:
final_preds = random_search.predict(test_df.drop(columns=["id"]))

In [150]:
final_preds = reverse_transforms(final_preds, y_scaler, np.exp)
submission_df = pd.DataFrame({'id': test_df['id'], 'num_sold': final_preds.flatten()})
submission_df.head()

Unnamed: 0,id,num_sold
0,230130,117.405322
1,230131,725.026548
2,230132,618.667847
3,230133,329.154508
4,230134,372.43895


In [151]:
submission_df.to_csv("submission.csv", index=False)