In [4]:
import os
import pickle

import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw CSVs; the `id` column becomes the index so it is never used as a feature.
test = pd.read_csv('test.csv', index_col='id')
train = pd.read_csv('train.csv', index_col='id')

# Integer codes for each categorical column.
country_types = {'Argentina': 0, 'Canada': 1, 'Estonia': 2, 'Japan': 3, 'Spain': 4}
store_types = {'Kaggle Learn': 0, 'Kaggle Store': 1, 'Kagglazon': 2}
product_types = {'Using LLMs to Improve Your Coding': 0, 'Using LLMs to Train More LLMs': 1, 'Using LLMs to Win Friends and Influence People': 2, 'Using LLMs to Win More Kaggle Competitions': 3, 'Using LLMs to Write Better': 4}

# Column -> code table, and column -> divisor used to scale that feature.
# The categorical divisors equal the largest code in each table; month and
# day are divided by 12 and 31.
category_codes = {'country': country_types, 'store': store_types, 'product': product_types}
feature_scales = {'country': 4, 'store': 2, 'product': 4, 'month': 12, 'day': 31}
# Divisor applied to the target; predictions must be multiplied by it again.
NUM_SOLD_SCALE = 1500


def preprocess(frame):
    """Return an encoded, date-expanded and scaled copy of `frame`.

    The same steps are applied to the training and the test frame so the two
    can never drift apart:

    1. map `country`, `store` and `product` to their integer codes,
    2. replace `date` with `month` and `day` columns,
    3. divide every feature column by its entry in `feature_scales`.

    The input frame is not modified. A `num_sold` column, if present, is left
    untouched.

    Raises
    ------
    ValueError
        If a categorical column contains a value with no code.
    """
    out = frame.copy()

    for column, codes in category_codes.items():
        encoded = out[column].map(codes)
        missing = encoded.isna()
        if missing.any():
            # `.map` yields NaN for unmapped values; fail loudly instead of
            # letting NaN features reach the models.
            unknown = sorted(set(out.loc[missing, column].astype(str)))
            raise ValueError(f"Unknown {column} value(s): {unknown}")
        out[column] = encoded.astype('int64')

    dates = pd.to_datetime(out['date'])
    out['month'] = dates.dt.month
    out['day'] = dates.dt.day
    out = out.drop(columns='date')

    for column, scale in feature_scales.items():
        out[column] = out[column] / scale

    return out


train = preprocess(train)
test = preprocess(test)

# Only the training frame carries the target.
train['num_sold'] = train['num_sold'] / NUM_SOLD_SCALE

features = ['country', 'store', 'product', 'month', 'day']
target = 'num_sold'

X_train = train[features]
y_train = train[target]

# Hold out 40% of the labelled rows for model comparison.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.4, random_state=42)

print(train.head())
print(test.head())

    country  store  product  num_sold     month       day
id                                                       
0       0.0    0.0     0.00  0.042000  0.083333  0.032258
1       0.0    0.0     0.25  0.044000  0.083333  0.032258
2       0.0    0.0     0.50  0.006000  0.083333  0.032258
3       0.0    0.0     0.75  0.039333  0.083333  0.032258
4       0.0    0.0     1.00  0.032667  0.083333  0.032258
        country  store  product     month       day
id                                                 
136950      0.0    0.0     0.00  0.083333  0.032258
136951      0.0    0.0     0.25  0.083333  0.032258
136952      0.0    0.0     0.50  0.083333  0.032258
136953      0.0    0.0     0.75  0.083333  0.032258
136954      0.0    0.0     1.00  0.083333  0.032258


In [5]:
# https://github.com/beanbeah/ML/blob/main/sklearn-ml-bruteforce.py

from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import LassoLarsIC, GammaRegressor, TweedieRegressor, BayesianRidge, ARDRegression,  LinearRegression, Ridge, RidgeCV, SGDRegressor, ElasticNet, RANSACRegressor, TheilSenRegressor, PoissonRegressor, PassiveAggressiveRegressor, OrthogonalMatchingPursuit
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

import warnings
from sklearn.exceptions import DataConversionWarning
# Silence sklearn's DataConversionWarning during the sweep.
warnings.filterwarnings("ignore", category=DataConversionWarning)


def _standardized(estimator):
    """Wrap `estimator` in a pipeline that applies StandardScaler first."""
    return make_pipeline(StandardScaler(), estimator)


# Candidate models keyed by display name. The key doubles as the pickle file
# name in `test_regressor`, and insertion order is the training order.
regressors = {
    # --- boosting / bagging / forest ensembles ---
    "AdaBoost (square)": AdaBoostRegressor(loss="square", n_estimators=100, random_state=0),
    "AdaBoost (linear)": AdaBoostRegressor(loss="linear", n_estimators=100, random_state=0),
    "Adaboost (exponential)": AdaBoostRegressor(loss="exponential", n_estimators=100, random_state=0),
    "Bagging": BaggingRegressor(n_estimators=10, random_state=0),
    "ExtraTrees (sq err)": ExtraTreesRegressor(criterion="squared_error", n_estimators=100, random_state=0),
    "GradientBoosting (huber)": GradientBoostingRegressor(loss="huber", random_state=0),
    "GradientBoosting (sq err)": GradientBoostingRegressor(loss="squared_error", random_state=0),
    "GradientBoosting (abs err)": GradientBoostingRegressor(loss="absolute_error", random_state=0),
    "Random Forest (sq err)": RandomForestRegressor(criterion="squared_error", max_depth=2, random_state=0),
    "Random Forest (poisson)": RandomForestRegressor(criterion="poisson", max_depth=2, random_state=0),
    "HistGradientBoosting (sq err)": HistGradientBoostingRegressor(loss="squared_error"),
    "HistGradientBoosting (abs err)": HistGradientBoostingRegressor(loss="absolute_error"),
    "HistGradientBoosting (poisson)": HistGradientBoostingRegressor(loss="poisson"),

    # --- linear models ---
    "Linear": LinearRegression(),
    "Ridge (Linear)": Ridge(),
    "RidgeCV": RidgeCV(),
    "SGDRegressor (elasticnet)": _standardized(SGDRegressor(penalty="elasticnet", max_iter=1000, tol=1e-3)),
    "SGDRegressor (l2)": _standardized(SGDRegressor(penalty="l2", max_iter=1000, tol=1e-3)),
    "SGDRegressor (l1)": _standardized(SGDRegressor(penalty="l1", max_iter=1000, tol=1e-3)),
    "Elastic Net (random)": ElasticNet(selection="random", random_state=0),
    "Elastic Net (cyclic)": ElasticNet(selection="cyclic", random_state=0),
    "ARD": ARDRegression(),
    "BayesianRidge": BayesianRidge(),
    "RANSAC": RANSACRegressor(random_state=0),
    "TheilSenRegressor": TheilSenRegressor(random_state=0),

    # --- generalized linear models ---
    "PoissonRegressor": PoissonRegressor(),
    "TweedieRegressor (auto)": TweedieRegressor(link="auto"),
    "TweedieRegressor (identity)": TweedieRegressor(link="identity"),
    "TweedieRegressor (log)": TweedieRegressor(link="log"),
    "GammaRegressor": GammaRegressor(),

    # --- online / passive-aggressive ---
    "PassiveAggressiveRegressor (epsilon_insensitive)": PassiveAggressiveRegressor(
        loss="epsilon_insensitive", max_iter=100, tol=1e-3, random_state=0),
    "PassiveAggressiveRegressor (squared_epsilon_insensitive)": PassiveAggressiveRegressor(
        loss="squared_epsilon_insensitive", max_iter=100, tol=1e-3, random_state=0),

    # --- neighbours, neural network, single trees ---
    "KNeighbors": KNeighborsRegressor(n_neighbors=3),
    "Radius Neighbors": RadiusNeighborsRegressor(radius=1.0),
    "MLP": MLPRegressor(max_iter=500, random_state=1),
    "DecisionTree": DecisionTreeRegressor(random_state=0),
    "Extra Tree": ExtraTreeRegressor(random_state=0),

    # --- support vector machines (scaled inputs) ---
    "Linear SVR (epsilon_insensitive)": _standardized(
        LinearSVR(loss="epsilon_insensitive", tol=1e-5, random_state=0)),
    "Linear SVR (squared_epsilon_insensitive)": _standardized(
        LinearSVR(loss="squared_epsilon_insensitive", tol=1e-5, random_state=0)),
    "SVR": _standardized(SVR(C=1.0, epsilon=0.2)),

    # --- remaining linear / decomposition models ---
    "LassoLarsIC (bic)": LassoLarsIC(criterion="bic", normalize=False),
    "LassoLarsIC (aic)": LassoLarsIC(criterion="aic", normalize=False),
    "PLS": PLSRegression(n_components=2),
    "OrthogonalMatchingPursuit": OrthogonalMatchingPursuit(),
}

def test_regressor(regressor_type, X_train, X_test, y_train, y_test):
    """Fit one entry of `regressors`, score it on the hold-out data and pickle it.

    Parameters
    ----------
    regressor_type : str
        Key into the module-level `regressors` dict; also used as the file
        name of the pickled model under ``trainedmodels/``.
    X_train, y_train
        Features and target passed to ``fit``.
    X_test, y_test
        Features and target used for ``predict`` and the error metric.

    Returns
    -------
    list
        ``[regressor_type, err]`` where ``err`` is the mean absolute error
        multiplied by 100.

    Notes
    -----
    The estimator stored in `regressors` is fitted in place, so the dict
    entry is the trained model afterwards.
    """
    print("Regressor", regressor_type, "training..")
    reg = regressors[regressor_type]
    reg.fit(X_train, y_train)
    y_predicted = reg.predict(X_test)
    err = metrics.mean_absolute_error(y_test, y_predicted) * 100

    # Create the output directory on demand so a missing folder does not
    # throw away a finished fit.
    os.makedirs('trainedmodels', exist_ok=True)
    # Context manager guarantees the pickle is flushed and the handle closed.
    with open(f'trainedmodels/{regressor_type}.pkl', 'wb') as model_file:
        pickle.dump(reg, model_file)

    print("Regressor", regressor_type, "error is", err)
    return [regressor_type, err]

def test_regressors(X_train, X_test, y_train, y_test):
    """Train and score every entry of `regressors` and return a ranked report.

    Each model is run through `test_regressor` in dict order. The results are
    then sorted by error, smallest first; ties keep their training order.

    Returns
    -------
    str
        A multi-line report: a rule line, a heading, one
        ``"NN name error%"`` line per model, and a closing rule line.
    """
    results = [
        test_regressor(key, X_train, X_test, y_train, y_test) for key in regressors]

    # Each result is [name, error]; sorted() is stable.
    ranked = sorted(results, key=lambda result: result[1])

    rule = "---" * 20
    lines = [rule, "Results (smaller error better): "]
    lines.extend(
        f"{rank:02d} {name} {err:.2f}%"
        for rank, (name, err) in enumerate(ranked, start=1))
    lines.append(rule)
    return "\n".join(lines)

# Run the full sweep, then store the ranked report in the same folder as the
# pickled models.
output = test_regressors(X_train, X_test, y_train, y_test)
with open("trainedmodels/results.txt", "w") as results_file:
    results_file.write(output)

Regressor AdaBoost (square) training..
Regressor AdaBoost (square) error is 4.985049226420246
Regressor AdaBoost (linear) training..
Regressor AdaBoost (linear) error is 7.3411327706182155
Regressor Adaboost (exponential) training..
Regressor Adaboost (exponential) error is 8.992186745887498
Regressor Bagging training..
Regressor Bagging error is 1.4153253616696202
Regressor ExtraTrees (sq err) training..
Regressor ExtraTrees (sq err) error is 1.475516546184739
Regressor GradientBoosting (huber) training..
Regressor GradientBoosting (huber) error is 1.7817152356747517
Regressor GradientBoosting (sq err) training..
Regressor GradientBoosting (sq err) error is 1.807616700783599
Regressor GradientBoosting (abs err) training..
Regressor GradientBoosting (abs err) error is 1.8423264157616404
Regressor Random Forest (sq err) training..
Regressor Random Forest (sq err) error is 4.905215613798217
Regressor Random Forest (poisson) training..
Regressor Random Forest (poisson) error is 4.90521561



Regressor Linear SVR (epsilon_insensitive) error is 6.430908211784199
Regressor Linear SVR (squared_epsilon_insensitive) training..
Regressor Linear SVR (squared_epsilon_insensitive) error is 6.764375284714902
Regressor SVR training..
Regressor SVR error is 8.284432011219618
Regressor LassoLarsIC (bic) training..
Regressor LassoLarsIC (bic) error is 6.764388780914205
Regressor LassoLarsIC (aic) training..
Regressor LassoLarsIC (aic) error is 6.764388780914205
Regressor PLS training..
Regressor PLS error is 6.764382193587918
Regressor OrthogonalMatchingPursuit training..
Regressor OrthogonalMatchingPursuit error is 6.918958285651764




In [3]:
# NOTE: pickle.load can execute arbitrary code. Only load model files that
# this notebook wrote itself.
with open('trainedmodels/HistGradientBoosting (poisson).pkl', 'rb') as f:
    model = pickle.load(f)

# Select the feature columns explicitly so the prediction does not depend on
# `test` containing exactly the training columns. Multiplying by 1500 undoes
# the scaling applied to `num_sold` before training.
test_results = model.predict(test[features]) * 1500

output = pd.DataFrame({'id': test.index, 'num_sold': test_results})

# Round to the nearest unit before casting: a bare astype('int') truncates
# toward zero and would bias every forecast downward.
output['num_sold'] = output['num_sold'].round().astype('int')

output.to_csv('submission3.csv', index=False)