In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook.
pd.set_option("mode.chained_assignment", None)

In [2]:
# Load the sample submission and index it by its "ID" column.
sample_submission = pd.read_csv("data/sample_submission.csv").set_index("ID")

# Load the cleaned training data.
train = pd.read_csv("train_cleaned.csv")

# Split on date_block_num: rows carrying the largest value form the held-out
# test split, rows with a strictly smaller value are used for fitting.
# The cutoff is computed once and reused for both masks.
last_block = train["date_block_num"].max()
train_rows = train.loc[train["date_block_num"] < last_block]
test_rows = train.loc[train["date_block_num"] == last_block]

# item_cnt_day is the target; date_block_num was only needed for the split.
# Neither belongs in the feature matrices. Selecting / drop() return new
# objects, so the slices of `train` are never mutated in place (the in-place
# pop/drop on slices is what triggers SettingWithCopyWarning).
non_feature_cols = ["item_cnt_day", "date_block_num"]
train_y = train_rows["item_cnt_day"]
test_y = test_rows["item_cnt_day"]
train_X = train_rows.drop(columns=non_feature_cols)
test_X = test_rows.drop(columns=non_feature_cols)

# Load the rows we want to predict for the submission.
# NOTE(review): this frame is passed to the model as a bare array further down,
# so its columns presumably have to match train_X in number and order - confirm.
pred_data = pd.read_csv("test_cleaned.csv")

In [3]:
# Ordinary least-squares linear regression is the estimator used for all predictions.
model = LinearRegression()
# Fit on plain NumPy arrays (features, then targets) taken from the training split.
# Left as the last expression so the cell still displays the fitted estimator.
model.fit(X=train_X.values, y=train_y.values)

LinearRegression()

In [4]:

# Evaluate the fitted model on the held-out split.

# True when we want to round the predictions to integers
round_predictions = False

# The model was fitted on a plain array (train_X.values), so predict on a plain
# array as well; passing the DataFrame makes scikit-learn warn that X has
# feature names the estimator was not fitted with.
test_predictions = model.predict(test_X.values)

if round_predictions:
    test_predictions = test_predictions.round()
# absolute difference between predictions and real sales
err = np.absolute(test_predictions - test_y.values)
# mean of the absolute differences, i.e. the mean absolute error
# (kept under the name std_err and printed under the label "Model accuracy")
std_err = err.mean()
# root mean squared error, computed as sqrt(MSE): the `squared=False` argument
# of mean_squared_error is deprecated and removed in newer scikit-learn releases
rmse = np.sqrt(mean_squared_error(test_y.values, test_predictions))
print(f"Model accuracy: {std_err}, RMSE: {rmse}")

Model accuracy: 1.3814830235900262, RMSE: 5.095005229373506


In [5]:
# Score the submission rows, passing a plain array just like at fit time.
predictions = model.predict(pred_data.values)
# Apply the same optional integer rounding that the evaluation cell uses.
predictions = predictions.round() if round_predictions else predictions
# Store the predictions in the submission frame and write it out as CSV.
sample_submission["item_cnt_month"] = predictions
sample_submission.to_csv(path_or_buf="submission.csv")