In [1]:
import pandas as pd

# Read both competition splits from the current working directory
# (training split first, then the test split).
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Preview the first rows of each split under its own heading.
for heading, frame in (("Training Data:", train_data), ("\nTest Data:", test_data)):
    print(heading)
    print(frame.head())

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [1]:
import os
# Show the directory that relative paths such as 'train.csv' are resolved against.
working_dir = os.getcwd()
print("Current Working Directory:", working_dir)

Current Working Directory: C:\Users\Zaharan


In [2]:
import os
from pathlib import Path

import pandas as pd

# Directory that holds train.csv and test.csv. The default is the location
# this notebook was originally run against; set the DATA_DIR environment
# variable to point the notebook at another machine's copy of the data
# without editing the code.
DATA_DIR = Path(os.environ.get('DATA_DIR', 'C:/Datasets'))

# Load the two competition splits; read_csv accepts path-like objects.
train_data = pd.read_csv(DATA_DIR / 'train.csv')
test_data = pd.read_csv(DATA_DIR / 'test.csv')

# Preview the first rows of each split.
print("Training Data:")
print(train_data.head())

print("\nTest Data:")
print(test_data.head())

Training Data:
   id        date country              store             product  num_sold
0   0  2010-01-01  Canada  Discount Stickers   Holographic Goose       NaN
1   1  2010-01-01  Canada  Discount Stickers              Kaggle     973.0
2   2  2010-01-01  Canada  Discount Stickers        Kaggle Tiers     906.0
3   3  2010-01-01  Canada  Discount Stickers            Kerneler     423.0
4   4  2010-01-01  Canada  Discount Stickers  Kerneler Dark Mode     491.0

Test Data:
       id        date country              store             product
0  230130  2017-01-01  Canada  Discount Stickers   Holographic Goose
1  230131  2017-01-01  Canada  Discount Stickers              Kaggle
2  230132  2017-01-01  Canada  Discount Stickers        Kaggle Tiers
3  230133  2017-01-01  Canada  Discount Stickers            Kerneler
4  230134  2017-01-01  Canada  Discount Stickers  Kerneler Dark Mode


In [5]:

# Data-quality overview of the training split: per-column null counts,
# column dtypes, and the smallest/largest value of the 'date' column.
null_counts = train_data.isnull().sum()
print("Missing Values in Training Data:")
print(null_counts)

column_types = train_data.dtypes
print("\nData Types in Training Data:")
print(column_types)

# NOTE(review): 'date' has not been parsed yet at this point, so min/max
# compare the raw column values as loaded from the CSV.
date_column = train_data['date']
first_date, last_date = date_column.min(), date_column.max()
print("\nDate Range in Training Data:")
print(f"Earliest Date: {first_date}")
print(f"Latest Date: {last_date}")

Missing Values in Training Data:
id             0
date           0
country        0
store          0
product        0
num_sold    8871
dtype: int64

Data Types in Training Data:
id            int64
date         object
country      object
store        object
product      object
num_sold    float64
dtype: object

Date Range in Training Data:
Earliest Date: 2010-01-01
Latest Date: 2016-12-31


In [6]:

# Rows without a target value cannot be used for supervised training,
# so discard every row whose 'num_sold' is missing.
required_columns = ['num_sold']
train_data = train_data.dropna(subset=required_columns)

# Confirm that no nulls remain in any column.
remaining_nulls = train_data.isnull().sum()
print("Missing Values After Dropping Rows:")
print(remaining_nulls)

Missing Values After Dropping Rows:
id          0
date        0
country     0
store       0
product     0
num_sold    0
dtype: int64


In [14]:

def _add_date_features(frame):
    """Return a new frame with calendar features replacing the 'date' column.

    Adds 'day_of_week' (pandas ``dt.dayofweek``) and 'month' (``dt.month``)
    derived from 'date', then drops 'date'. The input frame is not modified.

    Re-running is safe: if 'date' is already gone and both feature columns
    are present, the frame is returned unchanged instead of raising.

    Raises:
        KeyError: if 'date' is missing and the feature columns are absent,
            i.e. the frame was never in the expected raw shape.
    """
    if 'date' not in frame.columns:
        if {'day_of_week', 'month'} <= set(frame.columns):
            # Already processed by an earlier run of this cell.
            return frame
        raise KeyError("'date' column is missing and date features are absent")

    dates = pd.to_datetime(frame['date'])
    # assign() builds a new frame, so a frame produced by an earlier
    # filtering step is never written to in place.
    return frame.assign(
        day_of_week=dates.dt.dayofweek,
        month=dates.dt.month,
    ).drop(columns='date')


# Apply the same transformation to both splits so their columns stay aligned.
train_data = _add_date_features(train_data)
test_data = _add_date_features(test_data)

print("\nTraining Data After Preprocessing:")
print(train_data.head())


Training Data After Preprocessing:
   id country              store             product  num_sold  day_of_week  \
1   1  Canada  Discount Stickers              Kaggle     973.0            4   
2   2  Canada  Discount Stickers        Kaggle Tiers     906.0            4   
3   3  Canada  Discount Stickers            Kerneler     423.0            4   
4   4  Canada  Discount Stickers  Kerneler Dark Mode     491.0            4   
5   5  Canada  Stickers for Less   Holographic Goose     300.0            4   

   month  
1      1  
2      1  
3      1  
4      1  
5      1  


In [15]:

# Separate the target from the predictors; the row identifier 'id' is
# excluded from the feature matrix as well.
target_column = 'num_sold'
y = train_data[target_column]
X = train_data.drop(columns=['id', target_column])

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

# Sanity check: features and targets of the training part must pair up row for row.
n_feature_rows = len(X_train)
n_target_rows = len(y_train)
print(f"Number of rows in X_train: {n_feature_rows}")
print(f"Number of rows in y_train: {n_target_rows}")
assert n_feature_rows == n_target_rows, "X_train and y_train are not aligned!"

Number of rows in X_train: 177007
Number of rows in y_train: 177007


In [20]:

def _encode_like_train(frame, train_columns):
    """One-hot encode ``frame`` and align it to the training columns.

    The frame is encoded WITHOUT ``drop_first``. Which level ``drop_first``
    removes depends on the categories present in the frame being encoded, so
    a split that lacks the training baseline level would drop a different
    level and, after reindexing, silently encode it as the baseline.
    Encoding every level and then reindexing to ``train_columns`` keeps the
    baseline decided by the training data alone.

    Columns absent from ``frame`` are filled with 0; columns not present in
    ``train_columns`` (the baseline level, unseen categories, extra columns)
    are discarded by the reindex.
    """
    return pd.get_dummies(frame).reindex(columns=train_columns, fill_value=0)


# The training split defines the design matrix (first level of each
# categorical column is dropped as the baseline).
X_train_encoded = pd.get_dummies(X_train, drop_first=True)

# Validation and test are encoded against the training layout.
X_val_encoded = _encode_like_train(X_val, X_train_encoded.columns)
X_test_encoded = _encode_like_train(
    test_data.drop('id', axis=1), X_train_encoded.columns
)


print("\nPreprocessed Training Data:")
print(X_train_encoded.head())


Preprocessed Training Data:
        day_of_week  month  country_Finland  country_Italy  country_Kenya  \
109844            6      5            False           True          False   
36903             1      2            False          False          False   
8583              1      4            False           True          False   
55231             1      9            False          False          False   
120919            3      9            False          False           True   

        country_Norway  country_Singapore  store_Premium Sticker Mart  \
109844           False              False                        True   
36903            False              False                       False   
8583             False              False                       False   
55231             True              False                       False   
120919           False              False                       False   

        store_Stickers for Less  product_Kaggle  product_Kaggle Tiers

In [10]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with library-default hyperparameters; only the seed is
# pinned so that repeated fits give the same model.
forest_params = {"random_state": 42}
model = RandomForestRegressor(**forest_params)
model.fit(X_train_encoded, y_train)

In [11]:
from sklearn.metrics import mean_absolute_error

# Score the fitted model on the held-out validation rows.
y_val_pred = model.predict(X_val_encoded)
mae = mean_absolute_error(y_true=y_val, y_pred=y_val_pred)

print(f"Mean Absolute Error on Validation Set: {mae:.2f}")

Mean Absolute Error on Validation Set: 75.65


In [12]:

# Predict sales for the test split and store them next to the test rows.
test_predictions = model.predict(X_test_encoded)
test_data['num_sold'] = test_predictions

# The submission file carries only the row identifier and the prediction.
submission_columns = ['id', 'num_sold']
submission = test_data[submission_columns]
submission.to_csv('submission.csv', index=False)

print("\nSubmission File Preview:")
print(submission.head())


Submission File Preview:
       id    num_sold
0  230130  140.635691
1  230131  871.733822
2  230132  715.245094
3  230133  367.168087
4  230134  472.724735


In [14]:
import pickle

# Persist the fitted estimator to disk so it can be reloaded later.
with open('model.pkl', 'wb') as model_file:
    pickle.Pickler(model_file).dump(model)

In [14]:

import pickle

# Persist the training column index alongside the model, so new data can be
# aligned to the same feature layout before prediction.
feature_columns = X_train_encoded.columns
with open('columns.pkl', 'wb') as columns_file:
    pickle.dump(feature_columns, columns_file)