In [52]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load data
train = pd.read_csv('C:/store-sales-time-series-forecasting/train.csv')
test = pd.read_csv('C:/store-sales-time-series-forecasting/test.csv')
stores = pd.read_csv('C:/store-sales-time-series-forecasting/stores.csv')
oil = pd.read_csv('C:/store-sales-time-series-forecasting/oil.csv')
holidays = pd.read_csv('C:/store-sales-time-series-forecasting/holidays_events.csv')
transactions = pd.read_csv('C:/store-sales-time-series-forecasting/transactions.csv')

# Convert date columns to datetime
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])
oil['date'] = pd.to_datetime(oil['date'])
holidays['date'] = pd.to_datetime(holidays['date'])
transactions['date'] = pd.to_datetime(transactions['date'])

# Fill missing oil prices with forward fill
oil['dcoilwtico'] = oil['dcoilwtico'].fillna(method='ffill')

# Merge datasets
train = train.merge(stores, on='store_nbr', how='left')
train = train.merge(oil, on='date', how='left')
train = train.merge(holidays, on='date', how='left')
train = train.merge(transactions, on=['date', 'store_nbr'], how='left')

test = test.merge(stores, on='store_nbr', how='left')
test = test.merge(oil, on='date', how='left')
test = test.merge(holidays, on='date', how='left')
test = test.merge(transactions, on=['date', 'store_nbr'], how='left')

# Handle missing values in test
test['transactions'] = test['transactions'].fillna(0)

  oil['dcoilwtico'] = oil['dcoilwtico'].fillna(method='ffill')


In [56]:
# Create additional features
train['year'] = train['date'].dt.year
train['month'] = train['date'].dt.month
train['day'] = train['date'].dt.day
train['day_of_week'] = train['date'].dt.dayofweek

test['year'] = test['date'].dt.year
test['month'] = test['date'].dt.month
test['day'] = test['date'].dt.day
test['day_of_week'] = test['date'].dt.dayofweek

# Initialize LabelEncoder
le = LabelEncoder()

# List of columns to be encoded
columns_to_encode = ['family', 'city', 'state', 'type_x','type_y', 'locale', 'locale_name', 'description', 'transferred']

# Encode categorical features in train dataset
for column in columns_to_encode:
    if column in train.columns:
        train[column] = le.fit_transform(train[column])




In [62]:
import numpy as np
# Ensure the label encoder can handle new categories in the test data
for column in columns_to_encode:
    if column in test.columns:
        # Find new categories in the test set that are not in the training set
        new_categories = set(test[column].unique()) - set(le.classes_)
        
        # Add new categories to the classes_ attribute
        le.classes_ = np.append(le.classes_, list(new_categories))
        
        # Transform the test data
        test[column] = le.transform(test[column])

In [63]:
# Select features and target
features = ['store_nbr', 'family', 'onpromotion', 'transactions', 'dcoilwtico', 'year', 'month', 'day', 'day_of_week']
target = 'sales'

In [64]:

# Prepare data
X = train[features]
y = train[target]
X_test = test[features]

# Split train data for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Validate the model
y_pred = model.predict(X_val)
print("Validation RMSE:", mean_squared_error(y_val, y_pred, squared=False))

# Predict on test data
test['sales'] = model.predict(X_test)

# Prepare submission
sample_submission = pd.read_csv('C:/store-sales-time-series-forecasting/sample_submission.csv')
submission = sample_submission.copy()
submission['sales'] = test['sales']

# Save to CSV
submission.to_csv('submission.csv', index=False)
print("Submission file saved.")



Validation RMSE: 198.28852926275667
Submission file saved.
