In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

In [None]:
# Read the training split and preview its first rows.
train_csv_path = '/kaggle/input/playground-series-s3e19/train.csv'
df_train = pd.read_csv(train_csv_path)
df_train.head()

In [None]:
# Number of missing entries in each column of the training frame.
df_train.isnull().sum()

**There are no null values in the dataset**

In [None]:
# Flag rows that exactly repeat an earlier row, then count them.
duplicate_mask = df_train.duplicated()
duplicate_mask.sum()

**There are no duplicate rows in the dataset**

# Exploratory Data Analysis

In [None]:
# Share of training rows per store; the counts are computed once and reused
# for both the wedge sizes and the wedge labels.
store_counts = df_train['store'].value_counts()
plt.pie(store_counts.values, labels=store_counts.index, autopct='%1.1f%%')
plt.title('Stores in the Dataset')
plt.show()

In [None]:
# Total units sold per country, one row per country.
country_sells = df_train.groupby('country', as_index=False)['num_sold'].sum()

import plotly.graph_objects as go

# Build the bar trace first, then wrap it in a figure and label the axes.
country_bar = go.Bar(
    x=country_sells['country'],
    y=country_sells['num_sold'],
    marker_color='lightsalmon',
)
fig = go.Figure(data=country_bar)
fig.update_layout(
    title='Total Number of Sells in Each Country',
    xaxis_title='Country',
    yaxis_title='# of sells',
)

fig.show()

In [None]:
# Total units sold per store, one row per store.
store_sells = df_train.groupby('store', as_index=False)['num_sold'].sum()

# Build the bar trace first, then wrap it in a figure and label the axes.
store_bar = go.Bar(
    x=store_sells['store'],
    y=store_sells['num_sold'],
    marker_color='purple',
)
fig = go.Figure(data=store_bar)
fig.update_layout(
    title='Total Number of Sells in Each Store',
    xaxis_title='Stores',
    yaxis_title='# of sells',
)

fig.show()

In [None]:
# Total units sold per product, one row per product.
product_sells = df_train.groupby('product', as_index=False)['num_sold'].sum()

# Each wedge is one product, sized by its total units sold.
product_pie = go.Pie(
    labels=product_sells['product'],
    values=product_sells['num_sold'],
)
fig = go.Figure(data=product_pie)
fig.update_layout(title='Total Sells for Each Product')
fig.show()

# Model

In [None]:
# Re-display the first five training rows before building the model inputs.
df_train.head(n=5)

In [None]:
# Drop the row id, the date and the target; what remains is the model input.
X_train = df_train.drop(columns=['id', 'date', 'num_sold'])

# Regression target: units sold.
y_train = df_train['num_sold']

# LabelEncoder maps each distinct category value to an integer code.
from sklearn.preprocessing import LabelEncoder

features = ['country', 'store', 'product']

# Keep one fitted encoder per column. Refitting a single shared encoder on
# every column would discard all but the last mapping, so the training codes
# could not be re-applied to new data (label_encoders[col].transform) or
# mapped back to the original labels (label_encoders[col].inverse_transform).
label_encoders = {}

for col in features:
    # `le` ends the loop bound to the encoder of the last column in
    # `features`, which is the state the shared-encoder version left behind.
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    label_encoders[col] = le

In [None]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Base XGBoost regressor; the grid below supplies the hyperparameters to try.
xgb_reg = xgb.XGBRegressor()

# 3 x 3 x 3 = 27 candidate combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
)

# Exhaustive search with 5-fold cross-validation; n_jobs=-1 requests all
# available processors and verbose=2 logs progress per fit.
grid_search = GridSearchCV(
    estimator=xgb_reg,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the encoded training features.
grid_search.fit(X_train, y_train)

# Keep the winning hyperparameters and the corresponding fitted estimator.
best_params, best_model = grid_search.best_params_, grid_search.best_estimator_

print("Best Parameters:", best_params)

In [None]:
# Read the test split and preview its first rows.
test_csv_path = '/kaggle/input/playground-series-s3e19/test.csv'
df_test = pd.read_csv(test_csv_path)
df_test.head()

In [None]:
# Encode the test set with the integer codes learned from the TRAINING data.
from sklearn.preprocessing import LabelEncoder

# Drop the columns that are not model inputs (row id and date).
X_test = df_test.drop(columns=['id', 'date'])
features = ['country', 'store', 'product']

for col in features:
    # Fit on the training column and only *transform* the test column.
    # Refitting on the test column would derive the codes from whatever
    # categories appear in the test set, which only matches the codes the
    # model was trained on when both sets contain exactly the same categories.
    # With this form, a category absent from training raises ValueError
    # instead of being silently mapped to the wrong code.
    train_encoder = LabelEncoder().fit(df_train[col])
    X_test[col] = train_encoder.transform(X_test[col])

In [None]:
# Predict num_sold for the encoded test features with the estimator selected
# by the grid search, then display the raw prediction array.
y_pred = best_model.predict(X_test)
y_pred

In [None]:
# Pair each test id with its prediction and write the submission file
# (index=False keeps the row index out of the CSV).
submission_columns = {'id': df_test['id'], 'num_sold': y_pred}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")