# **Kaggle Playground Series S5E1 EDA**

In [1]:
import pandas as pd

# Target column and the feature groups referenced by the rest of the notebook.
label = 'num_sold'
categorical_features = ['country', 'store', 'product']
numerical_features = []  # no numerical feature columns are declared

# Load both CSV files, using their `id` column as the row index.
train, test = (
    pd.read_csv(csv_name, index_col='id')
    for csv_name in ('train.csv', 'test.csv')
)

In [2]:
# Preview the training frame; the rendered output is truncated to its first and last rows.
train

id,date,country,store,product,num_sold
0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0
...,...,...,...,...,...
230125,2016-12-31,Singapore,Premium Sticker Mart,Holographic Goose,466.0
230126,2016-12-31,Singapore,Premium Sticker Mart,Kaggle,2907.0
230127,2016-12-31,Singapore,Premium Sticker Mart,Kaggle Tiers,2299.0
230128,2016-12-31,Singapore,Premium Sticker Mart,Kerneler,1242.0


In [3]:
import plotly.express as px

def plot_categorical_counts(df, categorical_features):
    """
    Draws one bar chart per categorical feature, where each bar is the
    number of rows holding a given category (missing values are counted
    as their own category).

    Parameters:
    -----------
    df : pd.DataFrame
        The dataframe containing the data.
    categorical_features : list
        Column names in df to chart, one figure per name.
    """
    for feature in categorical_features:
        # Two-column frame (category, count), ordered from most to least frequent.
        frequencies = (
            df[feature]
            .value_counts(dropna=False)
            .reset_index()
            .set_axis([feature, "count"], axis="columns")
            .sort_values("count", ascending=False)
        )

        chart = px.bar(
            frequencies,
            x=feature,
            y="count",
            color=feature,
            text="count",
            title=f"Distribution of '{feature}'",
            template='simple_white',
            color_discrete_sequence=px.colors.qualitative.Pastel,
        )

        # Print each count above its bar.
        chart.update_traces(textposition="outside")
        chart.update_layout(
            font={"family": "Helvetica", "size": 12, "color": "black"},
            xaxis_title=f"{feature.capitalize()} Categories",
            yaxis_title="Count",
            # The x axis already names every category, so the legend is hidden.
            showlegend=False,
            margin={"l": 30, "r": 30, "t": 60, "b": 30},
        )
        chart.show()

# Draw one count chart for each column listed in `categorical_features`.
plot_categorical_counts(train, categorical_features)


In [4]:
# For a single country, plot the sales history of every product, one line per store.
c = 'Canada'
in_country = train['country'] == c  # row mask that does not change between products
for p in train['product'].unique():
    # Rows for this product in the chosen country; rows with any missing value are dropped.
    df = train.loc[in_country & (train['product'] == p)].dropna()
    fig = px.line(
        df,
        x='date',
        y='num_sold',
        color='store',
        title=f"Time series of '{p}' sales in '{c}'.",
        template='simple_white',
        color_discrete_sequence=px.colors.qualitative.Pastel,
    )
    fig.update_layout(
        font={"family": "Helvetica", "size": 12, "color": "black"},
        xaxis_title="Time",
        yaxis_title="Sales",
        showlegend=True,
        margin={"l": 30, "r": 30, "t": 60, "b": 30},
    )
    fig.show()

In [14]:
def plot_ratios(df, column):
    """
    Plots, as one line per value of `column`, that value's share of the
    total `num_sold` on each date.

    Parameters:
    -----------
    df : pd.DataFrame
        Frame with 'date' and 'num_sold' columns plus the column named
        by `column`.
    column : str
        Name of the column whose values split each date's total.
    """
    # Sales summed per (date, group value) and per date alone.
    per_group = (
        df.groupby(['date', column])['num_sold']
        .sum()
        .reset_index(name='num_sold_agg')
    )
    per_date = (
        df.groupby(['date'])['num_sold']
        .sum()
        .reset_index(name='num_sold_total')
    )

    # Attach each date's total to every group row, then take the share.
    shares = per_group.merge(per_date, how='left', on='date')
    shares['ratio'] = shares['num_sold_agg'] / shares['num_sold_total']

    chart = px.line(
        shares,
        x='date',
        y='ratio',
        color=column,
        title=f"Time series of sales ratio aggregated by '{column}'.",
        template='simple_white',
        color_discrete_sequence=px.colors.qualitative.Pastel,
    )
    chart.update_layout(
        font={"family": "Helvetica", "size": 12, "color": "black"},
        xaxis_title="Time",
        yaxis_title="Ratio",
        showlegend=True,
        margin={"l": 30, "r": 30, "t": 60, "b": 30},
    )
    chart.show()

# Plot the per-date share of total sales, split by each grouping column in turn.
plot_ratios(train, 'store')
plot_ratios(train, 'country')
plot_ratios(train, 'product')

In [6]:
# Total units sold per date, summed over every row of `train`
# (pandas' sum skips missing `num_sold` values by default).
total_sales = train.groupby(['date'])['num_sold'].sum().reset_index()

fig = px.line(
    total_sales,
    x='date',
    y='num_sold',
    title="Time series of total sales.",
    template='simple_white',
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig.update_layout(
    font=dict(
        family="Helvetica",
        size=12,
        color="black"
    ),
    xaxis_title="Time",
    # The plotted quantity is the summed `num_sold`, not a ratio, so the
    # axis is labelled "Sales" like the per-product sales charts.
    yaxis_title="Sales",
    showlegend=True,
    margin=dict(l=30, r=30, t=60, b=30)
)
fig.show()