In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# Single place to change when the competition data lives somewhere else
# (e.g. when running the notebook outside Kaggle).
DATA_DIR = "/kaggle/input/store-sales-time-series-forecasting"

# Load every raw competition table; each CSV becomes one DataFrame.
holi_events = pd.read_csv(os.path.join(DATA_DIR, "holidays_events.csv"))
oil = pd.read_csv(os.path.join(DATA_DIR, "oil.csv"))
stores = pd.read_csv(os.path.join(DATA_DIR, "stores.csv"))
trans = pd.read_csv(os.path.join(DATA_DIR, "transactions.csv"))
train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

/kaggle/input/store-sales-time-series-forecasting/oil.csv
/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv
/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv
/kaggle/input/store-sales-time-series-forecasting/stores.csv
/kaggle/input/store-sales-time-series-forecasting/train.csv
/kaggle/input/store-sales-time-series-forecasting/test.csv
/kaggle/input/store-sales-time-series-forecasting/transactions.csv


In [3]:
# Summarise every raw table (columns, dtypes, non-null counts), one after another.
dataset = [holi_events, oil, stores, trans, train, test]
for ds in dataset:
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() only adds a stray "None" line after each summary.
    ds.info()
    print("=="*50)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   date         350 non-null    object
 1   type         350 non-null    object
 2   locale       350 non-null    object
 3   locale_name  350 non-null    object
 4   description  350 non-null    object
 5   transferred  350 non-null    bool  
dtypes: bool(1), object(5)
memory usage: 14.1+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        1218 non-null   object 
 1   dcoilwtico  1175 non-null   float64
dtypes: float64(1), object(1)
memory usage: 19.2+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   

# 📊 RMSLE (Root Mean Squared Logarithmic Error) Function
- **Purpose**: Measures the error between actual and predicted values on a log scale by comparing `log(1 + y_true)` with `log(1 + y_pred)`.
- **Non-Negativity**: Ensures `y_pred` is non-negative using `np.maximum()`.
- **Output**: Returns the square root of the mean squared logarithmic error.

---

# 📅 Convert Datetime Function
- **Functionality**: Converts the 'date' column to a datetime object using `pd.to_datetime()`.
- **Verification**: Prints the 'date' column sorted in ascending order so the earliest and latest dates can be checked.

---

# 🗑️ Drop Duplicate Values Function
- **Identification**: Identifies duplicate rows in the dataset.
- **Reporting**: Prints the number of duplicates found.
- **Output**: Removes the duplicate rows in place and returns the same, now cleaned, DataFrame.

---

# 🔍 EDA (Exploratory Data Analysis) Function
- **Feature Extraction**: Extracts time-based features (year, month, day, quarter, week, etc.).
- **Cyclic Encoding**: Encodes cyclic time features using sine and cosine transformations.
- **Categorical Conversion**: Converts categorical time features (month_name, day_of_week) into dummy variables.
- **Redundancy Removal**: Drops the original 'date' column to avoid redundancy.
- **Output**: Returns an enhanced feature-rich dataset.

In [10]:
%%time
# Independent copy of the holidays/events table: changes made to `yay`
# do not affect `holi_events`.
# NOTE(review): `yay` is not used in the cells visible here — confirm it is still needed.
yay = holi_events.copy()

# Evaluation metric: Root Mean Squared Logarithmic Error
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred`` against ``y_true``.

    Parameters
    ----------
    y_true : array-like
        Observed values. Used as given (no clipping is applied to them).
    y_pred : array-like
        Predicted values. Negative entries are replaced by 0 before the
        logarithm is taken.

    Returns
    -------
    numpy floating scalar
        ``sqrt(mean((log1p(y_true) - log1p(y_pred)) ** 2))``.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    # Only the predictions are floored at zero; a negative forecast is treated as 0.
    predicted = np.maximum(predicted, 0)
    log_gap = np.log1p(actual) - np.log1p(predicted)
    mean_squared_log_error = np.mean(log_gap ** 2)
    return np.sqrt(mean_squared_log_error)

# Parse the 'date' column (stored as text) into real datetimes
def convert_datetime(df):
    """Convert ``df['date']`` to datetime in place and print it in ascending order.

    The frame passed in is modified (its 'date' column is replaced); nothing
    is returned. The whole sorted column is printed, so the first and last
    rows of the output show the earliest and latest dates.
    """
    date_col = 'date'
    df[date_col] = pd.to_datetime(df[date_col])
    print("The range of df date: ")
    ordered_dates = df[date_col].sort_values()
    print(ordered_dates)

# Report and remove fully duplicated rows
def drop_duplicates(df):
    """Print how many rows are exact duplicates, drop them in place, and return ``df``.

    The frame passed in is itself modified; the returned object is that same
    frame, so both ``drop_duplicates(df)`` and ``df = drop_duplicates(df)`` work.
    """
    duplicate_mask = df.duplicated()
    num_duplicates = duplicate_mask.sum()
    print(f'The dataset has {num_duplicates} duplicated value(s)')
    # The first occurrence of each row is kept (pandas default).
    df.drop_duplicates(inplace=True)
    return df

# Derive calendar features from the 'date' column
def eda(df):
    """Return a new frame with calendar features derived from ``df['date']``.

    Added columns: ``year``, ``day``, ``month``, ``quarter``, ``week`` (ISO
    week number), sine/cosine encodings of month, quarter, day and year, and
    one-hot columns for the month name and the weekday name. The ``date``
    column itself is not part of the result.

    The input frame is left untouched, so the function can be called again on
    the same frame (e.g. when a notebook cell is re-run).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-typed ``date`` column (the ``.dt`` accessor is
        used on it); convert it first if it is still stored as text.

    Returns
    -------
    pandas.DataFrame
        A new frame: the remaining input columns followed by the features above.
    """
    # Work on a copy: adding columns to / dropping 'date' from the caller's
    # frame would make a second call fail with a KeyError on 'date'.
    out = df.copy()
    dates = out['date']

    # Plain calendar components
    out['year'] = dates.dt.year
    out['day'] = dates.dt.day
    out['month'] = dates.dt.month
    out['quarter'] = dates.dt.quarter
    out['day_of_week'] = dates.dt.day_name()
    out['month_name'] = dates.dt.month_name()
    out['week'] = dates.dt.isocalendar().week.astype(int)

    # Sine/cosine encoding so that the end of a cycle is close to its start
    # (December next to January, day 31 next to day 1, ...).
    two_pi = 2 * np.pi
    out['month_sin'] = np.sin(two_pi * out['month'] / 12)
    out['month_cos'] = np.cos(two_pi * out['month'] / 12)
    out['quarter_sin'] = np.sin(two_pi * out['quarter'] / 4)
    out['quarter_cos'] = np.cos(two_pi * out['quarter'] / 4)
    # Every month is treated as 31 days long.
    out['day_sin'] = np.sin(two_pi * out['day'] / 31)
    out['day_cos'] = np.cos(two_pi * out['day'] / 31)
    # The year is mapped onto a 100-year cycle.
    out['cos_year'] = np.cos(out['year'] * two_pi / 100)
    out['sin_year'] = np.sin(out['year'] * two_pi / 100)

    # The raw timestamp is fully described by the features above.
    out = out.drop(columns='date')

    # One-hot encode the name columns.
    # NOTE(review): get_dummies only creates columns for the months/weekdays
    # present in `df`; frames covering different date ranges will get
    # different columns and may need aligning afterwards.
    return pd.get_dummies(out, columns=['month_name', 'day_of_week'])

# Print the value distribution of every column
def display_values(df):
    """Print ``value_counts()`` for each column of ``df``, each followed by a separator line."""
    separator = "==" * 50
    for column_name in df.columns:
        counts = df[column_name].value_counts()
        print(counts)
        print(separator)

def encoded_cols(df, cols):
    """Return a copy of ``df`` in which the columns listed in ``cols`` are one-hot encoded.

    Thin wrapper around ``pd.get_dummies``; the input frame is not modified.
    """
    return pd.get_dummies(df, columns=cols)

CPU times: user 319 µs, sys: 0 ns, total: 319 µs
Wall time: 274 µs


In [18]:
# How often each holiday/event description occurs, most frequent first.
holi_events['description'].value_counts()

description
Carnaval                              10
Fundacion de Cuenca                    7
Fundacion de Ibarra                    7
Fundacion de Quito                     6
Provincializacion de Santo Domingo     6
                                      ..
Terremoto Manabi+8                     1
Recupero puente Navidad                1
Terremoto Manabi+10                    1
Terremoto Manabi+11                    1
Traslado Fundacion de Quito            1
Name: count, Length: 103, dtype: int64

In [15]:
# Number of training rows per product family.
train['family'].value_counts()

family
AUTOMOTIVE                    90936
HOME APPLIANCES               90936
SCHOOL AND OFFICE SUPPLIES    90936
PRODUCE                       90936
PREPARED FOODS                90936
POULTRY                       90936
PLAYERS AND ELECTRONICS       90936
PET SUPPLIES                  90936
PERSONAL CARE                 90936
MEATS                         90936
MAGAZINES                     90936
LIQUOR,WINE,BEER              90936
LINGERIE                      90936
LAWN AND GARDEN               90936
LADIESWEAR                    90936
HOME CARE                     90936
HOME AND KITCHEN II           90936
BABY CARE                     90936
HOME AND KITCHEN I            90936
HARDWARE                      90936
GROCERY II                    90936
GROCERY I                     90936
FROZEN FOODS                  90936
EGGS                          90936
DELI                          90936
DAIRY                         90936
CLEANING                      90936
CELEBRATION          