In [46]:
import pandas as pd
import os
from typing import Optional

In [76]:
class DataLoader:
    """Load store-sales data and, optionally, a per-date events table.

    Input files may be ``.csv`` or ``.xlsx``/``.xls``. Every file is validated
    (required columns and their dtypes) before it is preprocessed. The public
    entry point is :meth:`load_data`.
    """

    def __init__(self):
        pass

    def _check_file_existence(self, file_path: str) -> None:
        """Raise ``FileNotFoundError`` unless ``file_path`` is an existing file."""
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"File {file_path} does not exist.")

    def _load_data(self, file_path: str) -> pd.DataFrame:
        """Read ``file_path`` into a DataFrame, picking the reader by extension.

        Args:
            file_path: Path to a ``.csv``, ``.xlsx`` or ``.xls`` file. The
                extension is matched case-insensitively.

        Returns:
            The file contents as returned by ``pd.read_csv`` / ``pd.read_excel``.

        Raises:
            FileNotFoundError: If ``file_path`` is not an existing file.
            ValueError: If the extension is not one of the supported ones.
        """
        self._check_file_existence(file_path)

        # Match on a lowercased copy so 'SALES.CSV' is accepted as well;
        # the original path is still what gets passed to pandas.
        lowered_path = file_path.lower()
        if lowered_path.endswith('.csv'):
            return pd.read_csv(file_path)
        if lowered_path.endswith(('.xlsx', '.xls')):
            return pd.read_excel(file_path)

        raise ValueError("Unsupported file format. Please provide a .csv or .xlsx/.xls file.")

    def _add_date_features(self, data: pd.DataFrame) -> pd.DataFrame:
        """Add ``weekday``, ``day``, ``month`` and ``year`` columns from ``date``.

        The ``date`` column is converted with ``pd.to_datetime`` first if it is
        not already a datetime dtype. ``data`` is modified in place and returned.
        """
        if not pd.api.types.is_datetime64_any_dtype(data['date']):
            data['date'] = pd.to_datetime(data['date'])

        date_accessor = data['date'].dt
        data['weekday'] = date_accessor.weekday
        data['day'] = date_accessor.day
        data['month'] = date_accessor.month
        data['year'] = date_accessor.year

        return data

    def _preprocess_store_sales(self, data: pd.DataFrame) -> pd.DataFrame:
        """Convert the ``date`` column to datetime; modifies and returns ``data``."""
        data['date'] = pd.to_datetime(data['date'])
        return data

    def _preprocess_dates(self, data: pd.DataFrame) -> pd.DataFrame:
        """Prepare the events table for merging.

        Missing ``event_name`` / ``event_type`` values become ``'not_event'``,
        ``date`` is converted to datetime, and the calendar feature columns
        from :meth:`_add_date_features` are appended. ``data`` is modified in
        place and returned.
        """
        data['event_name'] = data['event_name'].fillna('not_event')
        data['event_type'] = data['event_type'].fillna('not_event')
        # Parse the column of *this* frame. Reading a notebook-level `df` here
        # raises NameError on a fresh kernel, or silently index-aligns dates
        # from an unrelated frame when a stale `df` is still in the kernel.
        data['date'] = pd.to_datetime(data['date'])
        return self._add_date_features(data)

    def _check_columns(self, data: pd.DataFrame, required_columns: list,
                       expected_dtypes: dict) -> bool:
        """Validate that ``data`` has the required columns with allowed dtypes.

        Args:
            data: Frame to validate.
            required_columns: Column names that must all be present.
            expected_dtypes: Maps a column name to the list of dtype names
                (as produced by ``str(series.dtype)``) accepted for it.

        Returns:
            ``True`` when every check passes.

        Raises:
            ValueError: On the first missing column, or the first column whose
                dtype name is not in its allowed list.
        """
        # Presence is checked for every required column before any dtype is
        # inspected, so a missing column is always reported first.
        for col in required_columns:
            if col not in data.columns:
                raise ValueError(f"Column '{col}' not found in the dataset.")

        for col, expected_types in expected_dtypes.items():
            if col not in data.columns:
                continue
            actual_type = str(data[col].dtype)
            if actual_type not in expected_types:
                raise ValueError(f"Column '{col}' should have dtype {' or '.join(expected_types)}, "
                                 f"but found '{actual_type}'.")

        return True

    def _check_store_sales_columns(self, data: pd.DataFrame) -> bool:
        """Validate the store-sales schema; returns ``True`` or raises ``ValueError``."""
        return self._check_columns(
            data,
            required_columns=['store_id', 'item_id', 'date', 'cnt'],
            expected_dtypes={'store_id': ['int64', 'object'],
                             'item_id': ['int64', 'object'],
                             'date': ['object'],
                             'cnt': ['int64']},
        )

    def _check_dates_columns(self, data: pd.DataFrame) -> bool:
        """Validate the events-table schema; returns ``True`` or raises ``ValueError``."""
        return self._check_columns(
            data,
            required_columns=['date', 'event_name', 'event_type'],
            expected_dtypes={'date': ['object'],
                             'event_name': ['object'],
                             'event_type': ['object']},
        )

    def _load_store_sales(self, store_sales_path: str) -> pd.DataFrame:
        """Load, validate and preprocess the store-sales file."""
        data = self._load_data(store_sales_path)
        # The check either returns True or raises, so no branch is needed and
        # the method can never fall through to an implicit ``None``.
        self._check_store_sales_columns(data)
        return self._preprocess_store_sales(data)

    def _load_dates(self, dates_path: str) -> pd.DataFrame:
        """Load, validate and preprocess the events file."""
        data = self._load_data(dates_path)
        self._check_dates_columns(data)
        return self._preprocess_dates(data)

    def load_data(self, store_sales_path: str, dates_path: Optional[str] = None) -> pd.DataFrame:
        """Load store sales and optionally left-merge the events table on ``date``.

        Args:
            store_sales_path: Path to the store-sales file.
            dates_path: Optional path to the events file. When falsy, the
                store-sales frame is returned without event or calendar columns.

        Returns:
            The preprocessed store-sales frame, with the events table's columns
            joined on ``date`` when ``dates_path`` is given.

        Raises:
            FileNotFoundError: If a given path is not an existing file.
            ValueError: If a file has an unsupported extension or fails
                schema validation.
        """
        store_sales = self._load_store_sales(store_sales_path)

        if dates_path:
            store_sales_dates = self._load_dates(dates_path)
            store_sales = store_sales.merge(store_sales_dates, how='left', on='date')

        return store_sales

In [77]:
# Build a loader, read the sales file and join the events file onto it,
# then preview the first rows of the result.
dl = DataLoader()
df = dl.load_data(
    store_sales_path='../data/store_sales.csv',
    dates_path='../data/store_sales_dates.csv',
)
df.head()

Unnamed: 0,store_id,item_id,date,cnt,event_name,event_type,weekday,day,month,year
0,STORE_1,STORE_1_555,2011-01-29,51,not_event,not_event,5,29,1,2011
1,STORE_1,STORE_1_555,2011-01-30,45,not_event,not_event,6,30,1,2011
2,STORE_1,STORE_1_555,2011-01-31,25,not_event,not_event,0,31,1,2011
3,STORE_1,STORE_1_555,2011-02-01,39,not_event,not_event,1,1,2,2011
4,STORE_1,STORE_1_555,2011-02-02,23,not_event,not_event,2,2,2,2011


In [79]:
# Summarise the loaded frame: per-column dtype and non-null count, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5457 entries, 0 to 5456
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   store_id    5457 non-null   object        
 1   item_id     5457 non-null   object        
 2   date        5457 non-null   datetime64[ns]
 3   cnt         5457 non-null   int64         
 4   event_name  5457 non-null   object        
 5   event_type  5457 non-null   object        
 6   weekday     5457 non-null   int32         
 7   day         5457 non-null   int32         
 8   month       5457 non-null   int32         
 9   year        5457 non-null   int32         
dtypes: datetime64[ns](1), int32(4), int64(1), object(4)
memory usage: 341.2+ KB
