# Анализ набора данных Product Demand Forecast

**Цель**: определить, какие атрибуты (факторы) влияют на объём продаж. Это поможет понять, как праздники, цены на нефть, типы магазинов и другие переменные влияют на продажи, и построить наиболее точную модель прогноза продаж.

## Загрузка данных
Загрузим обработанные данные и исправим типы данных для их правильного анализа.

In [12]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [13]:
# Read each preprocessed table and immediately drop its 'Unnamed: 0' column,
# which is not needed and could cause conflicts later on.
holidays_events = pd.read_csv('precessing_data/holidays_events.csv').drop(columns=['Unnamed: 0'])
oil = pd.read_csv('precessing_data/oil.csv').drop(columns=['Unnamed: 0'])
stores = pd.read_csv('precessing_data/stores.csv').drop(columns=['Unnamed: 0'])
train = pd.read_csv('precessing_data/train.csv').drop(columns=['Unnamed: 0'])
transactions = pd.read_csv('precessing_data/transactions.csv').drop(columns=['Unnamed: 0'])

# The two tables below are currently not loaded:
# sample_submission = pd.read_csv('precessing_data/sample_submission.csv')
# test = pd.read_csv('precessing_data/test.csv')

# Registry of the loaded frames keyed by table name, so that later cells can
# process all of them in a single loop.
dfs = dict(
    holidays_events=holidays_events,
    oil=oil,
    # sample_submission=sample_submission,
    stores=stores,
    # test=test,
    train=train,
    transactions=transactions,
)

def check_and_fix_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """Infer better dtypes for *df* and parse its ``date`` column if present.

    Object columns are first passed through ``DataFrame.infer_objects``.
    If the resulting frame has a ``date`` column, it is converted with
    ``pd.to_datetime``. Frames without a ``date`` column, or whose ``date``
    values cannot be parsed, are returned with that column left as it was.

    Args:
        df: The frame to normalise.

    Returns:
        The frame produced by ``infer_objects``, with ``date`` converted
        to datetime when the conversion succeeds.
    """
    df = df.infer_objects()

    # Not every table has a date column; nothing more to do for those.
    if 'date' not in df.columns:
        return df

    try:
        df['date'] = pd.to_datetime(df['date'])
    except (ValueError, TypeError):
        # Best effort: unparseable values leave the column unchanged, but
        # unrelated errors (e.g. KeyboardInterrupt) are no longer swallowed.
        pass

    return df

# Replace every registered frame with its dtype-corrected version.
dfs.update({
    table_name: check_and_fix_dtypes(frame)
    for table_name, frame in dfs.items()
})

# Re-point the module-level names at the corrected frames.
holidays_events, oil, stores, train, transactions = (
    dfs[table_name]
    for table_name in ('holidays_events', 'oil', 'stores', 'train', 'transactions')
)

In [14]:
# Show each table's name, its contents and its column dtypes, one per line.
for table_name, frame in dfs.items():
    print(table_name, frame, frame.dtypes, sep='\n')

holidays_events
          date        type    locale locale_name  transferred
0   2012-03-02     Holiday     Local       Manta        False
1   2012-04-01     Holiday  Regional    Cotopaxi        False
2   2012-04-12     Holiday     Local      Cuenca        False
3   2012-04-14     Holiday     Local    Libertad        False
4   2012-04-21     Holiday     Local    Riobamba        False
..         ...         ...       ...         ...          ...
345 2017-12-22  Additional  National     Ecuador        False
346 2017-12-23  Additional  National     Ecuador        False
347 2017-12-24  Additional  National     Ecuador        False
348 2017-12-25     Holiday  National     Ecuador        False
349 2017-12-26  Additional  National     Ecuador        False

[350 rows x 5 columns]
date           datetime64[ns]
type                   object
locale                 object
locale_name            object
transferred              bool
dtype: object
oil
           date  dcoilwtico
0    2013-01-02     

In [24]:
# Join train with transactions on the (store_nbr, date) pair
# (default join type, i.e. only matching rows are kept).
train_transactions = train.merge(transactions, on=['store_nbr', 'date'])

# Attach holiday information by date, keeping every train/transactions row.
train_transactions_holidays = train_transactions.merge(
    holidays_events, how='left', on='date'
)

# Attach the oil table by date and the store table by store_nbr, then give
# the joined columns clearer names.
full_data = (
    train_transactions_holidays
    .merge(oil, how='left', on='date')
    .merge(stores, how='left', on='store_nbr')
    .rename(columns={
        'type_x': 'holiday_type',  # 'type' coming from holidays_events
        'locale': 'holiday_locale',
        'locale_name': 'holiday_locale_name',
        'transferred': 'holiday_transferred',
        'type_y': 'store_type',    # 'type' coming from stores
        'cluster': 'store_cluster',
    })
)

full_data

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,transactions,holiday_type,holiday_locale,holiday_locale_name,holiday_transferred,dcoilwtico,city,state,store_type,store_cluster
0,561,2013-01-01,25,AUTOMOTIVE,0.000,0,770,Holiday,National,Ecuador,False,,Salinas,Santa Elena,D,1
1,562,2013-01-01,25,BABY CARE,0.000,0,770,Holiday,National,Ecuador,False,,Salinas,Santa Elena,D,1
2,563,2013-01-01,25,BEAUTY,2.000,0,770,Holiday,National,Ecuador,False,,Salinas,Santa Elena,D,1
3,564,2013-01-01,25,BEVERAGES,810.000,0,770,Holiday,National,Ecuador,False,,Salinas,Santa Elena,D,1
4,565,2013-01-01,25,BOOKS,0.000,0,770,Holiday,National,Ecuador,False,,Salinas,Santa Elena,D,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2805226,3000883,2017-08-15,9,POULTRY,438.133,0,2155,Holiday,Local,Riobamba,False,47.57,Quito,Pichincha,B,6
2805227,3000884,2017-08-15,9,PREPARED FOODS,154.553,1,2155,Holiday,Local,Riobamba,False,47.57,Quito,Pichincha,B,6
2805228,3000885,2017-08-15,9,PRODUCE,2419.729,148,2155,Holiday,Local,Riobamba,False,47.57,Quito,Pichincha,B,6
2805229,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,2155,Holiday,Local,Riobamba,False,47.57,Quito,Pichincha,B,6


In [29]:
# Number of missing values in each column of the merged frame.
full_data.isna().sum(axis=0)

id                           0
date                         0
store_nbr                    0
family                       0
sales                        0
onpromotion                  0
transactions                 0
holiday_type           2346168
holiday_locale               0
holiday_locale_name    2346168
holiday_transferred    2346168
dcoilwtico                  33
city                         0
state                        0
store_type                   0
store_cluster                0
dtype: int64

In [31]:
# Every column whose name contains 'holiday_' gets its missing values
# replaced with the 'NoEvent' marker.
holiday_columns = full_data.filter(like='holiday_').columns
full_data[holiday_columns] = full_data[holiday_columns].fillna('NoEvent')

In [32]:
# Recount the missing values per column to see what is still unfilled.
full_data.isna().sum(axis=0)

id                      0
date                    0
store_nbr               0
family                  0
sales                   0
onpromotion             0
transactions            0
holiday_type            0
holiday_locale          0
holiday_locale_name     0
holiday_transferred     0
dcoilwtico             33
city                    0
state                   0
store_type              0
store_cluster           0
dtype: int64