# Пропущенные значения

In [1]:
import pandas as pd
import numpy as np

# Import from the public IPython.display module: importing `display` from
# IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML

# Inject CSS that sets elements with the `.container` class to 90% width.
display(HTML("<style>.container { width:90% !important; }</style>"))

# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
bikes = pd.read_pickle('Data/BikesDataVars.pkl')
bikes.head(3)

Unnamed: 0,Date,Hour,Temperature,Humidity,Wind speed,Rainfall,Snowfall,Seasons,Holiday,Functioning Day,Rental Count,Normal Humidity,Temperature Category,Good Weather
0,2017-12-01,0,-5.2,37,2.2,0.0,0.0,Winter,0,True,257,0,Freezing,0
1,2017-12-01,1,-5.5,38,0.8,0.0,0.0,Winter,0,True,219,0,Freezing,0
2,2017-12-01,2,-6.0,39,1.0,0.0,0.0,Winter,0,True,162,0,Freezing,0


In [2]:
# Column dtypes and non-null counts; a count below the number of entries marks a column with gaps.
bikes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Date                  8760 non-null   datetime64[ns]
 1   Hour                  8760 non-null   int64         
 2   Temperature           8581 non-null   float64       
 3   Humidity              8760 non-null   int64         
 4   Wind speed            8760 non-null   float64       
 5   Rainfall              8760 non-null   float64       
 6   Snowfall              8760 non-null   float64       
 7   Seasons               8760 non-null   object        
 8   Holiday               8760 non-null   int64         
 9   Functioning Day       8760 non-null   bool          
 10  Rental Count          8760 non-null   int64         
 11  Normal Humidity       8760 non-null   int64         
 12  Temperature Category  8581 non-null   category      
 13  Good Weather      

## Подсчет пропусков

In [3]:
# Number of missing (NaN) values in each column.
bikes.isna().sum()

Date                      0
Hour                      0
Temperature             179
Humidity                  0
Wind speed                0
Rainfall                  0
Snowfall                  0
Seasons                   0
Holiday                   0
Functioning Day           0
Rental Count              0
Normal Humidity           0
Temperature Category    179
Good Weather              0
dtype: int64

In [4]:
# Shape of the sub-frame made of the rows whose Temperature is missing.
bikes.loc[bikes['Temperature'].isna()].shape

(179, 14)

In [5]:
# Python type of one missing Temperature cell (index label 39): a NaN is still a float.
type(bikes.loc[bikes['Temperature'].isna(), 'Temperature'].loc[39])

numpy.float64

In [6]:
# Shape of the sub-frame made of the rows whose Temperature is present.
bikes.loc[bikes['Temperature'].notna()].shape

(8581, 14)

## Что делать с пропусками

### Вар 1 - убрать колонку целиком

### Вар 2 - убрать пропущенные значения

In [7]:
# Row-wise deletion: dropna returns a new frame without the rows where Temperature
# is missing; the result is not assigned, so bikes itself stays unchanged.
bikes.dropna(subset=['Temperature']).shape

(8581, 14)

### Вар 3 - заполнение числом

In [8]:
# Fill the gaps with one fixed number (42 looks like an arbitrary demo value).
# NOTE: the column is named Temperature_Median, but here it holds the constant fill;
# a later cell overwrites it with the actual median fill.

bikes['Temperature_Median'] = bikes['Temperature'].fillna(42)

In [9]:
# Original vs. filled column, restricted to the rows where Temperature is missing.
bikes.loc[bikes['Temperature'].isna(), ['Temperature', 'Temperature_Median']].head()

Unnamed: 0,Temperature,Temperature_Median
39,,42.0
50,,42.0
64,,42.0
105,,42.0
151,,42.0


In [10]:
# Replace every gap with the median of the observed temperatures (one global value).
bikes['Temperature_Median'] = bikes['Temperature'].fillna(bikes['Temperature'].median())

In [11]:
# Original vs. median-filled column, restricted to the rows where Temperature is missing.
bikes.loc[bikes['Temperature'].isna(), ['Temperature', 'Temperature_Median']].head()

Unnamed: 0,Temperature,Temperature_Median
39,,13.7
50,,13.7
64,,13.7
105,,13.7
151,,13.7


In [12]:
# Drawback: a single global median ignores other factors, for example seasonality.
# Rows at positions 38-41 show the filled value next to its observed neighbours.
bikes[['Seasons', 'Temperature_Median']].iloc[38:42]

Unnamed: 0,Seasons,Temperature_Median
38,Winter,7.3
39,Winter,13.7
40,Winter,6.4
41,Winter,6.0


### Вар 4 - заполнение случайными данными

In [13]:
# hot deck imputation: replace a missing value with a value drawn at random
# from the observed (non-missing) temperatures.
# Seed NumPy's global generator first so that this draw, and the later
# np.random.choice calls in this notebook, repeat on Restart & Run All.
np.random.seed(42)
np.random.choice(bikes['Temperature'].dropna())

3.6

In [14]:
# np.random.choice without a size argument returns a single value, so every
# missing Temperature receives that same one draw.
bikes['Temperature_Random'] = bikes['Temperature'].fillna(value=np.random.choice(bikes['Temperature'].dropna()))

# First rows whose original Temperature is missing, with the filled columns alongside.
bikes[bikes['Temperature'].isna()].head()

Unnamed: 0,Date,Hour,Temperature,Humidity,Wind speed,Rainfall,Snowfall,Seasons,Holiday,Functioning Day,Rental Count,Normal Humidity,Temperature Category,Good Weather,Temperature_Median,Temperature_Random
39,2017-12-02,15,,41,2.3,0.0,0.0,Winter,0,True,688,1,,0,13.7,21.2
50,2017-12-03,2,,79,1.4,0.0,0.0,Winter,0,True,262,0,,0,13.7,21.2
64,2017-12-03,16,,77,1.6,0.0,0.0,Winter,0,True,577,0,,0,13.7,21.2
105,2017-12-05,9,,31,1.3,0.0,0.0,Winter,0,True,313,0,,0,13.7,21.2
151,2017-12-07,7,,93,0.5,0.0,0.9,Winter,0,True,269,0,,0,13.7,21.2


In [15]:
# Current (rows, columns) of bikes; the row count is used below to size the random sample.
bikes.shape

(8760, 16)

In [16]:
# Draw one observed temperature per row of bikes (np.random.choice samples with
# replacement by default), so that each gap can receive its own value.
temps = np.random.choice(bikes['Temperature'].dropna(), size=len(bikes))
temps[:30]

array([  8.9,  -3.5,  13.4,  20.5,  24.8,  16.2,   4.4, -11.5,  -6.5,
        18.8,  -5.2,  28.4,  29.3,  29.8,  -0.4,  16.2,  -0.8,  -1.8,
        -0.5,  16.2,  -0.8,  -0.1,  11.1,  21.1,  -4.4,  18.9,  -0.4,
        10.4,  14.4,  -0.3])

In [17]:
# fillna with a Series fills each gap with the value that has the same index label.
# pd.Series(temps) gets a default 0..n-1 index.
# NOTE(review): this lines up with bikes only while bikes keeps its default RangeIndex.
bikes['Temperature_Random'] = bikes['Temperature'].fillna(pd.Series(temps))

In [18]:
# First rows whose original Temperature is missing; Temperature_Random now differs from row to row.
bikes[bikes['Temperature'].isna()].head()

Unnamed: 0,Date,Hour,Temperature,Humidity,Wind speed,Rainfall,Snowfall,Seasons,Holiday,Functioning Day,Rental Count,Normal Humidity,Temperature Category,Good Weather,Temperature_Median,Temperature_Random
39,2017-12-02,15,,41,2.3,0.0,0.0,Winter,0,True,688,1,,0,13.7,-7.8
50,2017-12-03,2,,79,1.4,0.0,0.0,Winter,0,True,262,0,,0,13.7,31.0
64,2017-12-03,16,,77,1.6,0.0,0.0,Winter,0,True,577,0,,0,13.7,10.0
105,2017-12-05,9,,31,1.3,0.0,0.0,Winter,0,True,313,0,,0,13.7,25.7
151,2017-12-07,7,,93,0.5,0.0,0.9,Winter,0,True,269,0,,0,13.7,7.8


In [19]:
# Rows at positions 38-41: compare the randomly filled value with the neighbouring observed hours.
bikes.iloc[38:42]

Unnamed: 0,Date,Hour,Temperature,Humidity,Wind speed,Rainfall,Snowfall,Seasons,Holiday,Functioning Day,Rental Count,Normal Humidity,Temperature Category,Good Weather,Temperature_Median,Temperature_Random
38,2017-12-02,14,7.3,35,1.3,0.0,0.0,Winter,0,True,1054,0,Chilly,0,7.3,7.3
39,2017-12-02,15,,41,2.3,0.0,0.0,Winter,0,True,688,1,,0,13.7,-7.8
40,2017-12-02,16,6.4,48,2.6,0.0,0.0,Winter,0,True,592,1,Chilly,0,6.4,6.4
41,2017-12-02,17,6.0,51,2.5,0.0,0.0,Winter,0,True,141,1,Chilly,0,6.0,6.0


In [20]:
# Median Temperature for every (ISO week number, Hour) pair - one row per group.
# NOTE(review): the ISO week number carries no year, so equal week numbers from
# different years fall into the same group - confirm this is intended.
bikes.groupby(by=[bikes['Date'].dt.isocalendar().week, 'Hour'])['Temperature'].median()

week  Hour
1     0      -4.3
      1      -4.8
      2      -5.3
      3      -5.5
      4      -5.1
             ... 
52    19     -0.4
      20     -1.0
      21     -1.6
      22     -1.7
      23     -1.0
Name: Temperature, Length: 1248, dtype: float64

In [21]:
# transform('median') returns a Series with the same index as bikes, where each
# row carries the median Temperature of its own (ISO week, Hour) group.
temp_medians = (
    bikes
    .groupby(by=[bikes['Date'].dt.isocalendar().week, 'Hour'])['Temperature']
    .transform('median')
)
temp_medians.head()

0    2.75
1    2.50
2    1.35
3    2.15
4    2.15
Name: Temperature, dtype: float64

In [22]:
# Fill each gap with the median of its own (week, hour) group; observed values stay as they are.
bikes['Temperature_Median_Group'] = bikes['Temperature'].fillna(value=temp_medians)
bikes.loc[bikes['Temperature'].isna()].head()

Unnamed: 0,Date,Hour,Temperature,Humidity,Wind speed,Rainfall,Snowfall,Seasons,Holiday,Functioning Day,Rental Count,Normal Humidity,Temperature Category,Good Weather,Temperature_Median,Temperature_Random,Temperature_Median_Group
39,2017-12-02,15,,41,2.3,0.0,0.0,Winter,0,True,688,1,,0,13.7,-7.8,7.5
50,2017-12-03,2,,79,1.4,0.0,0.0,Winter,0,True,262,0,,0,13.7,31.0,1.35
64,2017-12-03,16,,77,1.6,0.0,0.0,Winter,0,True,577,0,,0,13.7,10.0,7.2
105,2017-12-05,9,,31,1.3,0.0,0.0,Winter,0,True,313,0,,0,13.7,25.7,-2.4
151,2017-12-07,7,,93,0.5,0.0,0.9,Winter,0,True,269,0,,0,13.7,7.8,-4.85


In [23]:
# Rows at positions 38-41: the three fill strategies side by side for the gap at label 39.
bikes.iloc[38:42]

Unnamed: 0,Date,Hour,Temperature,Humidity,Wind speed,Rainfall,Snowfall,Seasons,Holiday,Functioning Day,Rental Count,Normal Humidity,Temperature Category,Good Weather,Temperature_Median,Temperature_Random,Temperature_Median_Group
38,2017-12-02,14,7.3,35,1.3,0.0,0.0,Winter,0,True,1054,0,Chilly,0,7.3,7.3,7.3
39,2017-12-02,15,,41,2.3,0.0,0.0,Winter,0,True,688,1,,0,13.7,-7.8,7.5
40,2017-12-02,16,6.4,48,2.6,0.0,0.0,Winter,0,True,592,1,Chilly,0,6.4,6.4,6.4
41,2017-12-02,17,6.0,51,2.5,0.0,0.0,Winter,0,True,141,1,Chilly,0,6.0,6.0,6.0


In [24]:
# Rows at positions 102-106: the same comparison around the gap at label 105.
bikes.iloc[102:107]

Unnamed: 0,Date,Hour,Temperature,Humidity,Wind speed,Rainfall,Snowfall,Seasons,Holiday,Functioning Day,Rental Count,Normal Humidity,Temperature Category,Good Weather,Temperature_Median,Temperature_Random,Temperature_Median_Group
102,2017-12-05,6,-7.4,34,3.2,0.0,0.0,Winter,0,True,174,0,Freezing,0,-7.4,-7.4,-7.4
103,2017-12-05,7,-7.5,33,2.2,0.0,0.0,Winter,0,True,494,0,Freezing,0,-7.5,-7.5,-7.5
104,2017-12-05,8,-8.1,36,1.6,0.0,0.0,Winter,0,True,191,0,Freezing,0,-8.1,-8.1,-8.1
105,2017-12-05,9,,31,1.3,0.0,0.0,Winter,0,True,313,0,,0,13.7,25.7,-2.4
106,2017-12-05,10,-5.5,26,2.8,0.0,0.0,Winter,0,True,62,0,Freezing,0,-5.5,-5.5,-5.5


In [25]:
# Reload the source pickle (this discards the demo columns added above) and keep
# only the group-median imputation of Temperature, using temp_medians computed earlier.
bikes = pd.read_pickle('Data/BikesDataVars.pkl').assign(
    Temperature=lambda frame: frame['Temperature'].fillna(temp_medians)
)

In [26]:
def get_temp_cat(temp):
    """Map a temperature reading to a category label.

    Labels by range: below 0 -> 'Freezing', below 15 -> 'Chilly',
    below 26 -> 'Nice', 26 and above -> 'Hot'.

    Args:
        temp: numeric temperature value.

    Returns:
        The label string, or ``temp`` itself when none of the comparisons
        holds (this is the case for NaN, which compares false to everything).
    """
    # Exclusive upper bounds, checked in ascending order.
    for upper_bound, label in ((0, 'Freezing'), (15, 'Chilly'), (26, 'Nice')):
        if temp < upper_bound:
            return label
    if temp >= 26:
        return 'Hot'
    # No comparison succeeded: hand the value back unchanged.
    return temp
    
# Recompute the category from the imputed Temperature and store it as a categorical column.
bikes['Temperature Category'] = pd.Categorical(bikes['Temperature'].apply(get_temp_cat))

In [27]:
# Save the imputed frame to a new pickle file; the source pickle is left untouched.
bikes.to_pickle('Data/BikesDataImputed.pkl')

In [28]:
# Display the final imputed frame.
bikes

Unnamed: 0,Date,Hour,Temperature,Humidity,Wind speed,Rainfall,Snowfall,Seasons,Holiday,Functioning Day,Rental Count,Normal Humidity,Temperature Category,Good Weather
0,2017-12-01,0,-5.2,37,2.2,0.0,0.0,Winter,0,True,257,0,Freezing,0
1,2017-12-01,1,-5.5,38,0.8,0.0,0.0,Winter,0,True,219,0,Freezing,0
2,2017-12-01,2,-6.0,39,1.0,0.0,0.0,Winter,0,True,162,0,Freezing,0
3,2017-12-01,3,-6.2,40,0.9,0.0,0.0,Winter,0,True,148,1,Freezing,0
4,2017-12-01,4,-6.0,36,2.3,0.0,0.0,Winter,0,True,97,0,Freezing,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2018-11-30,19,4.2,34,2.6,0.0,0.0,Autumn,0,True,644,0,Chilly,0
8756,2018-11-30,20,3.4,37,2.3,0.0,0.0,Autumn,0,True,359,0,Chilly,0
8757,2018-11-30,21,2.6,39,0.3,0.0,0.0,Autumn,0,True,1236,0,Chilly,0
8758,2018-11-30,22,2.1,41,1.0,0.0,0.0,Autumn,0,True,628,1,Chilly,0
