# Анализ и обработка пропусков в данных

In [1]:
import warnings
warnings.filterwarnings('ignore')
from pprint import pprint, pformat
from pathlib import Path

import joblib

from my_lib import *
from my_config import *

In [2]:
# Restore the dataset and the params object from the files named by the
# *_after_EDA settings (presumably written by the preceding EDA notebook).
# NOTE(review): `result_foler` (sic) is provided by the star imports above,
# so the spelling has to stay in sync with its definition there.
dataset_df = joblib.load(Path(result_foler) / dataset_filename_after_EDA)
params = joblib.load(Path(result_foler) / params_filename_after_EDA)

In [3]:
print("Доля пропусков в столбцах в %:")
nan_in_columns = nans_percents(dataset_df)
# Keep only the columns whose share of missing values is above zero.
columns_with_gaps = nan_in_columns[nan_in_columns > 0]
# Store the names of those columns under params["columns_with_nan"].
params["columns_with_nan"] = columns_with_gaps.index.to_list()
# Last expression of the cell: shown as the cell output.
columns_with_gaps

Доля пропусков в столбцах в %:


Age                      1.558750
Annual Income            3.745750
Marital Status           1.544083
Number of Dependents     9.139333
Occupation              29.839583
Health Score             6.173000
Previous Claims         30.335750
Vehicle Age              0.000500
Credit Score            11.490167
Insurance Duration       0.000083
Customer Feedback        6.485333
dtype: float64

Часть столбцов содержит пропуски — в большинстве случаев их доля невелика (менее 12 %), исключение — `Occupation` и `Previous Claims` (около 30 %).

Заполним пропуски модой.

P.S. Пробовал удалять столбцы `Occupation` и `Previous Claims`, так как в них значительная доля пропусков (около 30 %), — конечный результат практически не менялся.

## Заполнение пропусков модой

In [4]:
# Column-wide modes used as fill values in the next cell.
# `Series.mode()` can return several values on ties; `[0]` takes the first one.
mode_age, mode_previous_claims, mode_vehicle_age = (
    dataset_df[column_name].mode()[0]
    for column_name in ('Age', 'Previous Claims', 'Vehicle Age')
)

In [5]:
# Work on a copy so the frame loaded from disk stays untouched.
dataset_df_wo_nan = dataset_df.copy()

# `fill_with_mode(frame, key_columns, target_column)` is a project helper from
# the star imports; presumably it fills NaNs of `target_column` in place with
# the mode taken within groups of `key_columns` — TODO confirm in my_lib.
#
# The order of the steps below matters: columns filled earlier ('Age',
# 'Annual Income', 'Marital Status') are used as key columns by later steps.

# Step 1: 'Age' gets the column-wide mode.
dataset_df_wo_nan.fillna({'Age': mode_age}, inplace=True)

# Step 2: first batch of helper-based fills.
for key_columns, target_column in (
    (['Education Level', 'Location', 'Property Type'], 'Annual Income'),
    (['Age', 'Gender', 'Annual Income', 'Education Level'], 'Marital Status'),
    (['Age', 'Annual Income', 'Marital Status', 'Education Level', 'Location'], 'Number of Dependents'),
    (['Gender', 'Education Level', 'Location', 'Annual Income'], 'Occupation'),
    (['Age', 'Gender', 'Education Level', 'Smoking Status', 'Exercise Frequency'], 'Health Score'),
):
    fill_with_mode(dataset_df_wo_nan, key_columns, target_column)

# Step 3: two more columns get their column-wide modes.
dataset_df_wo_nan.fillna(
    {'Previous Claims': mode_previous_claims, 'Vehicle Age': mode_vehicle_age},
    inplace=True,
)

# Step 4: second batch of helper-based fills.
for key_columns, target_column in (
    (['Education Level', 'Location', 'Property Type', 'Annual Income'], 'Credit Score'),
    (['Policy Type', 'Policy Start Date'], 'Insurance Duration'),
    (['Education Level', 'Policy Type', 'Policy Start Date', 'Gender'], 'Customer Feedback'),
):
    fill_with_mode(dataset_df_wo_nan, key_columns, target_column)

# Sanity check: print the share of missing values per column after filling.
print(nans_percents(dataset_df_wo_nan))

Age                     0.0
Gender                  0.0
Annual Income           0.0
Marital Status          0.0
Number of Dependents    0.0
Education Level         0.0
Occupation              0.0
Health Score            0.0
Location                0.0
Policy Type             0.0
Previous Claims         0.0
Vehicle Age             0.0
Credit Score            0.0
Insurance Duration      0.0
Policy Start Date       0.0
Customer Feedback       0.0
Smoking Status          0.0
Exercise Frequency      0.0
Property Type           0.0
Premium Amount          0.0
dtype: float64


In [6]:
print("Уникальные значения по столбцам после заполнения пропусков")
nunique = dataset_df_wo_nan[params["columns_X"]].nunique()
# One line per column: name, number of distinct values and, for columns with
# at most 10 distinct values, the values themselves.
for column, distinct_count in nunique.items():
    report_line = f'{column:20}: {distinct_count:6}'
    if distinct_count <= 10:
        report_line += f', {dataset_df_wo_nan[column].unique().tolist()}'
    print(report_line)

Уникальные значения по столбцам после заполнения пропусков
Age                 :     47
Gender              :      2, ['Female', 'Male']
Annual Income       :     11
Marital Status      :      3, ['Married', 'Divorced', 'Single']
Number of Dependents:      5, [0.0, 1.0, 3.0, 4.0, 2.0]
Education Level     :      4, ["Bachelor's", "Master's", 'High School', 'PhD']
Occupation          :      3, ['Employed', 'Self-Employed', 'Unemployed']
Health Score        :   2519
Location            :      3, ['Urban', 'Rural', 'Suburban']
Policy Type         :      3, ['Premium', 'Comprehensive', 'Basic']
Previous Claims     :     10, [2.0, 1.0, 0.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
Vehicle Age         :     20
Credit Score        :     14
Insurance Duration  :      9, [5.0, 2.0, 1.0, 4.0, 3.0, 8.0, 6.0, 9.0, 7.0]
Policy Start Date   : 167381
Customer Feedback   :      3, ['Poor', 'Average', 'Good']
Smoking Status      :      2, ['No', 'Yes']
Exercise Frequency  :      4, ['Weekly', 'Monthly', 'Dail

In [7]:
# Save the intermediate results: the filled dataset and the params object.
# The result of joblib.dump is bound to `_` so it is not echoed as cell output.
for artifact, artifact_filename in (
    (dataset_df_wo_nan, dataset_filename_after_PrepareNans),
    (params, params_filename_after_PrepareNans),
):
    _ = joblib.dump(artifact, Path(result_foler) / artifact_filename, compress=3)