In [77]:
import pandas as pd 
import plotly.graph_objects as go
import clean
import os
from sklearn.impute import KNNImputer, SimpleImputer
import warnings
import numpy as np

In [82]:
df = pd.read_csv('dengue_features_train.csv').loc[1:50, :]
df_clean = pd.read_csv('dengue_features_train_CLEAN.csv').dropna().loc[1:50, :]

def _impute_data(df: pd.DataFrame, categorical: bool = True) -> pd.DataFrame:
    """Imputes missing numerical or categorical values if the percentage of rows containing NaN's is > 5%.
    Else, returns a dataframe without those rows.
        Usage:
        -------
        dataframe_no_nan = impute_data(dataframe_with_nan)
    """
    df = df.infer_objects()

    if df.isna().sum().sum() / df.shape[0] <= 0.05:
        return df.dropna()
    
    knnimp = KNNImputer() # for numeric 
    simpimp = SimpleImputer(strategy='most_frequent') # for categorical

    for column in df.columns:
        if _is_likely_categorical(df[column]):
            warnings.warn("Column {} is likely categorical, creating dummies... run with categorical=False to disable".format(column))
            df[column] = simpimp.fit_transform(df[[column]])
    for column in df.columns:
        if _is_numeric(df[column]):
            print(df[column].dtypes)
            df[column] = knnimp.fit_transform(df[column].to_frame())
        else:
            df[column].fillna(value='')
    return df

def _is_likely_categorical(df_col: pd.Series) -> bool:
    return df_col.nunique() / df_col.count() < 0.05

def _is_numeric(df_col: pd.Series) -> bool:
    return np.issubdtype(df_col.dtype, np.number) 

In [86]:
_impute_data(df, categorical=False)

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
1,sj,1990.0,19.0,1990-05-07,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,...,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
2,sj,1990.0,20.0,1990-05-14,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,...,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4
3,sj,1990.0,21.0,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,...,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0
4,sj,1990.0,22.0,1990-05-28,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,...,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8
5,sj,1990.0,23.0,1990-06-04,0.181978,0.17485,0.254314,0.181743,9.58,299.63,...,26.49,79.891429,9.58,17.212857,2.1,28.114286,6.942857,34.4,23.9,39.1
6,sj,1990.0,24.0,1990-06-11,0.1129,0.0928,0.205071,0.210271,3.48,299.207143,...,38.6,82.0,3.48,17.234286,2.042857,27.414286,6.771429,32.2,23.3,29.7
7,sj,1990.0,25.0,1990-06-18,0.0725,0.0725,0.151471,0.133029,151.12,299.591429,...,30.0,83.375714,151.12,17.977143,1.571429,28.371429,7.685714,33.9,22.8,21.1
8,sj,1990.0,26.0,1990-06-25,0.10245,0.146175,0.125571,0.1236,19.32,299.578571,...,37.51,82.768571,19.32,17.79,1.885714,28.328571,7.385714,33.9,22.8,21.1
9,sj,1990.0,27.0,1990-07-02,0.181978,0.12155,0.160683,0.202567,14.41,300.154286,...,28.4,81.281429,14.41,18.071429,2.014286,28.328571,6.514286,33.9,24.4,1.1
10,sj,1990.0,28.0,1990-07-09,0.192875,0.08235,0.191943,0.152929,22.27,299.512857,...,43.72,81.467143,22.27,17.418571,2.157143,27.557143,7.157143,31.7,21.7,63.7
