# Missing Data Imputation using KNN

In [1]:
import numpy as np
import pandas as pd 
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
)

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer

In [2]:
#import dataset
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd drive/MyDrive/peak_prediction/


/content/drive/MyDrive/peak_prediction


In [4]:
ls


 [0m[01;34mcatboost_info[0m/
 daily_temperatures_impute_missing.csv
 imputed_data.csv
 japan.csv
 kyoto.csv
 liestal.csv
 merged_washington_1.csv
'merged_washington (1).csv'
 merged_washington_co2_test.csv
 merged_washington_co2_test_excel.csv
 merged_washington_co2_test.gsheet
 merged_washington.csv
 merged_washington_test.csv
 merged_washington_test_excel.csv
 merged_washington_test.gsheet
 meteoswiss.csv
 README.md
 south_korea.csv
 USA-NPN_individual_phenometrics_data.csv
 USA-NPN_individual_phenometrics_datafield_descriptions.csv
 USA-NPN_status_intensity_datafield_descriptions.csv
 USA-NPN_status_intensity_observations_data.csv
 vancouver.csv
 washingtondc.csv


In [5]:
df = pd.read_csv("daily_temperatures_impute_missing.csv")
filtered_df = df[df['year'] >= 1950]
filtered_df.tail()


Unnamed: 0,location,date,tmax,tmin,prcp,year,month,season,month_name
92192,vancouver,2022-02-24,,,,2022,2,Winter,February
92193,vancouver,2022-02-25,,,,2022,2,Winter,February
92194,vancouver,2022-02-26,,,,2022,2,Winter,February
92195,vancouver,2022-02-27,,,,2022,2,Winter,February
92196,vancouver,2022-02-28,,,,2022,2,Winter,February


##Set up the KNNimputer and select columns used for imputations


In [6]:
imputer = KNNImputer(n_neighbors=10, weights='distance', metric='nan_euclidean')

In [7]:
for_imputation = [
    "tmax",
    "tmin",
    "prcp",
]

others = ["location", "year", "date", "month", "season", "month_name"]

In [8]:
for_impute_df = filtered_df[for_imputation]
for_impute_df

Unnamed: 0,tmax,tmin,prcp
0,11.7,,0.0
1,14.4,,0.0
2,15.0,,18.0
3,21.7,,5.0
4,21.1,,0.0
...,...,...,...
92192,,,
92193,,,
92194,,,
92195,,,


In [9]:
impute_df = pd.DataFrame(imputer.fit_transform(for_impute_df),columns = for_impute_df.columns).reset_index()
impute_df

Unnamed: 0,index,tmax,tmin,prcp
0,0,11.700000,9.150000,0.000000
1,1,14.400000,6.850000,0.000000
2,2,15.000000,5.080000,18.000000
3,3,21.700000,4.790000,5.000000
4,4,21.100000,10.960000,0.000000
...,...,...,...,...
92192,92192,18.114105,8.081758,34.795868
92193,92193,18.114105,8.081758,34.795868
92194,92194,18.114105,8.081758,34.795868
92195,92195,18.114105,8.081758,34.795868


In [10]:
others_df = filtered_df[others].reset_index()
others_df

Unnamed: 0,index,location,year,date,month,season,month_name
0,0,washingtondc,1950,1950-01-01,1,Winter,January
1,1,washingtondc,1950,1950-01-02,1,Winter,January
2,2,washingtondc,1950,1950-01-03,1,Winter,January
3,3,washingtondc,1950,1950-01-04,1,Winter,January
4,4,washingtondc,1950,1950-01-05,1,Winter,January
...,...,...,...,...,...,...,...
92192,92192,vancouver,2022,2022-02-24,2,Winter,February
92193,92193,vancouver,2022,2022-02-25,2,Winter,February
92194,92194,vancouver,2022,2022-02-26,2,Winter,February
92195,92195,vancouver,2022,2022-02-27,2,Winter,February


##Assemble final impute dataset 

In [11]:
final_df = pd.concat([others_df, impute_df], axis = 1)
final_df = final_df.drop(columns = ['index'])

In [12]:
final_df.isna().sum()

location      0
year          0
date          0
month         0
season        0
month_name    0
tmax          0
tmin          0
prcp          0
dtype: int64

In [13]:
final_df.to_csv('imputed_data.csv', index=False)