In [1]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

In [2]:
#load the pre-processed frame; "TIME OCC" is read as text rather than
#letting pandas infer a dtype for it
df = pd.read_csv(
    "df_for_imputation.csv",
    dtype={"TIME OCC": str},
)

In [3]:
#get likely true number of crimes at noon
#(estimated as the mean of the row counts for the neighbouring hours 11 and 13)
noon_avg = int((df["hour"].eq(11).sum() + df["hour"].eq(13).sum()) / 2)

#get length of data to drop
all_noons = df[df["hour"] == 12]
#clamp at zero: if hour 12 has no more rows than the 11/13 average there is
#nothing to blank out, and a negative n would make DataFrame.sample raise
drop_len = max(len(all_noons) - noon_avg, 0)

#drop random noons
#the selection is fully determined by random_state below; the global seed is
#kept only so any later code relying on NumPy's global RNG state is unaffected
np.random.seed(42)
drop_noons = all_noons.sample(n=drop_len, random_state=42).index

#replace noons with nans
#cast to float first: writing NaN into an integer column through .loc relies on
#an implicit upcast that recent pandas versions warn about and plan to reject
df["hour_less_12"] = df["hour"].astype(float)
df.loc[drop_noons, "hour_less_12"] = np.nan

In [4]:
#one LabelEncoder per categorical feature, each kept under its own name
crime_le, day_le, area_le = LabelEncoder(), LabelEncoder(), LabelEncoder()

#fit every encoder on its source column and store the integer codes
#NOTE(review): the codes are arbitrary integers, so distances between
#different categories follow code order rather than real similarity --
#confirm this is acceptable for the KNN step below
for source_col, encoded_col, encoder in (
    ("crime_group", "crime_encoded", crime_le),
    ("day", "day_encoded", day_le),
    ("area", "area_encoded", area_le),
):
    df[encoded_col] = encoder.fit_transform(df[source_col])

#feature matrix the neighbours are searched in
imputer_features = df[["crime_encoded", "day_encoded", "area_encoded"]].values

#stack the partially-missing hour as the last column, fill its gaps from the
#3 nearest rows, then read that last column back out
imputer = KNNImputer(n_neighbors=3)
imputation_input = np.column_stack([imputer_features, df["hour_less_12"].values])
imputed_matrix = imputer.fit_transform(imputation_input)
df["hour_imputed"] = imputed_matrix[:, -1]

#imputed values can be fractional; round back to whole hours
df["hour_imputed"] = df["hour_imputed"].round().astype(int)

In [5]:
#remove the original hour and all helper columns, then let the imputed
#hour take over the "hour" name
helper_columns = ["hour", "hour_less_12", "crime_encoded", "day_encoded", "area_encoded"]
df = (
    df
    .drop(columns=helper_columns)
    .rename(columns={"hour_imputed": "hour"})
)

In [9]:
#write the imputed frame to disk for the downstream notebook;
#the row index is not written
df.to_csv(
    "df_after_imputation.csv",
    index=False,
)