In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [24]:

class DataImputer:
    """
    Regression-based imputer for missing feature values in a DataFrame.

    For every non-target column that contains missing values, a linear
    regression model is fitted on the rows where that column is observed
    and then used to predict the rows where it is missing.

    Predictors for a column are the other columns of the frame that have no
    missing values; the target column (Y) is never used as a predictor, so
    the label cannot leak into the imputed features. The predictor columns
    are passed straight to LinearRegression, so they are expected to be
    numeric or boolean.
    """

    def __init__(self, df, target_column):
        """
        Initializes the DataImputer with the given DataFrame and target column (Y).

        Parameters:
        df (pd.DataFrame): The input DataFrame with missing values.
        target_column (str): The name of the target column (Y). It does not
            have to be present in df; if it is, it is neither imputed nor
            used as a predictor.
        """
        self.df = df
        self.target_column = target_column
        # Maps column name -> fitted model used to fill that column.
        self.imputers = {}

    def _feature_columns(self, col):
        """
        Returns the predictor columns used to model `col`.

        Only fully observed columns are eligible (a regression model cannot
        be fitted or evaluated on NaN inputs), and both `col` itself and the
        target column are excluded. Training and imputation both call this
        helper so the model always sees the same columns in the same order.
        """
        complete_cols = self.df.columns[self.df.notnull().all()]
        excluded = (col, self.target_column)
        return [name for name in complete_cols if name not in excluded]

    def train_models(self):
        """
        Trains linear regression models to predict each column with missing values.

        One model per incomplete non-target column is stored in
        `self.imputers`, keyed by column name.
        """
        missing_cols = self.df.columns[self.df.isnull().any()]

        for col in missing_cols:
            if col == self.target_column:  # Never impute the target column
                continue

            features = self._feature_columns(col)
            observed = self.df[col].notnull()

            # Fit on the rows where the column is known.
            model = LinearRegression()
            model.fit(self.df.loc[observed, features], self.df.loc[observed, col])

            # Store the trained model
            self.imputers[col] = model

    def impute_values(self):
        """
        Imputes missing values in the DataFrame using the trained models.

        Returns:
        pd.DataFrame: A copy of the input DataFrame in which the missing
        entries of every column that has a model in `self.imputers` are
        replaced by that model's predictions. The input DataFrame itself is
        left unchanged.
        """
        df_filled = self.df.copy()

        for col, model in self.imputers.items():
            missing = self.df[col].isnull()
            if not missing.any():
                continue  # nothing to fill for this column

            # Predict all missing rows of the column in a single call, passing
            # a DataFrame so the column names match those seen during fit.
            features = self._feature_columns(col)
            df_filled.loc[missing, col] = model.predict(self.df.loc[missing, features])

        return df_filled

    def get_imputed_dataframe(self):
        """
        Returns a new DataFrame with imputed values.

        Trains the per-column models and then applies them.

        Returns:
        pd.DataFrame: The DataFrame with imputed values.
        """
        self.train_models()
        return self.impute_values()




In [26]:
# Load the housing data set from a local CSV file and preview the first rows.
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
housing_csv = r'D:\python\test_csv_files\housing.csv'
df = pd.read_csv(housing_csv)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200


In [27]:
# ocean_proximity is categorical, so one-hot encode it into indicator columns.
# The guard makes the cell safe to re-run: once the column has been encoded it
# no longer exists, and calling get_dummies on it again would raise a KeyError.
if 'ocean_proximity' in df.columns:
    df = pd.get_dummies(df, columns=['ocean_proximity'])
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,False,False,False,True,False
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,False,False,False,True,False
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,False,False,False,True,False
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,False,False,False,True,False
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,False,False,False,True,False


In [28]:
# median_house_value is the target column: keep it as y and
# remove it from the input features x.
y = df['median_house_value']
x = df.drop('median_house_value', axis=1)


In [31]:
# Count the missing entries in every feature column before imputation.
missing_values = x.isna().sum()
print(missing_values)

longitude                       0
latitude                        0
housing_median_age              0
total_rooms                     0
total_bedrooms                207
population                      0
households                      0
median_income                   0
ocean_proximity_<1H OCEAN       0
ocean_proximity_INLAND          0
ocean_proximity_ISLAND          0
ocean_proximity_NEAR BAY        0
ocean_proximity_NEAR OCEAN      0
dtype: int64


In [33]:
# Fill the gaps in the feature frame with the DataImputer class.
imputer = DataImputer(x, 'median_house_value')
imputed_df = imputer.get_imputed_dataframe()

# Check that no feature column still contains missing values.
missing_values = imputed_df.isna().sum()
print(missing_values)

# Re-attach the target to create a new DataFrame with the imputed values.
df_imputed = pd.concat(objs=[imputed_df, y], axis='columns')
df_imputed.head()




longitude                     0
latitude                      0
housing_median_age            0
total_rooms                   0
total_bedrooms                0
population                    0
households                    0
median_income                 0
ocean_proximity_<1H OCEAN     0
ocean_proximity_INLAND        0
ocean_proximity_ISLAND        0
ocean_proximity_NEAR BAY      0
ocean_proximity_NEAR OCEAN    0
dtype: int64


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,False,False,False,True,False,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,False,False,False,True,False,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,False,False,False,True,False,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,False,False,False,True,False,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,False,False,False,True,False,342200


In [35]:
# Count the missing entries per column in the combined, imputed DataFrame.
missing_values = df_imputed.isna().sum()
print(missing_values)

longitude                     0
latitude                      0
housing_median_age            0
total_rooms                   0
total_bedrooms                0
population                    0
households                    0
median_income                 0
ocean_proximity_<1H OCEAN     0
ocean_proximity_INLAND        0
ocean_proximity_ISLAND        0
ocean_proximity_NEAR BAY      0
ocean_proximity_NEAR OCEAN    0
median_house_value            0
dtype: int64


In [36]:
# Count the missing entries in the original feature frame x again, to see
# whether the imputation step modified it.
missing_values = x.isna().sum()
print(missing_values)

longitude                       0
latitude                        0
housing_median_age              0
total_rooms                     0
total_bedrooms                207
population                      0
households                      0
median_income                   0
ocean_proximity_<1H OCEAN       0
ocean_proximity_INLAND          0
ocean_proximity_ISLAND          0
ocean_proximity_NEAR BAY        0
ocean_proximity_NEAR OCEAN      0
dtype: int64


In [37]:
# Display the total_bedrooms column of the imputed DataFrame.
df_imputed.loc[:, 'total_bedrooms']

0         129.0
1        1106.0
2         190.0
3         235.0
4         280.0
          ...  
20635     374.0
20636     150.0
20637     485.0
20638     409.0
20639     616.0
Name: total_bedrooms, Length: 20640, dtype: float64