In [80]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression



In [83]:
data = pd.read_csv("processed_car_data.csv")

print(data.info())
print(data.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   symboling         205 non-null    int64  
 1   fueltype          205 non-null    int64  
 2   aspiration        205 non-null    int64  
 3   doornumber        205 non-null    int64  
 4   carbody           205 non-null    int64  
 5   drivewheel        205 non-null    int64  
 6   enginelocation    205 non-null    int64  
 7   wheelbase         205 non-null    float64
 8   carlength         205 non-null    float64
 9   carwidth          205 non-null    float64
 10  carheight         205 non-null    float64
 11  curbweight        205 non-null    int64  
 12  enginetype        205 non-null    int64  
 13  cylindernumber    205 non-null    int64  
 14  enginesize        205 non-null    int64  
 15  fuelsystem        205 non-null    int64  
 16  boreratio         205 non-null    float64
 1

In [64]:
def random_null(data, column_list, percentage):
    """
    Introduces random null values into specified columns of the data.

    Args:
        data (pd.DataFrame): The dataset to modify.
        column_list (list): List of columns to add null values.
        percentage (float): Percentage of rows to replace with nulls.

    Returns:
        pd.DataFrame: The modified dataset with null values.
    """
    df_copy = data.copy()  
    percentage = percentage / 100
    total_rows = df_copy.shape[0]
    
    for column_name in column_list:
        num_nulls = int(total_rows * percentage)
        null_indices = np.random.choice(df_copy.index, num_nulls, replace=False)
        
        if pd.api.types.is_integer_dtype(df_copy[column_name]):
            df_copy[column_name] = df_copy[column_name].astype("Int64")
        
        df_copy.loc[null_indices, column_name] = np.nan
        
    return df_copy


In [None]:
def get_null_indices(data, columns_list):
    """Get indices of null values for specified columns."""
    null_indices_dict = {
        column: data.index[data[column].isnull()].tolist()  
        for column in columns_list
    }
    return null_indices_dict


In [66]:
# List of discrete and continuous columns
discrete_columns = ["symboling", "fueltype", "aspiration", "doornumber", "carbody", 
                    "drivewheel", "enginelocation", "enginetype", "cylindernumber", 
                    "fuelsystem", "company_name"]

continuous_columns = ["horsepower", "peakrpm", "citympg", "highwaympg", "enginesize", "curbweight", 
                      "wheelbase", "carlength", "carwidth", "carheight", "boreratio", "stroke", 
                      "compressionratio", "price"]


In [67]:
# Introduce random null values into both discrete and continuous columns
data_with_null_values = random_null(data, discrete_columns + continuous_columns, 10)

# Get null indices for both discrete and continuous columns
discrete_null_indices = get_null_indices(data_with_null_values, discrete_columns)
continuous_null_indices = get_null_indices(data_with_null_values, continuous_columns)

# Combine both null indices dictionaries
null_indices_dict = {**discrete_null_indices, **continuous_null_indices}

# Print the null indices for both types
print("Null Indices for Discrete Columns:")
print(discrete_null_indices)

print("\nNull Indices for Continuous Columns:")
print(continuous_null_indices)


Null Indices for Discrete Columns:
{'symboling': [14, 24, 50, 78, 89, 95, 96, 105, 115, 137, 143, 147, 153, 158, 160, 168, 183, 186, 190, 196], 'fueltype': [8, 12, 17, 29, 30, 32, 35, 47, 51, 61, 68, 71, 109, 130, 141, 159, 162, 185, 193, 198], 'aspiration': [1, 6, 12, 15, 18, 37, 55, 57, 62, 107, 119, 126, 127, 138, 141, 143, 163, 175, 176, 192], 'doornumber': [9, 21, 39, 73, 94, 108, 109, 112, 123, 125, 130, 142, 150, 157, 169, 171, 178, 189, 192, 204], 'carbody': [1, 4, 10, 21, 25, 34, 43, 47, 61, 77, 92, 126, 133, 143, 145, 148, 155, 189, 192, 200], 'drivewheel': [3, 8, 15, 28, 77, 87, 101, 117, 118, 120, 129, 132, 136, 143, 147, 174, 176, 178, 184, 200], 'enginelocation': [14, 15, 24, 36, 37, 40, 43, 44, 45, 90, 95, 122, 124, 138, 140, 152, 160, 186, 190, 196], 'enginetype': [32, 38, 39, 42, 74, 92, 93, 97, 101, 123, 128, 131, 147, 149, 154, 157, 175, 182, 196, 201], 'cylindernumber': [0, 22, 23, 37, 38, 43, 48, 56, 66, 74, 76, 87, 107, 124, 132, 160, 175, 180, 183, 204], 'fuelsys

In [68]:
def iterative_Cleaning_discrete_values(data, null_indices_dict, columns_list, number_of_loops):
    """
    Iteratively fills null values in discrete columns using RandomForestClassifier.

    Args:
        data (pd.DataFrame): The dataset containing missing values.
        null_indices_dict (dict): A dictionary where keys are column names and values are lists of indices with null values.
        columns_list (list): List of column names to clean.
        number_of_loops (int): Number of iterations to perform the cleaning.

    Returns:
        pd.DataFrame: The dataset with missing values filled.
    """
    for i in range(1, number_of_loops + 1):
        for column_name in columns_list:
            if column_name not in null_indices_dict:
                continue  
            
            null_indices = null_indices_dict[column_name]
            non_null_indices = data.index.difference(null_indices)
            
            data_without_null_values = data.loc[non_null_indices]
            data_with_null_values = data.loc[null_indices]
        
            x_train = data_without_null_values.drop(columns=[column_name])
            y_train = data_without_null_values[column_name]
        
            model = RandomForestClassifier()
            model.fit(x_train, y_train)
        
            x_test = data_with_null_values.drop(columns=[column_name])
        
            predicted = model.predict(x_test)
        
            data.loc[null_indices, column_name] = predicted
    
    return data


In [69]:
def iterative_regression_imputation(data, null_indices_dict, continuous_columns, initial_fill_method_dict, num_rounds):
    """
    Iteratively fills missing values in continuous columns using Linear Regression.

    Args:
        data (pd.DataFrame): DataFrame with missing values.
        null_indices_dict (dict): Dictionary of columns with null indices.
        continuous_columns (list): List of continuous columns to clean.
        initial_fill_method_dict (dict): Initial fill methods ('mean', 'median', 'mode').
        num_rounds (int): Number of iterative rounds to perform.

    Returns:
        pd.DataFrame: DataFrame with all missing values filled.
    """
    # Step 1: Initial fill using mean, median, or mode
    data = fill_continuous_values(data, null_indices_dict, continuous_columns, initial_fill_method_dict) # type: ignore

    for _ in range(num_rounds):
        for column in continuous_columns:
            if column in null_indices_dict:
                null_indices = null_indices_dict[column]
                if not null_indices:  # Skip if there are no nulls in this column
                    continue
                
                # Prepare training data (other columns are features, target column is the one to predict)
                train_data = data.dropna(subset=[column])  # Rows where target column is not null
                test_data = data.loc[null_indices]  # Rows where target column is null

                x_train = train_data.drop(columns=[column])
                y_train = train_data[column]

                x_test = test_data.drop(columns=[column])

                # Train Linear Regression model
                model = LinearRegression()
                model.fit(x_train, y_train)

                # Predict missing values
                predicted_values = model.predict(x_test)

                # Fill the missing values with the predictions
                data.loc[null_indices, column] = predicted_values

    return data


In [70]:
def fill_continuous_values(data, null_indices_dict, columns_list, method_dict):
    """
    Fill missing continuous values in specified columns using the provided method (mean, median, mode).
    
    Args:
        data (pd.DataFrame): DataFrame containing the data with missing values.
        null_indices_dict (dict): Dictionary containing indices of null values for each column.
        columns_list (list): List of continuous columns to clean.
        method_dict (dict): A dictionary where keys are column names and values are the imputation method ('mean', 'median', 'mode').
    
    Returns:
        pd.DataFrame: DataFrame with missing continuous values filled.
    """
    for column_name in columns_list:
        if column_name in null_indices_dict:
            # Determine the method for imputation (mean, median, mode)
            method = method_dict.get(column_name, 'mean')  # Default to 'mean' if method is not specified
            
            # Handle the column type
            if method == 'mean':
                fill_value = data[column_name].mean()
            elif method == 'median':
                fill_value = data[column_name].median()
            elif method == 'mode':
                fill_value = data[column_name].mode()[0]  # Mode can return multiple values, we pick the first
            
            # If column type is 'Int64', we need to convert to float for mean/median, then round for int columns
            if pd.api.types.is_integer_dtype(data[column_name]):
                data[column_name] = data[column_name].astype(float)
                fill_value = round(fill_value)
            
            # Fill missing values with the determined fill_value
            data[column_name].fillna(fill_value, inplace=True)
            
    return data

# Example usage


# Assuming 'sample_data' is the DataFrame containing your data with missing values


In [71]:
method_dict = {
    'wheelbase': 'mean', 'carlength': 'median', 'carwidth': 'mean',
    'carheight': 'median', 'curbweight': 'mean', 'enginesize': 'median',
    'boreratio': 'mean', 'stroke': 'median', 'compressionratio': 'mean',
    'horsepower': 'median', 'peakrpm': 'median', 'citympg': 'mean',
    'highwaympg': 'mean', 'price': 'median'
}

In [None]:
sample_discrete = iterative_Cleaning_discrete_values(data_with_null_values, null_indices_dict, discrete_columns, 5)

filled_data = iterative_regression_imputation(
    data=data_with_null_values,
    null_indices_dict=null_indices_dict,
    continuous_columns=continuous_columns,
    initial_fill_method_dict=method_dict,
    num_rounds=5  # Number of iterations
)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column_name].fillna(fill_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column_name].fillna(fill_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

In [73]:
def compare_filled_columns(original_data, filled_data, null_indices_dict, columns_to_compare):
    """
    Compare specified columns in the original and filled data, showing only filled indices.

    Args:
        original_data (pd.DataFrame): Original DataFrame before introducing nulls.
        filled_data (pd.DataFrame): DataFrame after filling nulls.
        null_indices_dict (dict): Dictionary containing indices of null values for each column.
        columns_to_compare (list): List of columns to focus on for the comparison.

    Returns:
        pd.DataFrame: A DataFrame showing index, column_name, original_value, and filled_value.
    """
    comparison_results = []

    for column in columns_to_compare:
        if column in null_indices_dict:  
            null_indices = null_indices_dict[column]
            for idx in null_indices:
                original_value = original_data.at[idx, column]
                filled_value = filled_data.at[idx, column]

                comparison_results.append({
                    'index': idx,
                    'column_name': column,
                    'original_value': original_value,
                    'filled_value': filled_value
                })

    comparison_df = pd.DataFrame(comparison_results)
    return comparison_df


In [78]:
# Compare original and filled data for both discrete and continuous columns
comparison_df = compare_filled_columns(data, filled_data, null_indices_dict, ["wheelbase"])

# Display the comparison
print(comparison_df)


    index column_name  original_value  filled_value
0       3   wheelbase            99.8     98.279055
1      13   wheelbase           101.2     98.617288
2      37   wheelbase            96.5     97.210278
3      53   wheelbase            93.1     95.111840
4      63   wheelbase            98.8    101.428123
5      66   wheelbase           104.9    102.573710
6      79   wheelbase            93.0     92.736057
7      83   wheelbase            95.9     95.688036
8      86   wheelbase            96.3     96.384905
9      93   wheelbase            94.5     95.789372
10     96   wheelbase            94.5     95.368171
11    109   wheelbase           114.2    107.896242
12    113   wheelbase           114.2    108.866978
13    118   wheelbase            93.7     92.133871
14    119   wheelbase            93.7     93.020245
15    126   wheelbase            89.5     89.568175
16    150   wheelbase            95.7     94.267780
17    154   wheelbase            95.7     97.197952
18    161   