Data Cleaning

In [92]:
import pandas as pd
import numpy as np 

In [93]:
# Create a reproducible random data set with name, age, sex, tenure, job, province
SEED = 42  # fixed seed so Restart & Run All regenerates exactly the same data
rng = np.random.default_rng(SEED)

# Set the number of records
num_records = 1000

# Generate random data for each variable
names = [f"Person {i}" for i in range(1, num_records + 1)]
ages = rng.integers(18, 65, size=num_records)  # upper bound is exclusive: ages 18..64
sexes = rng.choice(["Male", "Female"], size=num_records)
tenures = rng.integers(0, 20, size=num_records)  # Tenure in years (0..19)
jobs = rng.choice(["Engineer", "Teacher", "Doctor", "Artist", "Salesperson"], size=num_records)
provinces = rng.choice(["Ontario", "Quebec", "British Columbia", "Alberta", "Manitoba"], size=num_records)

# Create a DataFrame called rd
rd = pd.DataFrame({
    "Name": names,
    "Age": ages,
    "Sex": sexes,
    "Tenure": tenures,
    "Job": jobs,
    "Province": provinces
})

# Introduce missing data randomly: blank out 10% of the rows in every column
missing_per_column = int(num_records * 0.1)
for col in rd.columns:
    # Randomly select distinct row labels to set as missing
    missing_indices = rng.choice(rd.index, size=missing_per_column, replace=False)
    # Series.mask returns a new column with NaN at the selected rows and upcasts
    # integer columns to float itself; writing np.nan into an int column through
    # .loc is an incompatible-dtype assignment that newer pandas rejects.
    rd[col] = rd[col].mask(rd.index.isin(missing_indices))

# Display the first 5 rows of the DataFrame (rich display; no optional `tabulate` needed)
rd.head()

| Name     | Age   | Sex    | Tenure   | Job      | Province         |
|:---------|:------|:-------|:---------|:---------|:-----------------|
| nan      | nan   | Male   | 15       | Engineer | British Columbia |
| Person 2 | nan   | Male   | 7        | Engineer | British Columbia |
| Person 3 | 47    | Female | 16       | Artist   | Manitoba         |
| Person 4 | 50    | Male   | 16       | nan      | Ontario          |
| Person 5 | 29    | Female | 17       | Teacher  | British Columbia |


Identification of missing values

In [94]:
# Count the missing values in each column
print(rd.isna().sum(axis=0))

Name        100
Age         100
Sex         100
Tenure      100
Job         100
Province    100
dtype: int64


In [95]:
# Express the missing values in each column as a percentage of all rows
print(rd.isna().sum().div(len(rd)).mul(100))

Name        10.0
Age         10.0
Sex         10.0
Tenure      10.0
Job         10.0
Province    10.0
dtype: float64


Dropping missing values

In [96]:
# Work on a copy (rd2) so the generated frame rd stays untouched,
# then drop every row that has at least one missing value
rd2 = rd.copy(deep=True)
rd2_dropped_rows = rd2.dropna(axis=0, how="any")

In [97]:
# After dropping incomplete rows, the missing-value count of every column is zero
print(rd2_dropped_rows.isna().sum(axis=0))

Name        0
Age         0
Sex         0
Tenure      0
Job         0
Province    0
dtype: int64


In [98]:
# thresh is the minimum number of NON-null values a row must have to be kept.
# With thresh=3 and the 6 columns of rd2, a row is dropped only when fewer than
# 3 of its values are present (i.e. 4 or more are null).
rd_dropped = rd2.dropna(thresh= 3)

# Only rows with fewer than 3 non-null values are removed; the rows that remain still contain nulls
print(rd_dropped.isnull().sum())

Name         98
Age          98
Sex         100
Tenure       99
Job          98
Province     99
dtype: int64


In [99]:
# Drop every column that contains at least one missing value
dropped_cols = rd2.dropna(axis="columns")
dropped_cols

0
1
2
3
4
...
995
996
997
998
999


Imputation (Filling Missing Values)

In [100]:
# Fill using the mean of the column
# (the median or mode can be used the same way)

# Determine the average age; Series.mean() skips missing values by default
average_age = rd2['Age'].mean()

# Replace missing values with the mean and assign the result back to the column.
# rd2['Age'].fillna(..., inplace=True) acts on a temporary Series selected from
# rd2 (chained assignment): it warns on pandas 2.x and does not update rd2 at all
# once Copy-on-Write is enabled.
rd2['Age'] = rd2['Age'].fillna(average_age)

# Age is no longer null
print(rd2.isnull().sum())


Name        100
Age           0
Sex         100
Tenure      100
Job         100
Province    100
dtype: int64


In [101]:
# Fill using a specified value

# Replace null jobs with 'Unknown' and assign the result back to the column.
# An in-place fillna on rd2['Job'] would act on a temporary Series (chained
# assignment) and leaves rd2 unchanged under Copy-on-Write.
rd2['Job'] = rd2['Job'].fillna('Unknown')

# Job is no longer null
print(rd2.isnull().sum())
rd2

Name        100
Age           0
Sex         100
Tenure      100
Job           0
Province    100
dtype: int64


Unnamed: 0,Name,Age,Sex,Tenure,Job,Province
0,,40.136667,Male,15.0,Engineer,British Columbia
1,Person 2,40.136667,Male,7.0,Engineer,British Columbia
2,Person 3,47.000000,Female,16.0,Artist,Manitoba
3,Person 4,50.000000,Male,16.0,Unknown,Ontario
4,Person 5,29.000000,Female,17.0,Teacher,British Columbia
...,...,...,...,...,...,...
995,,40.136667,Male,10.0,Teacher,Alberta
996,Person 997,19.000000,Female,13.0,Doctor,Quebec
997,Person 998,27.000000,Female,5.0,Unknown,Ontario
998,Person 999,20.000000,Male,2.0,Doctor,Alberta


In [102]:
# Forward Fill

# Fill each missing tenure with the last valid value from a PREVIOUS row.
# The result is assigned back because an in-place ffill on rd2['Tenure'] is
# chained assignment and leaves rd2 unchanged under Copy-on-Write.
rd2['Tenure'] = rd2['Tenure'].ffill()

# Tenure is no longer null (a missing value in the very first row would stay
# missing, since there is no earlier row to copy from)
print(rd2.isnull().sum())
rd2

Name        100
Age           0
Sex         100
Tenure        0
Job           0
Province    100
dtype: int64


Unnamed: 0,Name,Age,Sex,Tenure,Job,Province
0,,40.136667,Male,15.0,Engineer,British Columbia
1,Person 2,40.136667,Male,7.0,Engineer,British Columbia
2,Person 3,47.000000,Female,16.0,Artist,Manitoba
3,Person 4,50.000000,Male,16.0,Unknown,Ontario
4,Person 5,29.000000,Female,17.0,Teacher,British Columbia
...,...,...,...,...,...,...
995,,40.136667,Male,10.0,Teacher,Alberta
996,Person 997,19.000000,Female,13.0,Doctor,Quebec
997,Person 998,27.000000,Female,5.0,Unknown,Ontario
998,Person 999,20.000000,Male,2.0,Doctor,Alberta


In [103]:
# Backward fill

# Fill each missing sex with the next valid value from a FOLLOWING row.
# The result is assigned back because an in-place bfill on rd2['Sex'] is
# chained assignment and leaves rd2 unchanged under Copy-on-Write.
rd2['Sex'] = rd2['Sex'].bfill()

# Sex is no longer null (a missing value in the very last row would stay
# missing, since there is no later row to copy from)
print(rd2.isnull().sum())
rd2


Name        100
Age           0
Sex           0
Tenure        0
Job           0
Province    100
dtype: int64


Unnamed: 0,Name,Age,Sex,Tenure,Job,Province
0,,40.136667,Male,15.0,Engineer,British Columbia
1,Person 2,40.136667,Male,7.0,Engineer,British Columbia
2,Person 3,47.000000,Female,16.0,Artist,Manitoba
3,Person 4,50.000000,Male,16.0,Unknown,Ontario
4,Person 5,29.000000,Female,17.0,Teacher,British Columbia
...,...,...,...,...,...,...
995,,40.136667,Male,10.0,Teacher,Alberta
996,Person 997,19.000000,Female,13.0,Doctor,Quebec
997,Person 998,27.000000,Female,5.0,Unknown,Ontario
998,Person 999,20.000000,Male,2.0,Doctor,Alberta


Handling Duplicates

In [104]:
# Build a frame with duplicates by stacking rd on top of a copy of itself
rd3 = rd.copy(deep=True)

rd4 = pd.concat((rd, rd3), axis=0, ignore_index=True)
rd4

Unnamed: 0,Name,Age,Sex,Tenure,Job,Province
0,,,Male,15.0,Engineer,British Columbia
1,Person 2,,Male,7.0,Engineer,British Columbia
2,Person 3,47.0,Female,16.0,Artist,Manitoba
3,Person 4,50.0,Male,16.0,,Ontario
4,Person 5,29.0,Female,17.0,Teacher,British Columbia
...,...,...,...,...,...,...
1995,,,Male,10.0,Teacher,Alberta
1996,Person 997,19.0,Female,13.0,Doctor,Quebec
1997,Person 998,27.0,Female,5.0,,Ontario
1998,Person 999,20.0,Male,2.0,Doctor,Alberta


In [105]:
# Identify duplicates: count the duplicated rows.
# A row counts as a duplicate when every column matches an earlier row.
print(rd4.duplicated(keep="first").sum())

1000


In [106]:
# Show the rows flagged as duplicates
print(rd4.loc[rd4.duplicated()])

             Name   Age     Sex  Tenure       Job          Province
1000          NaN   NaN    Male    15.0  Engineer  British Columbia
1001     Person 2   NaN    Male     7.0  Engineer  British Columbia
1002     Person 3  47.0  Female    16.0    Artist          Manitoba
1003     Person 4  50.0    Male    16.0       NaN           Ontario
1004     Person 5  29.0  Female    17.0   Teacher  British Columbia
...           ...   ...     ...     ...       ...               ...
1995          NaN   NaN    Male    10.0   Teacher           Alberta
1996   Person 997  19.0  Female    13.0    Doctor            Quebec
1997   Person 998  27.0  Female     5.0       NaN           Ontario
1998   Person 999  20.0    Male     2.0    Doctor           Alberta
1999  Person 1000  54.0    Male    16.0  Engineer            Quebec

[1000 rows x 6 columns]


In [107]:
# Remove duplicates (the first occurrence of each row is kept), then confirm none remain
rd4_no_dupes = rd4.drop_duplicates(keep="first")
print(rd4_no_dupes.duplicated().sum())

0


Data Type Conversion

In [108]:
# Inspect the dtype of every column
column_dtypes = rd2.dtypes
print(column_dtypes)

Name         object
Age         float64
Sex          object
Tenure      float64
Job          object
Province     object
dtype: object


In [109]:
# Convert Age to an integer dtype.
# The cast raises when the column still holds nulls, so this uses rd2.
age_as_int = rd2['Age'].astype(int)
rd2['Age'] = age_as_int

print(rd2.dtypes)


Name         object
Age           int64
Sex          object
Tenure      float64
Job          object
Province     object
dtype: object


In [110]:
# Convert Sex to a categorical dtype.
# Suited to string columns that have only a few distinct values.
sex_as_category = rd2['Sex'].astype('category')
rd2['Sex'] = sex_as_category
print(rd2.dtypes)
rd2

Name          object
Age            int64
Sex         category
Tenure       float64
Job           object
Province      object
dtype: object


Unnamed: 0,Name,Age,Sex,Tenure,Job,Province
0,,40,Male,15.0,Engineer,British Columbia
1,Person 2,40,Male,7.0,Engineer,British Columbia
2,Person 3,47,Female,16.0,Artist,Manitoba
3,Person 4,50,Male,16.0,Unknown,Ontario
4,Person 5,29,Female,17.0,Teacher,British Columbia
...,...,...,...,...,...,...
995,,40,Male,10.0,Teacher,Alberta
996,Person 997,19,Female,13.0,Doctor,Quebec
997,Person 998,27,Female,5.0,Unknown,Ontario
998,Person 999,20,Male,2.0,Doctor,Alberta


Handling Inconsistent Data

In [111]:
# Remove leading and trailing whitespace from the province names
trimmed_province = rd4['Province'].str.strip()
rd4['Province'] = trimmed_province

In [112]:
# Normalise letter case: names to lower case, provinces to upper case
lowercase_names = rd4['Name'].str.lower()
uppercase_provinces = rd4['Province'].str.upper()
rd4['Name'] = lowercase_names
rd4['Province'] = uppercase_provinces

In [None]:
#removing currency sign

# create random set of mixed currency values
import random

def generate_euro_dollar_values(num_values=100):
    """Build a one-column DataFrame of random currency strings.

    Each entry is a '$' or '€' symbol, picked at random, followed by a random
    amount between 1 and 1000 formatted with two decimals (e.g. '$232.94').

    Args:
        num_values: Number of rows to generate. Defaults to 100.

    Returns:
        A DataFrame with a single 'CurrencyValue' column of strings.
    """
    symbols = ['$', '€']
    # The symbol is drawn before the amount for every row, left to right.
    formatted_values = [
        f"{random.choice(symbols)}{random.uniform(1, 1000):.2f}"
        for _ in range(num_values)
    ]
    return pd.DataFrame({'CurrencyValue': formatted_values})

# Build the demo frame of mixed-currency strings
euro_dollar_df = generate_euro_dollar_values()

# Add a cleaned column to the same frame: strip '$', '€' (and any ',')
# from the text, then convert what is left to numbers
stripped_values = euro_dollar_df['CurrencyValue'].str.replace(r'[$,€]', '', regex=True)
euro_dollar_df['Clean_Currency'] = pd.to_numeric(stripped_values)

euro_dollar_df

Unnamed: 0,CurrencyValue,Clean_Currency
0,€96.83,96.83
1,€857.78,857.78
2,€352.81,352.81
3,€531.22,531.22
4,$232.94,232.94
...,...,...
95,€610.80,610.80
96,€99.15,99.15
97,€369.65,369.65
98,€70.47,70.47


Advanced Techniques

In [None]:
#nearest neighbour
from sklearn.impute import KNNImputer

# New data set for KNN imputation.
# The imputer only works on numeric data, so the non-numeric columns are
# left out and only Age and Tenure remain.
rd3 = rd.drop(columns=['Name', 'Sex', 'Province', 'Job'])

rd3

Unnamed: 0,Age,Tenure
0,,15.0
1,,7.0
2,47.0,16.0
3,50.0,16.0
4,29.0,17.0
...,...,...
995,,10.0
996,19.0,13.0
997,27.0,5.0
998,20.0,2.0


In [None]:
# Nearest neighbour imputation
# n_neighbors sets how many "neighbours" are consulted: here the 5 nearest rows
# are used to model the replacement value for each null.
# The imputer is fitted on all numeric columns of rd3 together, not one at a time.
imputer = KNNImputer(n_neighbors=5)  # Use 5 nearest neighbors
imputed_values = imputer.fit_transform(rd3)
df_imputed = pd.DataFrame(imputed_values, columns=rd3.columns)

df_imputed


Unnamed: 0,Age,Tenure
0,37.2,15.0
1,41.0,7.0
2,47.0,16.0
3,50.0,16.0
4,29.0,17.0
...,...,...
995,40.6,10.0
996,19.0,13.0
997,27.0,5.0
998,20.0,2.0


In [None]:
#Model Based
#Using linear regression to predict 
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Create a copy to avoid modifying the original DataFrame
rd_imputed = rd.copy()

# The following example will predict Age using Tenure.
# Tenure cannot have nulls, so they need to be filled first.

# Determine the average tenure; Series.mean() skips missing values by default
average_tenure = rd_imputed['Tenure'].mean()

# Replace missing values with the mean and assign the result back to the column.
# An in-place fillna on rd_imputed['Tenure'] would act on a temporary Series
# (chained assignment) and leaves rd_imputed unchanged under Copy-on-Write.
rd_imputed['Tenure'] = rd_imputed['Tenure'].fillna(average_tenure)


In [136]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split


def predict_nulls_with_linear_regression(df, target_column, features=None, random_state=42, handle_non_numeric='impute', fill_value_for_imputation=None):
    """
    Predicts null values in one column of a dataset using linear regression.

    The rows where `target_column` is present are split 80/20 into train and test
    sets, a LinearRegression is fitted on the training part, scored on the test
    part, and then used to fill the rows where `target_column` is missing.

    Args:
        df: pandas DataFrame containing the data. It is never modified.
        target_column: The name of the column with null values to predict.
        features:  A list of column names to use as features for the linear regression.
                   If None (default), all columns *except* the target_column will be used.
                   After preprocessing, the features must have no missing values in any
                   row; otherwise a ValueError is raised.
        random_state:  Random state for reproducibility in train_test_split.  Defaults to 42.
        handle_non_numeric: Strategy for preparing the feature columns:
                - 'impute' (default): String/object feature columns are replaced by integer
                   codes (one code per distinct value, in order of first appearance; a
                   missing value gets its own code), then missing values in all feature
                   columns are filled with SimpleImputer.
                - 'drop': Drops non-numeric columns from the feature set. No imputation is
                   performed, so the remaining features must already be complete.
                - 'ignore' (or any other value): Leaves the features untouched. This will
                   likely cause an error in the linear regression unless the data is
                   already fully numeric and complete.
        fill_value_for_imputation:  Only used with 'impute'. If None (default) the column
                   mean is used; otherwise the imputer switches to the 'constant' strategy
                   and fills with this value.

    Returns:
        A tuple containing:
            - modified_df:  A copy of the input DataFrame with the null values in the target_column
                            replaced by the predictions. NOTE: with 'impute', the feature columns of
                            this copy also hold the encoded/imputed values used for training.
                            When the target column has no missing values, an unmodified copy of
                            `df` is returned and the other three items are None.
            - model: The trained LinearRegression model.  Can be used for further prediction on new data *if*
                     the feature columns are preprocessed in the same way.
            - mse:  The Mean Squared Error of the model on the test set.
            - r2: The R-squared score of the model on the test set.  Closer to 1 is better.
                  When all test targets are identical R-squared is undefined and 1.0 is reported.

    Raises:
        ValueError: If the input DataFrame is empty, if the target_column doesn't exist,
                    if a requested feature is missing, if all features are dropped due to
                    non-numeric handling, if the target column has no known values, if
                    imputation fails, or if features still contain missing values.
        TypeError:  If input df is not pandas DataFrame
    """

    # --- Input Validation ---
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input df must be a pandas DataFrame.")

    if df.empty:
        raise ValueError("Input DataFrame cannot be empty.")

    if target_column not in df.columns:
        raise ValueError(f"Target column '{target_column}' not found in DataFrame.")

    # --- Feature Selection ---
    if features is None:
        features = [col for col in df.columns if col != target_column]
    elif not all(feature in df.columns for feature in features):
        raise ValueError("One or more specified features are not in the DataFrame.")

    if handle_non_numeric == 'drop':
        numeric_features = df[features].select_dtypes(include=np.number).columns
        features = [col for col in features if col in numeric_features]
        if not features:
            raise ValueError("All features were dropped because they are non-numeric.  Consider using 'impute' or a different feature set.")

    # --- Target Checks (before any preprocessing touches the data) ---
    target_missing = df[target_column].isna()
    if target_missing.all():
        raise ValueError(f"No rows with non-null values in target column '{target_column}'. Cannot train the model.")
    if not target_missing.any():
        # Nothing to predict: hand back an untouched copy, as the message promises.
        print(f"No missing values in target column '{target_column}'. Returning the original DataFrame")
        return df.copy(), None, None, None

    modified_df = df.copy()

    # --- Feature Preprocessing ---
    if handle_non_numeric == 'impute':
        numeric_features = df[features].select_dtypes(include=np.number).columns
        non_numeric_features = [col for col in features if col not in numeric_features]

        for col in non_numeric_features:
            col_dtype = df[col].dtype
            # Encode text columns whether they are stored as object or as a pandas string dtype.
            if pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
                # Map each distinct value to an integer code, in order of first appearance.
                mapping = {value: code for code, value in enumerate(df[col].unique())}
                modified_df[col] = modified_df[col].map(mapping)

        # Impute missing values *within* the feature columns, before splitting into known/unknown.
        if fill_value_for_imputation is not None:
            imputer = SimpleImputer(strategy='constant', fill_value=fill_value_for_imputation)
        else:
            imputer = SimpleImputer(strategy='mean')

        try:
            modified_df[features] = imputer.fit_transform(modified_df[features])
        except ValueError as e:
            raise ValueError(f"Error during imputation: {e}.  Check your feature columns and fill_value_for_imputation.") from e

    # --- Data Splitting (Known vs. Unknown Target) ---
    known_df = modified_df[modified_df[target_column].notna()]
    unknown_df = modified_df[modified_df[target_column].isna()]

    # The regression cannot handle gaps, so the features must be complete after preprocessing.
    if known_df[features].isnull().any().any():
        raise ValueError(f"Missing values found in feature columns of known_df. Ensure your features do not have missing values. \n {known_df[features].isnull().sum()}")
    if unknown_df[features].isnull().any().any():
        raise ValueError(f"Missing values found in feature columns of unknown_df. Ensure your features do not have missing values. \n {unknown_df[features].isnull().sum()}")

    # --- Train/Test Split (on the Known Data) ---
    X = known_df[features]
    y = known_df[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

    # --- Model Training ---
    model = LinearRegression()
    model.fit(X_train, y_train)

    # --- Model Evaluation (on Test Set) ---
    y_pred = model.predict(X_test)
    # MSE is well defined even when every test target is identical, so always compute it
    # (forcing it to 0 in that case would hide real prediction error).
    mse = np.mean((y_test - y_pred) ** 2)
    # R^2 is undefined when the test targets have zero variance; 1.0 is reported in that case.
    r2 = model.score(X_test, y_test) if len(np.unique(y_test)) > 1 else 1.0

    # --- Prediction on Unknown Data ---
    X_unknown = unknown_df[features]
    predicted_values = model.predict(X_unknown)

    # --- Write the predictions into the copy ---
    modified_df.loc[modified_df[target_column].isna(), target_column] = predicted_values

    return modified_df, model, mse, r2


# --- Example Usage ---
# Demonstration driver for predict_nulls_with_linear_regression: each example
# builds a small frame, calls the function and prints either the result or the
# error it raised. Note that Example 1 rebinds the name df_imputed, which the
# KNN imputation cell earlier in this notebook also assigns.
if __name__ == '__main__':
    # Create a sample DataFrame (replace with your actual data)
    data = {
        'feature1': [1, 2, 3, 4, 5, np.nan, 7, 8, np.nan, 10],
        'feature2': [2, 4, np.nan, 8, 10, 12, 14, 16, 18, 20],
        'feature3': [1, 5, 6, 2, 3, 8, 9, 5, 6, 7],
        'target': [3, 6, 9, np.nan, 15, 18, np.nan, 24, 27, 30]
    }
    df = pd.DataFrame(data)
    # These two cells are already NaN in `data` above, so the assignments do not
    # change anything; they only highlight where the feature gaps are.
    df.loc[2, 'feature2'] = np.nan
    df.loc[5, 'feature1'] = np.nan

    # Example 1:  Predict 'target' using all other columns with the default
    # 'impute' handling (all features here are numeric, with a few gaps).
    try:
        df_imputed, model, mse, r2 = predict_nulls_with_linear_regression(df, 'target')
        print("Example 1 (Imputed):\n", df_imputed)
        print("MSE:", mse)
        print("R-squared:", r2)
        if model:
            print("Model coefficients:", model.coef_)
            print("Model intercept:", model.intercept_)
    except ValueError as e:
        print(f"Example 1 Error: {e}")

    # Example 2: Predict 'target' using only 'feature1', which has no missing values.
    # (No rows are dropped here, despite the "Dropping Rows" label in the printed title.)
    data2 = {
        'feature1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'target': [3, 6, 9, np.nan, 15, 18, np.nan, 24, 27, 30]
    }
    df2 = pd.DataFrame(data2)

    try:
        df2_imputed, model2, mse2, r2_2 = predict_nulls_with_linear_regression(
            df2, 'target', features=['feature1']
        )
        print("\nExample 2 (Dropping Rows):\n", df2_imputed)
        print("MSE:", mse2)
        print("R-squared:", r2_2)

    except ValueError as e:
        print(f"Example 2 Error: {e}")

    # Example 3: DataFrame with non-numeric features and handling with 'drop'.
    data3 = {
        'feature1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'feature2': ['A', 'B', 'A', 'C', 'B', 'A', 'C', 'B', 'A', 'C'],  # Non-numeric
        'target': [3, 6, 9, np.nan, 15, 18, np.nan, 24, 27, 30]
    }
    df3 = pd.DataFrame(data3)
    try:
        df3_imputed, model3, mse3, r2_3 = predict_nulls_with_linear_regression(df3, 'target', handle_non_numeric='drop')
        print("\nExample 3 (Drop Non-Numeric):\n", df3_imputed)
        print("MSE:", mse3)
        print("R-squared:", r2_3)
    except ValueError as e:
        print(f"Example 3 Error: {e}")

    # Example 4: DataFrame with non-numeric features and handling with 'impute'.
    data4 = {
        'feature1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'feature2': ['A', 'B', 'A', 'C', 'B', 'A', 'C', 'B', 'A', 'C'],  # Non-numeric
        'target': [3, 6, 9, np.nan, 15, 18, np.nan, 24, 27, 30]
    }
    df4 = pd.DataFrame(data4)
    try:
        df4_imputed, model4, mse4, r2_4 = predict_nulls_with_linear_regression(df4, 'target', handle_non_numeric='impute')
        print("\nExample 4 (Impute Non-Numeric):\n", df4_imputed)
        print("MSE:", mse4)
        print("R-squared:", r2_4)
    except ValueError as e:
        print(f"Example 4 Error: {e}")

    # Example 5: Demonstrating a case where the error handling is triggered
    # (the only feature column is entirely NaN).
    data5 = {
        'feature1': [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],  # All NaNs
        'target': [3, 6, 9, np.nan, 15, 18, np.nan, 24, 27, 30]
    }
    df5 = pd.DataFrame(data5)

    try:
        df5_imputed, model5, mse5, r2_5 = predict_nulls_with_linear_regression(df5, 'target', features=['feature1'])
        print("\nExample 5 (All NaN Features):\n", df5_imputed)  # This won't be reached
    except ValueError as e:
        print(f"\nExample 5 Error: {e}")  # Expect a ValueError here

    # Example 6: empty df (the function rejects empty frames with a ValueError)
    data6 = {
    }
    df6 = pd.DataFrame(data6)

    try:
        df6_imputed, model6, mse6, r2_6 = predict_nulls_with_linear_regression(df6, 'target', features=['feature1'])
        print("\nExample 6 (Empty df):\n", df6_imputed)  # This won't be reached
    except ValueError as e:
        print(f"\nExample 6 Error: {e}")

    # Example 7: target column doesn't exist
    data7 = {
        'feature1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'target': [3, 6, 9, np.nan, 15, 18, np.nan, 24, 27, 30]
    }
    df7 = pd.DataFrame(data7)

    try:
        df7_imputed, model7, mse7, r2_7 = predict_nulls_with_linear_regression(df7, 'target_wrong', features=['feature1'])
        print("\nExample 7 (wrong target column):\n", df7_imputed)  # This won't be reached
    except ValueError as e:
        print(f"\nExample 7 Error: {e}")

    # Example 8: Input is not DataFrame (a plain list triggers the TypeError check)
    data8 = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    try:
        df8_imputed, model8, mse8, r2_8 = predict_nulls_with_linear_regression(data8, 'target_wrong', features=['feature1'])
        print("\nExample 8 (Input is not df):\n", df8_imputed)  # This won't be reached
    except TypeError as e:
        print(f"\nExample 8 Error: {e}")

    # Example 9: features not in df
    data9 = {
        'feature1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'target': [3, 6, 9, np.nan, 15, 18, np.nan, 24, 27, 30]
    }
    df9 = pd.DataFrame(data9)
    try:
        df9_imputed, model9, mse9, r2_9 = predict_nulls_with_linear_regression(df9, 'target', features=['feature_wrong'])
        print("\nExample 9 (features not in df):\n", df9_imputed)  # This won't be reached
    except ValueError as e:
        print(f"\nExample 9 Error: {e}")

    # Example 10: target column has no missing values
    # NOTE(review): this example is incomplete as written - data10 has no 'target'
    # column and the function is never called with it. TODO finish or remove.
    data10 = {
        'feature1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

Example 1 (Imputed):
    feature1   feature2  feature3     target
0       1.0   2.000000       1.0   3.000000
1       2.0   4.000000       5.0   6.000000
2       3.0  11.555556       6.0   9.000000
3       4.0   8.000000       2.0  12.231131
4       5.0  10.000000       3.0  15.000000
5       5.0  12.000000       8.0  18.000000
6       7.0  14.000000       9.0  19.411447
7       8.0  16.000000       5.0  24.000000
8       5.0  18.000000       6.0  27.000000
9      10.0  20.000000       7.0  30.000000
MSE: 3.519484882924048
R-squared: 0.9565495693466167
Model coefficients: [ 0.95930901  1.30948924 -0.50779229]
Model intercept: -1.0664347401796235

Example 2 (Dropping Rows):
    feature1  target
0       1.0     3.0
1       2.0     6.0
2       3.0     9.0
3       4.0    12.0
4       5.0    15.0
5       6.0    18.0
6       7.0    21.0
7       8.0    24.0
8       9.0    27.0
9      10.0    30.0
MSE: 7.888609052210118e-30
R-squared: 1.0

Example 3 (Drop Non-Numeric):
    feature1 feature2  t

