In [1]:
import warnings
from collections import defaultdict
from itertools import combinations

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.cluster import DBSCAN
from sklearn.covariance import MinCovDet
from sklearn.ensemble import IsolationForest
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Load the raw training data from the CSV in the working directory; the
# train/test split cell below reads this frame through the name `df`.
df=pd.read_csv('house_price_train.csv')
# Show the first rows as a quick visual check of the loaded columns.
df.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# identical across re-runs of the notebook.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Separate the predictors from the 'SalePrice' target in each partition.
X_train, y_train = train_df.drop(columns=['SalePrice']), train_df['SalePrice']
X_test, y_test = test_df.drop(columns=['SalePrice']), test_df['SalePrice']

# Write every piece to disk without the index column; DataPreprocessor loads
# its inputs from these CSV paths.
for csv_name, frame in (
    ("X_train.csv", X_train),
    ("y_train.csv", y_train),
    ("X_test.csv", X_test),
    ("y_test.csv", y_test),
):
    frame.to_csv(csv_name, index=False)



In [4]:
class DataPreprocessor:
    """Preprocess tabular train/test splits and fit a baseline linear regression.

    The four splits are loaded from CSV files. Each stage (column pruning,
    imputation, outlier removal, encoding, scaling, model fitting) is a
    separate method that updates ``self.X_train`` / ``self.X_test`` (and
    ``self.y_train`` for outlier removal). All statistics are learned from the
    training split only and then applied to the test split.

    Attributes set by the methods:
        flags:   dict of column lists produced by ``drop_uninformative_columns``.
        model:   fitted ``LinearRegression`` from ``train_and_evaluate_model``.
        results: ``{"train": {...}, "test": {...}}`` metric values from the
                 last evaluation.
    """

    def __init__(self, X_train_path, y_train_path, X_test_path, y_test_path, run_default=False):
        """Load the splits from CSV and optionally run the pipeline with defaults.

        Args:
            X_train_path, y_train_path, X_test_path, y_test_path: CSV paths
                (anything ``pd.read_csv`` accepts).
            run_default: when True, call ``run_full_pipeline()`` immediately.
        """
        self.X_train = pd.read_csv(X_train_path)
        # squeeze("columns") collapses a one-column frame into a Series but never
        # into a scalar, which a bare squeeze() does for a one-row file.
        self.y_train = pd.read_csv(y_train_path).squeeze("columns")
        self.X_test = pd.read_csv(X_test_path)
        self.y_test = pd.read_csv(y_test_path).squeeze("columns")

        self.manual_ordinals = []
        self.ordinal_mappings = {}
        self.flags = {}
        self.model = None
        self.results = {}

        if run_default:
            self.run_full_pipeline()

    def run_full_pipeline(self,
                          null_threshold=0.8,
                          correlation_threshold=0.05,
                          numeric_impute='median',
                          categoric_impute='mode',
                          outlier_method='zscore',
                          scaler='standard',
                          metrics=None):
        """Run every stage in order: prune, impute, remove outliers, encode, scale, fit.

        Args:
            null_threshold: forwarded to ``drop_uninformative_columns``.
            correlation_threshold: forwarded to ``drop_uninformative_columns``.
            numeric_impute: forwarded to ``impute_missing_values``.
            categoric_impute: forwarded to ``impute_missing_values``.
            outlier_method: forwarded to ``detect_and_clean_outliers``.
            scaler: forwarded to ``scale_features``.
            metrics: metric names for ``train_and_evaluate_model``; ``None``
                means ``['r2', 'mae', 'mse']``.
        """
        print("Running full preprocessing pipeline with default settings...")

        # Resolve the default here instead of using a mutable list as a default argument.
        if metrics is None:
            metrics = ['r2', 'mae', 'mse']

        self.drop_uninformative_columns(null_threshold, correlation_threshold)
        self.impute_missing_values(numeric_impute, categoric_impute)
        self.detect_and_clean_outliers(outlier_method)
        self.encode_features(self.manual_ordinals, self.ordinal_mappings)
        self.scale_features(scaler)
        self.train_and_evaluate_model(metrics)

    @staticmethod
    def _top_value_share(series):
        """Return the relative frequency of the most common value (NaN counted), 0.0 if empty."""
        shares = series.value_counts(normalize=True, dropna=False)
        return shares.iloc[0] if len(shares) else 0.0

    def drop_uninformative_columns(self, null_threshold=0.8, correlation_threshold=0.05, multicol_thresh=0.75):
        """Flag and drop feature columns that carry little usable signal.

        Checks, all computed on the training split:
            TooManyNulls            - null ratio above ``null_threshold``.
            UniqueIDColumns         - as many distinct values as rows.
            LowVariance             - one value covers more than 99% of rows.
            LowCorrelation          - numeric, |corr with target| below
                                      ``correlation_threshold`` (numeric target only).
            RedundantToDrop         - numeric column C with A + B ~= C on more
                                      than 98% of rows.
            MulticollinearityToDrop - from each numeric pair with |corr| above
                                      ``multicol_thresh``, the column less
                                      correlated with the target.

        The target itself is never treated as a candidate feature.

        Returns:
            dict mapping each check name to the list of flagged columns
            (also stored as ``self.flags``).
        """
        print("Dropping uninformative columns...")

        df = self.X_train.copy()
        target = self.y_train.name if self.y_train.name else 'target'
        df[target] = self.y_train
        feature_cols = [col for col in df.columns if col != target]
        n_rows = df.shape[0]

        flagged = {}

        # Null too high
        null_ratios = df[feature_cols].isnull().mean()
        flagged['TooManyNulls'] = null_ratios[null_ratios > null_threshold].index.tolist()

        # Unique ID columns
        # NOTE(review): this also matches continuous features whose values all differ.
        flagged['UniqueIDColumns'] = [col for col in feature_cols
                                      if df[col].nunique(dropna=False) == n_rows]

        # Low variance
        flagged['LowVariance'] = [col for col in feature_cols
                                  if self._top_value_share(df[col]) > 0.99]

        # Low correlation (the correlation matrix is computed once and reused below)
        numeric_cols = df.select_dtypes(include=np.number).columns
        numeric_features = [col for col in numeric_cols if col != target]
        corr_matrix = df[numeric_cols].corr().abs()

        if target in numeric_cols:
            target_corr = corr_matrix[target]
            flagged['LowCorrelation'] = [col for col in numeric_features
                                         if target_corr[col] < correlation_threshold]
        else:
            # A non-numeric target cannot be correlated against.
            target_corr = None
            flagged['LowCorrelation'] = []

        # Redundant combinations: A + B = C
        # Columns with fewer than 5 distinct values or more than 80% zeros are skipped.
        valid_cols = [col for col in numeric_features
                      if df[col].nunique() >= 5 and (df[col] == 0).mean() <= 0.8]
        # Fill once up front instead of inside the triple loop.
        filled = {col: df[col].fillna(0).to_numpy() for col in valid_cols}
        to_drop_redundant = []

        for col1, col2 in combinations(valid_cols, 2):
            summed = filled[col1] + filled[col2]
            for col3 in valid_cols:
                if col3 in (col1, col2):
                    continue
                match_ratio = np.isclose(summed, filled[col3], rtol=0.01, atol=1).mean()
                if match_ratio > 0.98:
                    to_drop_redundant.append(col3)

        # dict.fromkeys de-duplicates while keeping a deterministic order.
        flagged['RedundantToDrop'] = list(dict.fromkeys(to_drop_redundant))

        # Multicollinearity: visit each unordered feature pair once (upper triangle).
        pair_corr = corr_matrix.loc[numeric_features, numeric_features].to_numpy()
        to_drop_multi = []

        for i, row in enumerate(numeric_features):
            for j in range(i + 1, len(numeric_features)):
                corr_val = pair_corr[i, j]
                if np.isnan(corr_val) or not corr_val > multicol_thresh:
                    continue
                col = numeric_features[j]
                # Keep whichever column is more correlated with the target; without
                # a numeric target there is no ranking, so the later column is dropped.
                if target_corr is not None and target_corr[row] < target_corr[col]:
                    to_drop_multi.append(row)
                else:
                    to_drop_multi.append(col)

        flagged['MulticollinearityToDrop'] = list(dict.fromkeys(to_drop_multi))

        cols_to_drop = list(dict.fromkeys(col for cols in flagged.values() for col in cols))
        self.X_train = self.X_train.drop(columns=cols_to_drop, errors='ignore')
        self.X_test = self.X_test.drop(columns=cols_to_drop, errors='ignore')

        self.flags = flagged
        print(f"Dropped {len(cols_to_drop)} columns.")

        return flagged

    def impute_missing_values(self, strategy_numeric='median', strategy_categoric='mode'):
        """Fill missing values in both splits using statistics from the training split.

        Args:
            strategy_numeric: 'median', 'mean', 'constant' (fills 0) or
                'missing_flag' (adds ``<col>_missing_flag`` indicator columns,
                then fills with the median).
            strategy_categoric: 'mode', 'missing' (fills 'Missing') or
                'constant' (fills 'Unknown').

        Raises:
            ValueError: for an unknown strategy name.
        """
        print("The imputation process has begun...")

        X_train = self.X_train.copy()
        X_test = self.X_test.copy()

        numeric_cols = X_train.select_dtypes(include=np.number).columns
        categoric_cols = X_train.select_dtypes(exclude=np.number).columns

        for col in numeric_cols:
            if strategy_numeric == 'median':
                fill_value = X_train[col].median()
            elif strategy_numeric == 'mean':
                fill_value = X_train[col].mean()
            elif strategy_numeric == 'constant':
                fill_value = 0
            elif strategy_numeric == 'missing_flag':
                X_train[col + '_missing_flag'] = X_train[col].isnull().astype(int)
                X_test[col + '_missing_flag'] = X_test[col].isnull().astype(int)
                fill_value = X_train[col].median()
            else:
                raise ValueError(f"Naməlum numeric imputation strategiyası: {strategy_numeric}")

            # An entirely missing training column has a NaN median/mean; fall back
            # to 0 so no NaN survives imputation.
            if pd.isna(fill_value):
                fill_value = 0

            X_train[col] = X_train[col].fillna(fill_value)
            X_test[col] = X_test[col].fillna(fill_value)

        for col in categoric_cols:
            if strategy_categoric == 'mode':
                modes = X_train[col].mode(dropna=True)
                fill_value = modes.iloc[0] if not modes.empty else 'Missing'
            elif strategy_categoric == 'missing':
                fill_value = 'Missing'
            elif strategy_categoric == 'constant':
                fill_value = 'Unknown'
            else:
                raise ValueError(f"Naməlum categoric imputation strategiyası: {strategy_categoric}")

            X_train[col] = X_train[col].fillna(fill_value)
            X_test[col] = X_test[col].fillna(fill_value)

        self.X_train = X_train
        self.X_test = X_test

        print("Imputation completed.")

    @staticmethod
    def _inlier_mask(column, method):
        """Return a boolean array marking the rows of ``column`` that are not outliers.

        ``method`` is 'zscore', 'iqr' or 'modified_z'. Missing values are never
        reported as outliers, whichever method is used.
        """
        missing = column.isna().to_numpy()
        median = column.median()
        filled = column.fillna(median)

        if method == 'zscore':
            scores = np.abs(stats.zscore(filled.to_numpy(dtype=float)))
            inliers = scores < 3
        elif method == 'iqr':
            q1 = column.quantile(0.25)
            q3 = column.quantile(0.75)
            spread = q3 - q1
            inliers = filled.between(q1 - 1.5 * spread, q3 + 1.5 * spread).to_numpy()
        else:  # 'modified_z'
            deviations = (filled - median).abs()
            mad = deviations.median()
            if mad == 0:
                # The score is undefined when the MAD is zero; keep every row.
                return np.ones(len(column), dtype=bool)
            inliers = (0.6745 * deviations / mad < 3.5).to_numpy()

        return inliers | missing

    def detect_and_clean_outliers(self, method='zscore'):
        """Remove outlying rows from the training split (the test split is untouched).

        Only numeric columns with at least 10 distinct values are inspected.

        Args:
            method: 'zscore' (|z| < 3), 'iqr' (1.5 * IQR fences),
                'modified_z' (MAD-based score < 3.5) or 'isolation_forest'.

        Raises:
            ValueError: for an unknown method name.
        """
        print("Outlier removal started...")

        X = self.X_train
        y = self.y_train
        numeric_cols = X.select_dtypes(include='number').columns

        valid_cols = [col for col in numeric_cols if X[col].nunique() >= 10]

        if method in ('zscore', 'iqr', 'modified_z'):
            # A row is kept only if it is an inlier in every inspected column.
            mask = np.ones(len(X), dtype=bool)
            for col in valid_cols:
                mask &= self._inlier_mask(X[col], method)

        elif method == 'isolation_forest':
            if valid_cols:
                model = IsolationForest(contamination=0.01, random_state=42)
                preds = model.fit_predict(X[valid_cols].fillna(0))
                mask = preds != -1
            else:
                # Nothing to fit on; keep every row.
                mask = np.ones(len(X), dtype=bool)

        else:
            raise ValueError(f"Unspecified outlier method: {method}")

        self.X_train = X[mask].reset_index(drop=True)
        self.y_train = y[mask].reset_index(drop=True)

        print(f"Outlier removal completed. Remaining number of observations: {len(self.X_train)}")

    def encode_features(self, manual_ordinals=None, ordinal_mappings=None):
        """Encode non-numeric columns with encoders fitted on the training split.

        Columns listed in ``manual_ordinals`` that also have an entry in
        ``ordinal_mappings`` are ordinal-encoded in the given category order.
        Every other non-numeric column is label-encoded when it has at most two
        distinct values, otherwise one-hot encoded (first level dropped).
        Categories unseen in training become -1 (label/ordinal) or an all-zero
        row (one-hot).

        Args:
            manual_ordinals: column names to treat as ordinal; ``None`` keeps
                the list already stored on the instance.
            ordinal_mappings: ``{column: ordered list of categories}``;
                ``None`` keeps the mapping already stored on the instance.
        """
        print("Encoding stage started...")

        if manual_ordinals is not None:
            self.manual_ordinals = manual_ordinals
        if ordinal_mappings is not None:
            self.ordinal_mappings = ordinal_mappings

        X_train = self.X_train.copy()
        X_test = self.X_test.copy()

        manual_ordinals = self.manual_ordinals
        ordinal_mappings = self.ordinal_mappings

        categorical_cols = X_train.select_dtypes(exclude=np.number).columns.tolist()

        # A column declared ordinal but lacking a mapping falls through to the
        # label/one-hot rules instead of being left as raw strings.
        ordinal_encoding_cols = [col for col in manual_ordinals
                                 if col in categorical_cols and col in ordinal_mappings]

        label_encoding_cols = []
        one_hot_encoding_cols = []
        for col in categorical_cols:
            if col in ordinal_encoding_cols:
                continue
            if X_train[col].nunique() <= 2:
                label_encoding_cols.append(col)
            else:
                one_hot_encoding_cols.append(col)

        # Label encoding
        for col in label_encoding_cols:
            le = LabelEncoder()
            X_train[col] = le.fit_transform(X_train[col])
            # One dictionary lookup per value instead of an encoder call per row.
            code_of = {category: code for code, category in enumerate(le.classes_)}
            X_test[col] = X_test[col].map(code_of).fillna(-1).astype('int64')

        # Ordinal encoding
        for col in ordinal_encoding_cols:
            mapping = ordinal_mappings[col]
            enc = OrdinalEncoder(categories=[mapping], handle_unknown='use_encoded_value', unknown_value=-1)
            # The encoder returns an (n, 1) array; flatten it before assigning to one column.
            X_train[col] = np.asarray(enc.fit_transform(X_train[[col]])).ravel()
            X_test[col] = np.asarray(enc.transform(X_test[[col]])).ravel()

        # One-hot encoding
        if one_hot_encoding_cols:
            ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
            ohe.fit(X_train[one_hot_encoding_cols])
            encoded_names = ohe.get_feature_names_out(one_hot_encoding_cols)

            train_encoded = pd.DataFrame(
                ohe.transform(X_train[one_hot_encoding_cols]),
                columns=encoded_names,
                index=X_train.index
            )

            test_encoded = pd.DataFrame(
                ohe.transform(X_test[one_hot_encoding_cols]),
                columns=encoded_names,
                index=X_test.index
            )

            X_train = X_train.drop(columns=one_hot_encoding_cols).join(train_encoded)
            X_test = X_test.drop(columns=one_hot_encoding_cols).join(test_encoded)

        self.X_train = X_train
        self.X_test = X_test

        print("Encoding completed.")

    def scale_features(self, scaler='standard'):
        """Scale the numeric columns of both splits with a scaler fitted on the training split.

        Args:
            scaler: 'standard', 'minmax' or 'robust'.

        Raises:
            ValueError: for an unknown scaler name.
        """
        print("Scaling started...")

        scaler_classes = {
            'standard': StandardScaler,
            'minmax': MinMaxScaler,
            'robust': RobustScaler,
        }
        scaler_cls = scaler_classes.get(scaler) if isinstance(scaler, str) else None
        if scaler_cls is None:
            raise ValueError(f"Tanınmayan scaler növü: {scaler}")

        X_train = self.X_train.copy()
        X_test = self.X_test.copy()

        numeric_cols = X_train.select_dtypes(include=np.number).columns

        # With no numeric columns there is nothing to fit, so the frames are left as they are.
        if len(numeric_cols) > 0:
            scaler_obj = scaler_cls()
            scaler_obj.fit(X_train[numeric_cols])

            X_train[numeric_cols] = scaler_obj.transform(X_train[numeric_cols])
            X_test[numeric_cols] = scaler_obj.transform(X_test[numeric_cols])

        self.X_train = X_train
        self.X_test = X_test

        print("Scaling completed.")

    def train_and_evaluate_model(self, metrics=None):
        """Fit a linear regression on the training split and report metrics on both splits.

        Args:
            metrics: iterable of metric names among 'r2', 'mae', 'mse';
                ``None`` selects all three. Unknown names are reported and skipped.

        Returns:
            ``{"train": {name: value}, "test": {name: value}}``; also stored as
            ``self.results``. The fitted estimator is stored as ``self.model``.
        """
        print("Model training and evaluation started...")

        model = LinearRegression()
        model.fit(self.X_train, self.y_train)
        self.model = model

        y_pred_train = model.predict(self.X_train)
        y_pred_test = model.predict(self.X_test)

        available_metrics = {
            "r2": r2_score,
            "mae": mean_absolute_error,
            "mse": mean_squared_error
        }

        if metrics is None:
            metrics = ["r2", "mae", "mse"]
        metrics = list(metrics)

        unknown = [name for name in metrics if name not in available_metrics]
        if unknown:
            print(f"Ignoring unknown metrics: {unknown}")
        selected = [name for name in metrics if name in available_metrics]

        results = {"train": {}, "test": {}}

        print("Train results:")
        for name in selected:
            value = available_metrics[name](self.y_train, y_pred_train)
            results["train"][name] = value
            print(f"  {name.upper()}: {value:.4f}")

        print("\nTest results:")
        for name in selected:
            value = available_metrics[name](self.y_test, y_pred_test)
            results["test"][name] = value
            print(f"  {name.upper()}: {value:.4f}")

        self.results = results
        return results


In [5]:

# Build the preprocessor from the CSV splits written by the train/test split
# cell. run_default keeps its default (False) so the pipeline is launched
# explicitly below with custom settings.
dp = DataPreprocessor("X_train.csv", "y_train.csv", "X_test.csv", "y_test.csv")

# Stricter column filters than the method defaults (0.8 nulls / 0.05 correlation);
# the remaining options match the defaults.
pipeline_settings = dict(
    null_threshold=0.7,
    correlation_threshold=0.1,
    numeric_impute='median',
    categoric_impute='mode',
    outlier_method='zscore',
    scaler='standard',
    metrics=['r2', 'mae', 'mse'],
)
dp.run_full_pipeline(**pipeline_settings)


Running full preprocessing pipeline with default settings...
Dropping uninformative columns...
Dropped 23 columns.
The imputation process has begun...
Imputation completed.
Outlier removal started...
Outlier removal completed. Remaining number of observations: 998
Encoding stage started...
Encoding completed.
Scaling started...
Scaling completed.
Model training and evaluation started...
Train results:
  R2: 0.9341
  MAE: 11846.7816
  MSE: 278881837.5721

Test results:
  R2: 0.8627
  MAE: 19066.3597
  MSE: 1053037426.8743
