In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from pandas.api.types import is_object_dtype, is_numeric_dtype
from category_encoders.woe import WOEEncoder

from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
train_in = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test_in = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

In [None]:
train_in.shape
train_in.Id.tail(10)

In [None]:
# Outlier rows are dropped here, up front, to keep the rest of the notebook simple.
# Their Ids were collected by running the scatter_ch helper (defined further down)
# on every numeric feature and picking the outliers from the plots.
IdToRemove = [935, 1299, 314 , 250 , 524 , 1062 ,1191 , 336, 707 , 379 ,1183 ,692 , 186 , 441 ,636 , 496, 198]
train_in = train_in[~train_in.Id.isin(IdToRemove)]
# The Ids above were picked from plots of the training data, so they are only
# meaningful for train_in. test_in is deliberately left unfiltered: every test
# row has to be kept so that it can receive a prediction.

In [None]:
X = train_in.drop('SalePrice', axis = 1)
y= train_in['SalePrice']
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

**Learning About Data**

In [None]:
X_tr.info()

In [None]:
# Get a general overview of the data
train_in.info()
# The columns have int64, object and float64 dtypes

In [None]:
#გადმოცემული მონაცემების shape
train_in.shape

In [None]:
# Only numerics
numeric_columns = X_tr.select_dtypes(include=['int64', 'float64']).columns

# Are there infinite Vals?
is_inf = X_tr[numeric_columns].isin([np.inf, -np.inf])

# Sum the number of infinite values in each column
print(is_inf.sum().sum())

In [None]:
#ყველა არსებული column(სახლის მახასიათებელი კომპონენტების ჩამონათვალი)
X_tr.columns

In [None]:
#როგორც ვნახეთ, 3 ტიპის ცვლადი გვაქვს, მივიღოთ ინფორმაცია
#ცალკე numeric და ცალკე categorial ცვლადებზე

#numeric
num_in = X_tr.select_dtypes(include=['int64', 'float64'])
#categorical
cat_in = X_tr.select_dtypes(include=['object'])
#ვნახოთ numeric მონაცემები
print("Numeric Data Description")
display(num_in.describe())

In [None]:
#ვნახოთ categorial მონაცემების ზოგადი მიმოხილვა
print("Categorical Data Description")
display(cat_in.describe())

In [None]:
#ვიპოვოთ null values
#დავალაგოთ კლებადობის მიხედვით
null_vals = X_tr.isnull().sum().sort_values(ascending = False)
#დავბეჭდავ მხოლოდ 19 მონაცემს, რადგან დანარჩენ მონაცემებში არაა დაკარგული data
null_vals.head(19)

In [None]:
# To decide later how missing data should be handled, look at the share of
# missing values (in percent) in each column.
null_perc_col = X_tr.isnull().sum() / len(X_tr) * 100
null_perc_col_info = null_perc_col.sort_values(ascending=False).head(19)

# Build the two-column summary table in a single step for a nicer printout
null_perc_col_sid = pd.DataFrame({
    'Missing Data': null_perc_col_info.index,
    'Percentage (%)': null_perc_col_info.values,
})

print(null_perc_col_sid)

# 19 features have missing data, and 6 of them are missing more than 50% of their values

# **Data Cleaning**

**Dropping features with most missing values**

In [None]:
# Drop the features with the largest share of missing values from every split
sparse_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence']
X_tr = X_tr.drop(columns=sparse_cols)
X_val = X_val.drop(columns=sparse_cols)
test_in = test_in.drop(columns=sparse_cols)

In [None]:
null_perc_row = (X_tr.isnull().sum(axis=1) / len(X_tr.columns)) * 100
null_perc_row_info = null_perc_row.sort_values(ascending = False).head(15)
#create DataFrame უკეთესი ცხრილისთვის
null_perc_row_sid = pd.DataFrame({
    'Missing Data' : null_perc_row_info.index
})

null_perc_row_sid['Percentage (%)'] = null_perc_row_info.values

print(null_perc_row_sid)

In [None]:
missing_feat = X_tr.columns[X_tr.isnull().sum() >0].tolist()
#non_missing_deat = X_tr.columns[X_tr.isnull().sum()==0].tolist()
missing_feat

In [None]:
#გრაფიკულად დავინახოთ განსხვავება missing vs non-missing data
#ამისთვის განსაზღვრული ფუნქცია:

def missing_vs_nonmissing(feature, data, target = 'SalePrice'):
    """Compare `target` between rows where `feature` is missing and rows where it is present.

    Prints the mean of `target` for the two groups and shows a box plot of
    `target` split by the missing/non-missing flag of `feature`.

    Parameters:
        feature: name of the column whose missingness is inspected.
        data: DataFrame containing both `feature` and `target`.
        target: name of the column to compare (defaults to 'SalePrice').
    """
    is_missing = data[feature].isnull()
    print(data.groupby(is_missing)[target].mean())

    plt.figure(figsize=(8,4))
    # The boolean flag is cast to 0/1 so it can serve as the x-axis category
    sns.boxplot(x=is_missing.astype(int), y=target, data=data)

    plt.xticks([0, 1], ['Non-Missing', 'Missing'])
    plt.xlabel(f'{feature}')
    plt.ylabel(target)
    plt.title(f"{target} with missing vs non-missing vals  '{feature}'")
    plt.show()

In [None]:
#რადგანაც ვნახეთ, რომ null_vals დიდი გავლენა აქვს სახლის ფასებზე, შევეცადოთ მათი გამოსწორება
#missing_feat გვაქვს უკვე
missing_feat = X_tr.columns[X_tr.isnull().sum() >0].tolist()
missing_df = X_tr[missing_feat]
categorial_mf=missing_df.select_dtypes(include = ['object']).columns.tolist()
numeric_mf=missing_df.select_dtypes(include=['int64','float64']).columns.tolist()
print('დაკარგული კატეგორიული მახასიათებლები')
display(categorial_mf)
print('')
print('დაკარგული რიცხვობრივი მახასიათებლები')
display(numeric_mf)
#დავყავით 2 ნაწილად missing features, რადგან განსხვავებულად უნდა მოვექცეთ

**Handling Missing Values**

In [None]:
def filling(inps, col, val):
    """Replace missing entries of column `col` with `val` in every frame of `inps`.

    Each frame is updated in place (its `col` column is reassigned); nothing is returned.
    """
    for frame in inps:
        filled = frame[col].fillna(val)
        frame[col] = filled

def fill_missing(X_tr, X_val, test_in, train_in):
    """Impute missing categorical values using the mode of the training split.

    A column is treated as categorical when `train_in[col]` has object dtype.
    The fill value is the most frequent value of `X_tr[col]`, and it is applied
    to `X_tr`, `X_val` and `test_in` (all modified in place via `filling`).

    Columns that are entirely missing in `X_tr` have no mode; they are skipped
    instead of raising on the empty `mode()` result.
    """
    for col in X_tr.columns:
        if not is_object_dtype(train_in[col]):
            continue
        modes = X_tr[col].mode()
        if modes.empty:
            # Nothing observed in the training split, so there is no value to impute with
            continue
        filling([X_tr, X_val, test_in], col, modes.iloc[0])

def fill_missing_num(X_tr, X_val, test_in, train_in):
    """Impute missing numeric values using the mean of the training split.

    A column is treated as numeric when `train_in[col]` has a numeric dtype.
    The mean of `X_tr[col]` is used as the fill value for `X_tr`, `X_val` and
    `test_in`, which are all modified in place via `filling`.
    """
    splits = [X_tr, X_val, test_in]
    for col in X_tr.columns:
        if not is_numeric_dtype(train_in[col]):
            continue
        filling(splits, col, X_tr[col].mean())


fill_missing(X_tr, X_val, test_in, train_in)
fill_missing_num(X_tr, X_val, test_in, train_in)

In [None]:
#გადავამოწმოთ კვლავ გვაქვს თუ არა missing data 
still_missing_feat = X_tr.columns[X_tr.isnull().sum() >0].tolist()
display(still_missing_feat)
print("ცარიელია, ანუ აღარ გვაქვს empty data")

# **Feature Selection**

**Removing outliers by visualising**

In [None]:
# Continue exploring the features and their effect on the price.
# A few scatter plots of the numerical data make the outliers visible by eye;
# the helper below draws one such plot.
def scatter_ch(feature, data):
    """Show a scatter plot of `feature` (x-axis) against SalePrice (y-axis) for `data`."""
    # The y argument is a column name that matplotlib resolves through `data`
    plt.scatter(data[feature], 'SalePrice', data =data)
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.title(f'{feature} vs SalePrice')
    plt.show()
# This was run on every numerical feature; the plotting calls were removed
# from the notebook afterwards because they slow the code down a lot.

In [None]:
#შევამოწმოთ კორელაციები numeric value-ებთან
numeric_cols = X_tr.select_dtypes(include=['int64','float64'])
corr_checker = numeric_cols.corr()

**Dropping highly correlated numerics**

In [None]:
threshold = 0.85
corr_matrix_abs = corr_checker.abs()
# Every ordered pair of distinct features whose absolute correlation exceeds
# the threshold (so each unordered pair shows up twice, once per order).
high_corr_pairs = [
    (first, second)
    for first in corr_matrix_abs.columns
    for second in corr_matrix_abs.columns
    if first != second and corr_matrix_abs.at[second, first] > threshold
]
print(high_corr_pairs)

In [None]:
#Drop one of GarageArea or GarageCars
X_tr = X_tr.drop(columns=['GarageArea'])
X_val = X_val.drop(columns=['GarageArea'])
test_in = test_in.drop(columns=['GarageArea'])

**Dropping redundant features**

In [None]:
def c_col_cat(inp):
    """Return, for each object-dtype column, the percentage of rows holding its most frequent value.

    The percentage is relative to the total number of rows of `inp`.
    """
    n_rows = len(inp)

    def top_value_share(col):
        # value_counts() is sorted by frequency, so the first entry is the top count
        return (col.value_counts().iloc[0] / n_rows) * 100

    return inp.select_dtypes(include=['object']).apply(top_value_share)

def c_col_num(inp):
    """Return, for each numeric column, the percentage of rows holding its most frequent value.

    The percentage is relative to the total number of rows of `inp`.
    """
    n_rows = len(inp)

    def top_value_share(col):
        # value_counts() is sorted by frequency, so the first entry is the top count
        return (col.value_counts().iloc[0] / n_rows) * 100

    return inp.select_dtypes(include=['number']).apply(top_value_share)

In [None]:
print('Categorical Data  Repeated Percentage')
print(c_col_cat(X_tr).sort_values(ascending = False).head(5))

In [None]:
print('Numerical Data  Repeated Percentage')
print(c_col_num(X_tr).sort_values(ascending = False).head(5))

In [None]:
# Share (in %) of the most frequent value per categorical / numeric column,
# computed once on the training split and reused below.
cat = c_col_cat(X_tr)
num = c_col_num(X_tr)
# Columns where a single value covers more than 97% of the training rows
bigger_97 = list(cat[cat > 97].index) + list(num[num > 97].index)
print(bigger_97)
# The same training-derived column list is removed from every split
X_tr = X_tr.drop(bigger_97, axis=1)
X_val = X_val.drop(bigger_97, axis=1)
test_in = test_in.drop(bigger_97, axis=1)

# **Feature Engineering**

In [None]:
# Create new features; the same four derivations are added to each split in turn
for _frame in (X_tr, X_val, test_in):
    # Sum of first floor, second floor and basement square footage
    _frame['SF'] = _frame['1stFlrSF'] + _frame['2ndFlrSF'] + _frame['TotalBsmtSF']
    # Sum of the screen, open and enclosed porch columns
    _frame['TotalPorch'] = _frame['ScreenPorch'] + _frame['OpenPorchSF'] + _frame['EnclosedPorch']
    # Interaction terms (products of two existing columns)
    _frame['HouseArea'] = _frame['GrLivArea'] * _frame['TotalBsmtSF']
    _frame['OverallQual_GrLivArea'] = _frame['OverallQual'] * _frame['GrLivArea']

In [None]:
# Raw area, porch and bathroom columns to remove
drop_cols = [
    '1stFlrSF', '2ndFlrSF', 'TotalBsmtSF',
    'ScreenPorch', 'OpenPorchSF', 'EnclosedPorch',
    'FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath'
]

# Remove the listed columns together with the Id column from every split
X_tr = X_tr.drop(columns=drop_cols + ['Id'])
X_val = X_val.drop(columns=drop_cols + ['Id'])
test_in = test_in.drop(columns=drop_cols + ['Id'])

**Transforming categorical data into numeric**

In [None]:
#when encoding, need to separate nominal and ordinal cats
def categorize_cols(col, inp, threshold, list_bin , list_multi):
    """Append `col` to `list_bin` or `list_multi` according to its number of distinct values.

    Only object-dtype columns are sorted into a list; any other column is ignored.
    A column with at most `threshold` distinct values goes to `list_bin`,
    otherwise to `list_multi`. Both lists are modified in place.
    """
    if not (inp[col].dtype == 'object'):
        return
    bucket = list_bin if inp[col].nunique() <= threshold else list_multi
    bucket.append(col)


def separate_input(inp, threshold =2):
    """Split the object-dtype columns of `inp` by cardinality.

    Returns a pair of lists `(low, high)`: `low` holds the columns with at most
    `threshold` distinct values, `high` holds the remaining object columns.
    The sorting of each column is delegated to `categorize_cols`.
    """
    low_cardinality, high_cardinality = [], []
    for name in inp.columns:
        categorize_cols(name, inp, threshold, low_cardinality, high_cardinality)
    return low_cardinality, high_cardinality

# Binarise the target
def trans_pr(inp):
    """Return 1 where a value of `inp` is at or above the median of `inp`, else 0."""
    cutoff = inp.median()
    at_or_above = inp >= cutoff
    return at_or_above.astype(int)

# Create and fit the WOE encoder that is applied later on
def woe_trans(inp, dest, columns):
    """Fit a WOEEncoder for `columns` on `inp[columns]` against the target `dest` and return it."""
    features = inp[columns]
    woe = WOEEncoder(cols=columns)
    woe.fit(features, dest)
    return woe


def use_woe(ecdr, train, val, test, columns ):
    """Apply the fitted encoder `ecdr` to `columns` of the train, validation and test frames.

    Returns the three transformed results as a tuple in (train, val, test) order.
    """
    return tuple(ecdr.transform(split[columns]) for split in (train, val, test))


def one_hot_enc(inp, columns):
    """One-hot encode `columns` of `inp` with integer dummies, dropping the first level of each column."""
    dummy_options = {'columns': columns, 'drop_first': True, 'dtype': int}
    return pd.get_dummies(inp, **dummy_options)

In [None]:

binary_cols, multi_cols = separate_input(X_tr)
y_binary = trans_pr(y_tr)

# The WOE encoder is fitted on the training split only and then applied to all splits
encoder = woe_trans(X_tr, y_binary, multi_cols)
X_tr_woe, X_val_woe, test_woe = use_woe(encoder, X_tr, X_val, test_in, multi_cols)

X_tr[multi_cols] = X_tr_woe
X_val[multi_cols] = X_val_woe
test_in[multi_cols] = test_woe

# pd.get_dummies derives the dummy columns from the values present in the frame
# it receives, so encoding each split independently can produce different
# columns (e.g. when a split contains only one of the two levels). Pinning the
# levels to those seen in the training split makes all three splits produce
# exactly the same dummy columns; a level unseen in training becomes all zeros.
for _col in binary_cols:
    _train_levels = pd.CategoricalDtype(sorted(X_tr[_col].dropna().unique()))
    X_tr[_col] = X_tr[_col].astype(_train_levels)
    X_val[_col] = X_val[_col].astype(_train_levels)
    test_in[_col] = test_in[_col].astype(_train_levels)

X_tr = one_hot_enc(X_tr, binary_cols)
X_val = one_hot_enc(X_val, binary_cols)
test_in = one_hot_enc(test_in, binary_cols)

**Checking correlation and dropping highly correlated features again**

In [None]:
# Look at the feature correlations after the categorical columns were encoded
corr_ch = X_tr.corr()
# Keep only the strict upper triangle so each pair is reported once
mask = np.triu(np.ones(corr_ch.shape), k=1)
corr_pairs = corr_ch.where(mask == 1).stack()
# Compare the magnitude, as the earlier numeric check does, so that strong
# negative correlations are reported too; the printed value keeps its sign.
high_corr_pairs = corr_pairs[corr_pairs.abs() >= 0.8]
for (feature_1, feature_2), value in high_corr_pairs.items():
    print(f"Features: {feature_1} & {feature_2} | Correlation: {value:.2f}")

In [None]:
# Remove some of the correlated features from every split
correlated_cols = ['Exterior2nd', 'TotRmsAbvGrd', 'SaleCondition']
X_tr = X_tr.drop(columns=correlated_cols)
X_val = X_val.drop(columns=correlated_cols)
test_in = test_in.drop(columns=correlated_cols)

# **Training**

In [None]:
#Finished with training my data
#Get libraries 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score,mean_absolute_percentage_error
from sklearn.metrics import mean_squared_log_error
from sklearn.feature_selection import RFE

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error, mean_squared_log_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler


scaler = StandardScaler()
scalerMinMax = MinMaxScaler()

In [None]:
#Helper functions for additional calculations 
def adjusted_r2(r2, n_samples, n_features):
    """Return R^2 adjusted for the number of features.

    Computes 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1).
    """
    dof_ratio = (n_samples - 1) / (n_samples - n_features - 1)
    unexplained = 1 - r2
    return 1 - unexplained * dof_ratio

def f_statistic(y_true, y_pred, n_features):
    """Return the ratio of the regression mean square to the residual mean square.

    The regression sum of squares is taken around the mean of `y_true` and
    divided by `n_features`; the residual sum of squares is divided by
    `len(y_true) - n_features - 1`.
    """
    n_obs = len(y_true)
    explained = np.sum((y_pred - np.mean(y_true)) ** 2)
    residual = np.sum((y_true - y_pred) ** 2)
    mean_sq_regression = explained / n_features
    mean_sq_error = residual / (n_obs - n_features - 1)
    return mean_sq_regression / mean_sq_error

In [None]:
# Helper function for logging results into MLFlow
def logging_model_metr(name, metrics, params=None):
    """Log a model's name, its metrics and (optionally) its parameters through mlflow.

    Parameters:
        name: value stored under the "Model Name" parameter.
        metrics: mapping of metric name -> value, each sent to mlflow.log_metric.
        params: optional mapping of parameter name -> value, each sent to
            mlflow.log_param; nothing extra is logged when it is None or empty.
    """
    mlflow.log_param("Model Name", name)
    for metric_name, metric_value in metrics.items():
        mlflow.log_metric(metric_name, metric_value)
    for param_name, param_value in (params or {}).items():
        mlflow.log_param(param_name, param_value)

In [None]:
!pip install dagshub

In [None]:
import dagshub
dagshub.init(repo_owner='kechik21',repo_name='ML_HW1',mlflow=True)

In [None]:
!pip install mlflow

**Linear Regression**

In [None]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import GridSearchCV

# Start MLflow experiment
mlflow.set_experiment("ML_HW1_Models")

# Linear Regression
with mlflow.start_run(run_name="Linear Regression"):
    lr_model = LinearRegression()
    lr_model.fit(X_tr, y_tr)
    y_pred_lr = lr_model.predict(X_val)

    # Calculate metrics on the validation split
    r2_lr = r2_score(y_val, y_pred_lr)
    adj_r2_lr = adjusted_r2(r2_lr, X_val.shape[0], X_val.shape[1])
    rmse_lr = np.sqrt(mean_squared_error(y_val, y_pred_lr))
    mae_lr = mean_absolute_error(y_val, y_pred_lr)
    # A linear model is unconstrained and can predict negative prices, and
    # mean_squared_log_error raises ValueError on negative values. Clip the
    # predictions at 0 for this one metric so the run is not aborted.
    rmsle_lr = np.sqrt(mean_squared_log_error(y_val, np.clip(y_pred_lr, 0, None)))
    fstat_lr = f_statistic(y_val, y_pred_lr, X_val.shape[1])
    # 5-fold cross-validated RMSE on the training split (the scorer returns negated RMSE)
    cv_rmse_lr = -np.mean(cross_val_score(lr_model, X_tr, y_tr, cv=5, scoring='neg_root_mean_squared_error'))

    # Log metrics
    metrics = {
        "R2": r2_lr,
        "Adjusted_R2": adj_r2_lr,
        "RMSE": rmse_lr,
        "MAE": mae_lr,
        "RMSLE": rmsle_lr,
        "F-statistic": fstat_lr,
        "CV_RMSE": cv_rmse_lr
    }
    logging_model_metr("Linear Regression", metrics)

    # Log model
    mlflow.sklearn.log_model(lr_model, "linear_regression_model")

# Lasso Regression
with mlflow.start_run(run_name="Lasso Regression"):
    lasso = Lasso(max_iter=10000)
    # Regularisation strengths explored by the 5-fold grid search
    lasso_params = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1]}
    lasso_grid = GridSearchCV(lasso, lasso_params, cv=5, scoring='neg_root_mean_squared_error')
    lasso_grid.fit(X_tr, y_tr)
    best_lasso = lasso_grid.best_estimator_

    y_pred_lasso = best_lasso.predict(X_val)

    # Calculate metrics on the validation split
    r2_lasso = r2_score(y_val, y_pred_lasso)
    adj_r2_lasso = adjusted_r2(r2_lasso, X_val.shape[0], X_val.shape[1])
    rmse_lasso = np.sqrt(mean_squared_error(y_val, y_pred_lasso))
    mae_lasso = mean_absolute_error(y_val, y_pred_lasso)
    # A linear model is unconstrained and can predict negative prices, and
    # mean_squared_log_error raises ValueError on negative values. Clip the
    # predictions at 0 for this one metric so the run is not aborted.
    rmsle_lasso = np.sqrt(mean_squared_log_error(y_val, np.clip(y_pred_lasso, 0, None)))
    fstat_lasso = f_statistic(y_val, y_pred_lasso, X_val.shape[1])
    # 5-fold cross-validated RMSE on the training split (the scorer returns negated RMSE)
    cv_rmse_lasso = -np.mean(cross_val_score(best_lasso, X_tr, y_tr, cv=5, scoring='neg_root_mean_squared_error'))

    # Log metrics and parameters
    metrics = {
        "R2": r2_lasso,
        "Adjusted_R2": adj_r2_lasso,
        "RMSE": rmse_lasso,
        "MAE": mae_lasso,
        "RMSLE": rmsle_lasso,
        "F-statistic": fstat_lasso,
        "CV_RMSE": cv_rmse_lasso
    }
    params = {"alpha": best_lasso.alpha}
    logging_model_metr("Lasso Regression", metrics, params)

    # Log model
    mlflow.sklearn.log_model(best_lasso, "lasso_model")

# Random Forest
with mlflow.start_run(run_name="Random Forest"):
    # Hyper-parameter grid explored by the 3-fold search below
    rf_params = {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
    }
    rf = RandomForestRegressor(random_state=42)
    rf_grid = GridSearchCV(
        estimator=rf,
        param_grid=rf_params,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
    )
    rf_grid.fit(X_tr, y_tr)
    best_rf = rf_grid.best_estimator_

    # Predictions of the tuned forest on the validation split
    y_pred_rf = best_rf.predict(X_val)

    # Validation metrics
    mae_rf = mean_absolute_error(y_val, y_pred_rf)
    rmse_rf = np.sqrt(mean_squared_error(y_val, y_pred_rf))
    rmsle_rf = np.sqrt(mean_squared_log_error(y_val, y_pred_rf))
    r2_rf = r2_score(y_val, y_pred_rf)
    adj_r2_rf = adjusted_r2(r2_rf, X_val.shape[0], X_val.shape[1])
    fstat_rf = f_statistic(y_val, y_pred_rf, X_val.shape[1])
    # 5-fold cross-validated RMSE on the training split (the scorer returns negated RMSE)
    cv_rmse_rf = -cross_val_score(
        best_rf, X_tr, y_tr, cv=5, scoring='neg_root_mean_squared_error'
    ).mean()

    # Log metrics together with the winning grid-search parameters
    metrics = {
        "R2": r2_rf,
        "Adjusted_R2": adj_r2_rf,
        "RMSE": rmse_rf,
        "MAE": mae_rf,
        "RMSLE": rmsle_rf,
        "F-statistic": fstat_rf,
        "CV_RMSE": cv_rmse_rf
    }
    logging_model_metr("Random Forest", metrics, rf_grid.best_params_)

    # Log model
    mlflow.sklearn.log_model(best_rf, "random_forest_model")

# XGBoost
with mlflow.start_run(run_name="XGBoost"):
    xgb = XGBRegressor(random_state=42, verbosity=0)
    # Hyper-parameter grid explored by the 3-fold search below
    xgb_params = {
        'n_estimators': [100, 200],
        'max_depth': [3, 5],
        'learning_rate': [0.01, 0.1],
        'subsample': [0.8, 1.0]
    }
    xgb_grid = GridSearchCV(xgb, xgb_params, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)
    xgb_grid.fit(X_tr, y_tr)
    best_xgb = xgb_grid.best_estimator_

    y_pred_xgb = best_xgb.predict(X_val)

    # Calculate metrics on the validation split
    r2_xgb = r2_score(y_val, y_pred_xgb)
    adj_r2_xgb = adjusted_r2(r2_xgb, X_val.shape[0], X_val.shape[1])
    rmse_xgb = np.sqrt(mean_squared_error(y_val, y_pred_xgb))
    mae_xgb = mean_absolute_error(y_val, y_pred_xgb)
    # Boosted-tree outputs are not bounded below, and mean_squared_log_error
    # raises ValueError on negative values. Clip the predictions at 0 for this
    # one metric so the run is not aborted.
    rmsle_xgb = np.sqrt(mean_squared_log_error(y_val, np.clip(y_pred_xgb, 0, None)))
    fstat_xgb = f_statistic(y_val, y_pred_xgb, X_val.shape[1])
    # 5-fold cross-validated RMSE on the training split (the scorer returns negated RMSE)
    cv_rmse_xgb = -np.mean(cross_val_score(best_xgb, X_tr, y_tr, cv=5, scoring='neg_root_mean_squared_error'))

    # Log metrics and parameters
    metrics = {
        "R2": r2_xgb,
        "Adjusted_R2": adj_r2_xgb,
        "RMSE": rmse_xgb,
        "MAE": mae_xgb,
        "RMSLE": rmsle_xgb,
        "F-statistic": fstat_xgb,
        "CV_RMSE": cv_rmse_xgb
    }
    logging_model_metr("XGBoost", metrics, xgb_grid.best_params_)

    # Log model
    mlflow.sklearn.log_model(best_xgb, "xgboost_model")

# **Process Test Data**

In [None]:
test_rem=pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
test_rem.columns

In [None]:
nulls =  test_rem.isnull().sum().sort_values(ascending = False)
#დავბეჭდავ მხოლოდ 19 მონაცემს, რადგან დანარჩენ მონაცემებში არაა დაკარგული data
nulls.head(19)

In [None]:
#იმისთვის, რომ სამომავლოდ შევაფასოთ, თუ როგორ უნდა მოვექცეთ დაკარგულ 
#მონაცემებს, ვნახოთ პროცენტულად რა ნაწილს შეადგენენ თითოეულ კატეგორიაში
null_perc_col = (test_rem.isnull().sum() / len(test_rem)) * 100
null_perc_col_info = null_perc_col.sort_values(ascending = False).head(19)
#create DataFrame უკეთესი ცხრილისთვის
null_perc_col_sid = pd.DataFrame({
    'Missing Data' : null_perc_col_info.index
})

null_perc_col_sid['Percentage (%)'] = null_perc_col_info.values

print(null_perc_col_sid)

#19 მახასიათებელს აკლია მონაცემი და აქედან 6-ს მონაცემების 50%ზე მეტი

In [None]:
test_rem = test_rem.drop(columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence'], axis = 1)


In [None]:
def fill_missing_testrem(test_rem):
    """Impute the missing values of `test_rem` in place.

    Object-dtype columns are filled with their own most frequent value and
    numeric columns with their own mean. Columns of any other dtype are left
    untouched. Nothing is returned.

    An object column that is entirely missing has no mode; it is skipped
    instead of raising on the empty `mode()` result.

    NOTE(review): the fill values come from `test_rem` itself, whereas
    `fill_missing` / `fill_missing_num` use statistics of the training split —
    confirm that this difference is intended.
    """
    for c in test_rem.columns:
        column = test_rem[c]
        if is_object_dtype(column):
            modes = column.mode()
            if modes.empty:
                # No observed value at all, so there is nothing to impute with
                continue
            test_rem[c] = column.fillna(modes.iloc[0])
        elif is_numeric_dtype(column):
            test_rem[c] = column.fillna(column.mean())
    
    

In [None]:
fill_missing_testrem(test_rem)

In [None]:
# Check whether the test data still has missing values. Display the list that
# was just computed for test_rem (previously the stale training-side
# still_missing_feat was displayed instead).
tstill_missing_feat = test_rem.columns[test_rem.isnull().sum() >0].tolist()
display(tstill_missing_feat)
print("ცარიელია, ანუ აღარ გვაქვს empty data")


In [None]:
test_rem= test_rem.drop('Id',axis =1)

In [None]:
test_rem.shape

In [None]:
numeric_cols = test_rem.select_dtypes(include=['int64','float64'])
corr_checker = numeric_cols.corr()
threshold = 0.85
corr_matrix_abs = corr_checker.abs()
high_corr_pairs = [(i, j) for i in corr_matrix_abs.columns for j in corr_matrix_abs.columns if i != j and corr_matrix_abs[i][j] > threshold]
print(high_corr_pairs)

In [None]:
#test_rem= test_rem.drop('Id',axis =1)
test_rem= test_rem.drop('GarageArea',axis =1)

In [None]:
print('Categorical Data  Repeated Percentage')
print(c_col_cat(test_rem).sort_values(ascending = False).head(5))

In [None]:
print('Numerical Data  Repeated Percentage')
print(c_col_num(test_rem).sort_values(ascending = False).head(5))

In [None]:
cat = c_col_cat(test_rem)
num = c_col_num(test_rem)
# Near-constant columns as measured on the test data: printed for information only
test_bigger_97 = list(cat[cat > 97].index) + list(num[num > 97].index)
print(test_bigger_97)
# Drop exactly the columns that were removed from the training data
# (bigger_97, computed on X_tr in the Feature Selection section). Deriving the
# list from the test data instead could remove a different set of columns and
# leave test_rem with features that do not match X_tr.
test_rem = test_rem.drop(bigger_97, axis=1)


In [None]:
test_rem.shape

In [None]:
test_rem['SF'] = test_rem['1stFlrSF'] + test_rem['2ndFlrSF'] + test_rem['TotalBsmtSF']
test_rem['TotalPorch'] = test_rem['ScreenPorch'] + test_rem['OpenPorchSF'] + test_rem['EnclosedPorch']
test_rem['HouseArea'] = test_rem['GrLivArea'] * test_rem['TotalBsmtSF']
test_rem['OverallQual_GrLivArea'] = test_rem['OverallQual'] * test_rem['GrLivArea']

In [None]:
drop_cols = [
    '1stFlrSF', '2ndFlrSF', 'TotalBsmtSF',
    'ScreenPorch', 'OpenPorchSF', 'EnclosedPorch',
    'FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath'
]


test_rem = test_rem.drop(drop_cols, axis =1)
#test_rem = test_rem.drop('Id', axis =1)

In [None]:
# Cardinality split of the test columns; binary_cols_rem / multi_cols_rem are
# not used in the visible part of the notebook.
binary_cols_rem, multi_cols_rem = separate_input(test_rem)
# Apply the WOE encoder fitted on the training split, using the training-side
# multi_cols / binary_cols lists so the test data is encoded the same way.
test_rem_woe = encoder.transform(test_rem[multi_cols])  
test_rem[multi_cols] = test_rem_woe
test_rem = one_hot_enc(test_rem, binary_cols)  

In [None]:
test_rem = test_rem.drop(['Exterior2nd', 'TotRmsAbvGrd', 'SaleCondition'], axis=1)


In [None]:
X_tr.shape

In [None]:
test_rem.shape

In [None]:
test_rem.to_csv('/kaggle/working/new_test_data.csv', index=False)


In [None]:
# Register the best model. Using the run as a context manager guarantees that
# it is closed even if one of the logging calls raises.
with mlflow.start_run():
    mlflow.log_param("Best", "XGBoost")
    # Log the validation R2 measured for best_xgb rather than a hand-typed constant
    mlflow.log_metric("R2", r2_xgb)
    mlflow.sklearn.log_model(
        sk_model=best_xgb,
        artifact_path="Models",
        registered_model_name="MyModel"
    )