In [34]:
# Locations of the raw listing datasets, relative to the notebook's working
# directory. Forward slashes are used because they resolve on every platform
# (Windows included), whereas a hard-coded backslash only works on Windows.
dataset_train = "dataset/ingatlan.com_training_final.csv"
dataset_val = "dataset/ingatlan.com_testing_final.csv"
dataset = "dataset/ingatlan.com_full_final.csv"
# Seed shared by every train/test split and estimator in this notebook.
RANDOM_STATE = 39612
import numpy as np
import matplotlib as plt
import pandas as pd
import re
import roman
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
import joblib

In [35]:
# Read the training table from CSV into `mldf`; the training cells below
# take both their features and the 'price_created_at' target from it.
mldf = pd.read_csv('finalized_for_train.csv')
# Bare expression so the notebook renders the frame as this cell's output.
mldf

Unnamed: 0,postcode,property_subtype,property_condition_type,property_floor,building_floor_count,view_type,orientation,garden_access,heating_type,elevator_type,room_cnt,small_room_cnt,created_at,property_area,balcony_area,price_created_at,meroszam
0,1015.0,1,2,4,-1,0.0,1,0,3,1,1.0,0.0,354,40.0,4.0,22.9,800.000000
1,1012.0,1,3,3,2,2.0,2,0,2,1,1.0,1.0,435,70.0,0.0,33.0,185.294118
2,1016.0,1,4,3,-1,0.0,2,0,1,0,1.0,1.0,302,40.0,1.0,21.5,118.055556
3,1016.0,1,2,0,-1,0.0,0,0,2,0,1.0,0.0,129,35.0,0.0,13.0,407.692308
4,1015.0,1,3,1,1,1.0,0,0,2,1,1.0,2.0,302,55.0,0.0,33.5,294.230769
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248087,1131.0,1,3,1,0,3.0,2,1,1,0,2.0,0.0,564,50.0,0.0,15.0,8933.333333
248088,1121.0,1,1,2,-1,0.0,0,0,1,0,2.0,0.0,564,65.0,6.0,23.9,4675.000000
248089,1111.0,1,4,3,-1,0.0,2,0,0,1,1.0,1.0,565,55.0,3.0,33.5,4800.000000
248090,1081.0,1,2,3,4,0.0,2,0,0,1,2.0,0.0,566,55.0,8.0,18.0,3833.333333


In [39]:
def train_eval_linreg(df, target_cols):
    """Fit a linear regression on a 70/30 split and report hold-out metrics.

    Prints the mean squared error, the mean absolute percentage error (as a
    percentage) and a ranking of the features by absolute coefficient size,
    then stores the fitted model in ``lr_model.pkl`` with joblib.

    Note: the inputs are not scaled inside this function, so the coefficient
    magnitudes (and the percentages derived from them) depend on the units of
    each column.

    Args:
        df: DataFrame containing the input columns and ``price_created_at``.
        target_cols: Labels of the columns used as model inputs.

    Returns:
        None.
    """
    features = df[target_cols]
    prices = df['price_created_at']
    features_fit, features_eval, prices_fit, prices_eval = train_test_split(
        features, prices, test_size=0.3, random_state=RANDOM_STATE
    )

    regressor = LinearRegression()
    regressor.fit(features_fit, prices_fit)
    predicted = regressor.predict(features_eval)

    # Hold-out metrics.
    squared_error = mean_squared_error(prices_eval, predicted)
    print(f'LR Mean Squared Error: {squared_error}')
    percentage_error = mean_absolute_percentage_error(prices_eval, predicted) * 100
    print(f'LR Mean Absolute Percentage Error: {percentage_error}')

    # Share of each coefficient in the summed absolute coefficients.
    coef_table = pd.DataFrame({'feature': features.columns, 'coefficient': regressor.coef_})
    magnitude = coef_table['coefficient'].abs()
    coef_table['importance_percent'] = 100 * magnitude / magnitude.sum()
    print("Linear Regression Coefficient Importances (%):")
    print(coef_table.sort_values('importance_percent', ascending=False))

    # Persist the fitted model next to the notebook.
    output_path = "lr_model.pkl"
    joblib.dump(regressor, output_path)
    print(f"Model saved to {output_path}")

def train_eval_gbm_gridsearch(df, target_cols):
    """Grid-search a GradientBoostingRegressor, evaluate it and save the winner.

    The frame is split 70/30 (seeded with ``RANDOM_STATE``). A 3-fold
    ``GridSearchCV`` over ``n_estimators``, ``learning_rate`` and ``max_depth``
    (27 combinations, scored by negative MAPE, run on all cores) is fitted on
    the training part. The best estimator is then scored on the hold-out part;
    MSE, MAPE (as a percentage), the winning parameters and the feature
    importances are printed, and the model is written to ``gbm_model.pkl``.

    Args:
        df: DataFrame containing the input columns and ``price_created_at``.
        target_cols: Labels of the columns used as model inputs.

    Returns:
        None.
    """
    X = df[target_cols]
    y = df['price_created_at']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)

    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 4, 5]
    }

    gbm = GradientBoostingRegressor(random_state=RANDOM_STATE)
    grid_search = GridSearchCV(estimator=gbm, param_grid=param_grid, cv=3, scoring='neg_mean_absolute_percentage_error', n_jobs=-1)

    # Fit GridSearchCV. With the default refit=True the best parameter set is
    # re-fitted on the whole training split, so best_estimator_ is ready to
    # use and must not be fitted a second time.
    grid_search.fit(X_train, y_train)
    best_gbm = grid_search.best_estimator_

    y_pred = best_gbm.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    print(f"GBM Mean Squared Error: {mse}")
    mape = mean_absolute_percentage_error(y_test, y_pred) * 100
    print(f"GBM Mean Absolute Percentage Error: {mape}")
    # Report the winning grid point (previously the bound method object
    # `get_params` was interpolated, which printed its repr, not the values).
    print(f"Best GBM params {grid_search.best_params_}")

    # Normalise the importances so they are reported as percentages.
    importances = best_gbm.feature_importances_
    importance_df = pd.DataFrame({'feature': X.columns, 'importance': importances})
    importance_df['importance_percent'] = 100 * importance_df['importance'] / importance_df['importance'].sum()
    print("GBM Feature Importances (%):")
    print(importance_df.sort_values('importance_percent', ascending=False))

    # Persist the tuned model next to the notebook.
    gbm_model_saved = "gbm_model.pkl"
    joblib.dump(best_gbm, gbm_model_saved)
    print(f"Model saved to {gbm_model_saved}")

def train_eval_gbm(df, target_cols):
    """Fit a fixed-parameter GradientBoostingRegressor and print its metrics.

    Uses a 70/30 split seeded with ``RANDOM_STATE`` and a model with 300
    estimators, learning rate 0.05 and depth 5. Prints the hold-out MSE and
    the MAPE as a percentage. Nothing is saved or returned.

    Args:
        df: DataFrame containing the input columns and ``price_created_at``.
        target_cols: Labels of the columns used as model inputs.

    Returns:
        None.
    """
    features = df[target_cols]
    prices = df['price_created_at']
    features_fit, features_eval, prices_fit, prices_eval = train_test_split(
        features, prices, test_size=0.3, random_state=RANDOM_STATE
    )

    booster = GradientBoostingRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=5,
        random_state=RANDOM_STATE,
    )
    booster.fit(features_fit, prices_fit)
    predicted = booster.predict(features_eval)

    squared_error = mean_squared_error(prices_eval, predicted)
    print(f"GBM Mean Squared Error: {squared_error}")
    percentage_error = mean_absolute_percentage_error(prices_eval, predicted) * 100
    print(f"GBM Mean Absolute Percentage Error: {percentage_error}")

def train_eval_xgbm(df, target_cols):
    """Fit an XGBRegressor, print hold-out metrics, save it and rank features.

    Uses a 70/30 split seeded with ``RANDOM_STATE`` and a model with 650
    estimators, learning rate 0.07 and depth 13, trained on all cores. Prints
    the hold-out MSE and MAPE (as a percentage), writes the model to
    ``xgbm_model.pkl`` with joblib, then prints the feature importances
    normalised to percentages.

    Args:
        df: DataFrame containing the input columns and ``price_created_at``.
        target_cols: Labels of the columns used as model inputs.

    Returns:
        None.
    """
    features = df[target_cols]
    prices = df['price_created_at']
    features_fit, features_eval, prices_fit, prices_eval = train_test_split(
        features, prices, test_size=0.3, random_state=RANDOM_STATE
    )

    booster = XGBRegressor(
        n_estimators=650,
        learning_rate=0.07,
        max_depth=13,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )
    booster.fit(features_fit, prices_fit)
    predicted = booster.predict(features_eval)

    # Hold-out metrics.
    squared_error = mean_squared_error(prices_eval, predicted)
    print(f"XGBM Mean Squared Error: {squared_error}")
    percentage_error = mean_absolute_percentage_error(prices_eval, predicted) * 100
    print(f"XGBM Mean Absolute Percentage Error: {percentage_error}")

    # Persist the fitted model next to the notebook.
    output_path = "xgbm_model.pkl"
    joblib.dump(booster, output_path)
    print(f"Model saved to {output_path}")

    # Importances rescaled so they sum to 100.
    ranking = pd.DataFrame(
        {'feature': features.columns, 'importance': booster.feature_importances_}
    ).assign(
        importance_percent=lambda tbl: 100 * tbl['importance'] / tbl['importance'].sum()
    )
    print("XGBM Feature Importances (%):")
    print(ranking.sort_values('importance_percent', ascending=False))



#target_cols = ['property_area', 'room_cnt', 'small_room_cnt', 'balcony_area', 'elevator_type',
#               'city','property_subtype','garden_access','heating_type','orientation',
#               'property_floor', 'building_floor_count', 'property_condition_type',
#               'created_at']


---

> Tanítási start

In [37]:
# Model inputs: every column of `mldf` except the regression target.
# Other columns that were excluded in earlier experiments ('meroszam',
# 'active_days', 'ad_view_cnt', 'city') can be removed by chaining further
# .drop(...) calls here.
target_cols = mldf.columns.drop('price_created_at')


In [40]:
##### Energy vampire
##### WATCH OUT: YOU WILL FRY AN EGG ON THE CPU
# The grid search below fits 27 parameter combinations x 3 CV folds with
# n_jobs=-1 (all cores), so this cell is expensive to run.
train_eval_linreg(mldf,target_cols)
train_eval_gbm_gridsearch(mldf,target_cols)
#train_eval_gbm(mldf,target_cols)

LR Mean Squared Error: 33.412041848478125
LR Mean Absolute Percentage Error: 24.236671693156836
Linear Regression Coefficient Importances (%):
                    feature  coefficient  importance_percent
1          property_subtype    -7.672299           51.690720
9             elevator_type     2.313606           15.587498
2   property_condition_type     1.464698            9.868133
8              heating_type     0.738929            4.978397
10                 room_cnt     0.587743            3.959812
11           small_room_cnt     0.517911            3.489333
3            property_floor     0.412999            2.782503
13            property_area     0.300867            2.027035
4      building_floor_count    -0.204648            1.378782
5                 view_type     0.202595            1.364946
14             balcony_area     0.193370            1.302794
6               orientation    -0.166457            1.121474
0                  postcode    -0.042371            0.285469
7  

In [41]:
# Train and evaluate the XGBoost regressor on the same inputs; the fitted
# model is written to xgbm_model.pkl by the function.
train_eval_xgbm(mldf, target_cols)

XGBM Mean Squared Error: 6.816590816103792
XGBM Mean Absolute Percentage Error: 8.11237278458184
Model saved to xgbm_model.pkl
XGBM Feature Importances (%):
                    feature  importance  importance_percent
1          property_subtype    0.421970           42.197048
13            property_area    0.193639           19.363909
0                  postcode    0.068333            6.833338
9             elevator_type    0.056107            5.610714
2   property_condition_type    0.055509            5.550861
8              heating_type    0.039760            3.976017
14             balcony_area    0.031698            3.169837
12               created_at    0.023989            2.398914
10                 room_cnt    0.019283            1.928350
15                 meroszam    0.017972            1.797171
3            property_floor    0.016996            1.699561
11           small_room_cnt    0.016762            1.676162
7             garden_access    0.013190            1.319036
5  