In [211]:
### Content

# 1- Libraries
# 2- API Integration
# 3- Download the Files
# 4- Read the Files
# 5.Save Original Data

# 6- Exploratory Data Analysis
# 6.1- Missing Data
# 6.2- Statistical Analysis
# 6.2- Histograms
# 6.3- Distributions
# 6.4- Outliers

# 7- Data Cleaning
# 7.1- Handling with Date Columns
# 7.2- Drop Unnecessary Columns
# 7.3- Encoding
# 7.4- Handling Missing Data

# 7.5 Drop Outliers 1
# 7.6 Check Outliers Test 1

# 7.7 Drop Outliers 2
# 7.8 Check Outliers Test 2

# 7.9 Drop Outliers 3
# 7.10 Check Outliers Test 3

# 7.11 Drop Outliers 4
# 7.12 Check Outliers Test 4    

# 7.13 Data Normalization
# 7.14 Check Distributions 2
# 7.15 Correlation Test

# 8.1- Drop Outliers
# 8.1.1- Balance
# 8.1.2- EstimatedSalary
# 8.2- Standart Scaler
# 5.3 Drop Unnecessary Columns 





# 7- Building Model-0 // Benchmark
# 7.1- Logistic Regression




# 9- Building Alternative Models
# 9.1- Model 1 - Decision Trees
# 9.2- Model 2 - Random Forest
# 9.3- Model 3 - Gradient Boosting Machines
# 9.4- Model 4 - xgboost

# 10- Handling Imbalanced Data
# 11- Re-Train Best Performed Model with Balanced Dataset 

# 12- Feature Engineering
# 12.1 Correlation Test
# 12.2 Recursive Feature Elimination

# 13- Hyperparameter Tuning
# 13.1- Grid Search
# 13.2- Random Search

# 14- Cross Validation
# 15- Prediction

### 1- Libraries

In [212]:
### 1- Libraries

# Data Manipulation and Cleaning Libraries
import pandas as pd  # For data manipulation and data frames
import numpy as np  # For numerical operations and arrays

# File and System Operations Libraries
import os  # For operating system interactions, like file handling
import sys
import zipfile  # For working with zip files

# Dataset Access Libraries
from kaggle.api.kaggle_api_extended import KaggleApi  # For accessing datasets from Kaggle
from config import *  # Importing custom configurations
from model_eval_func import *  # Custom model evaluation functions

# Data Visualization Libraries
import seaborn as sns  # For advanced data visualization
from matplotlib import pyplot as plt  # For plotting graphs and charts
import matplotlib  # For customizing matplotlib settings


import math
from scipy import stats

# Machine Learning Libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Visible All Rows and Columns 
pd.set_option("display.max_rows", None, "display.max_columns", None)


### 2- API

In [213]:
# Specify the correct file paths
raw_data_directory = path_1
extracted_path     = path_2

In [214]:
# Kaggle API
api = KaggleApi()
api.authenticate()

competition = comp
api.competition_download_files(competition, path=raw_data_directory)


### 3- Download the Files

In [215]:
# Download the file
zip_file_path = os.path.join(raw_data_directory, 'sberbank-russian-housing-market.zip')  # Burada dosya adını doğru şekilde belirtin

# Extract the file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(raw_data_directory)

print(f"Zip Files Saved...")

Zip Files Saved...


### 4- Read the Files

In [216]:
# Klasörün içindeki dosyaları listeleme
if os.path.exists(raw_data_directory) and os.path.isdir(raw_data_directory):
    files = os.listdir(raw_data_directory)
    print("Files in the Raw Folder:")
    print(" ")
    for file in files:
        print(file)
else:
    print("Belirtilen klasör yok veya bir dizin değil.")

Files in the Raw Folder:
 
data_dictionary.txt
extracted
macro.csv.zip
sample_submission.csv.zip
sberbank-russian-housing-market.zip
test.csv.zip
train.csv.zip


In [217]:
# Create a directory for the extracted files
os.makedirs(extracted_path, exist_ok=True)

# List files in the directory
files = os.listdir(path_1)

# Extract zip files
for file in files:
    if file.endswith('.zip'):
        file_path = os.path.join(path_1, file)
        
        # Open the zip file
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(extracted_path)

print("Extraction complete.")


Extraction complete.


In [218]:
# List files in the extracted directory
files = os.listdir(extracted_path)

print("Dataframes: ")
print("")

# Read only the test and train CSV files and save them as variables
for file in files:
    if file.endswith('.csv') and ('test' in file or 'train' in file):
        file_path = os.path.join(extracted_path, file)
        print(f"Reading file: {file}")
        
        # Check if file exists
        if not os.path.exists(file_path):
            print(f"File does not exist: {file}")
            continue
        
        # Read the CSV file
        try:
            df = pd.read_csv(file_path, encoding='utf-8')
        except UnicodeDecodeError:
            try:
                df = pd.read_csv(file_path, encoding='latin1')
            except pd.errors.EmptyDataError:
                print(f"EmptyDataError: No columns to parse from file {file}")
                continue
            except Exception as e:
                print(f"Unexpected error while reading {file}: {e}")
                continue
        except pd.errors.EmptyDataError:
            print(f"EmptyDataError: No columns to parse from file {file}")
            continue
        except Exception as e:
            print(f"Unexpected error while reading {file}: {e}")
            continue

        # Determine variable name using the file name
        var_name = os.path.splitext(file)[0] + '_df'
        
        # Save the DataFrame using globals() function
        globals()[var_name] = df
        
        print(f"{var_name} saved...")

Dataframes: 

Reading file: test.csv
test_df saved...
Reading file: train.csv
train_df saved...


### 5- Save Original Data

In [219]:
# Combine dataframes
df = pd.concat([train_df, test_df])

In [220]:
# Copy the original dataframes
df_2 = df.copy()

### 6- Exploratory Data Analysis

In [221]:
# Drop id column
df_2 = df_2.drop("id", axis=1)

In [222]:
# Check datatype and blanks
df_2.info(verbose=True, show_counts=True)
df_2.head(2)

<class 'pandas.core.frame.DataFrame'>
Index: 38133 entries, 0 to 7661
Data columns (total 291 columns):
 #    Column                                 Non-Null Count  Dtype  
---   ------                                 --------------  -----  
 0    timestamp                              38133 non-null  object 
 1    full_sq                                38133 non-null  float64
 2    life_sq                                30574 non-null  float64
 3    floor                                  37966 non-null  float64
 4    max_floor                              28561 non-null  float64
 5    material                               28561 non-null  float64
 6    build_year                             23479 non-null  float64
 7    num_room                               28561 non-null  float64
 8    kitch_sq                               28561 non-null  float64
 9    state                                  23880 non-null  float64
 10   product_type                           38100 non-null  object 

Unnamed: 0,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,sub_area,area_m,raion_popul,green_zone_part,indust_part,children_preschool,preschool_quota,preschool_education_centers_raion,children_school,school_quota,school_education_centers_raion,school_education_centers_top_20_raion,hospital_beds_raion,healthcare_centers_raion,university_top_20_raion,sport_objects_raion,additional_education_raion,culture_objects_top_25,culture_objects_top_25_raion,shopping_centers_raion,office_raion,thermal_power_plant_raion,incineration_raion,oil_chemistry_raion,radiation_raion,railroad_terminal_raion,big_market_raion,nuclear_reactor_raion,detention_facility_raion,full_all,male_f,female_f,young_all,young_male,young_female,work_all,work_male,work_female,ekder_all,ekder_male,ekder_female,0_6_all,0_6_male,0_6_female,7_14_all,7_14_male,7_14_female,0_17_all,0_17_male,0_17_female,16_29_all,16_29_male,16_29_female,0_13_all,0_13_male,0_13_female,raion_build_count_with_material_info,build_count_block,build_count_wood,build_count_frame,build_count_brick,build_count_monolith,build_count_panel,build_count_foam,build_count_slag,build_count_mix,raion_build_count_with_builddate_info,build_count_before_1920,build_count_1921-1945,build_count_1946-1970,build_count_1971-1995,build_count_after_1995,ID_metro,metro_min_avto,metro_km_avto,metro_min_walk,metro_km_walk,kindergarten_km,school_km,park_km,green_zone_km,industrial_km,water_treatment_km,cemetery_km,incineration_km,railroad_station_walk_km,railroad_station_walk_min,ID_railroad_station_walk,railroad_station_avto_km,railroad_station_avto_min,ID_railroad_station_avto,public_transport_station_km,public_transport_station_min_walk,water_km,water_1line,mkad_km,ttk_km,sadovoe_km,bulvar_ring_km,kremlin_km,big_road1_km,ID_big_road1,big_road1_1line,big_road2_km,ID_big_road2,railroad_km,railroad_1line,zd_vokzaly_avto_km,ID_railroad_terminal,bus_terminal_avto_km,ID_bus_terminal,oil_chemistry_km,nuclear_reactor_km,radiation_km,power_transmission_line_km,thermal_power_plant_km,ts_km,big_market_km,market_shop_km,fitness_km,swim_pool_km,ice_rink_km,stadium_km,basketball_km,hospice_morgue_km,detention_facility_km,public_healthcare_km,university_km,workplaces_km,shopping_centers_km,office_km,additional_education_km,preschool_km,big_church_km,church_synagogue_km,mosque_km,theater_km,museum_km,exhibition_km,catering_km,ecology,green_part_500,prom_part_500,office_count_500,office_sqm_500,trc_count_500,trc_sqm_500,cafe_count_500,cafe_sum_500_min_price_avg,cafe_sum_500_max_price_avg,cafe_avg_price_500,cafe_count_500_na_price,cafe_count_500_price_500,cafe_count_500_price_1000,cafe_count_500_price_1500,cafe_count_500_price_2500,cafe_count_500_price_4000,cafe_count_500_price_high,big_church_count_500,church_count_500,mosque_count_500,leisure_count_500,sport_count_500,market_count_500,green_part_1000,prom_part_1000,office_count_1000,office_sqm_1000,trc_count_1000,trc_sqm_1000,cafe_count_1000,cafe_sum_1000_min_price_avg,cafe_sum_1000_max_price_avg,cafe_avg_price_1000,cafe_count_1000_na_price,cafe_count_1000_price_500,cafe_count_1000_price_1000,cafe_count_1000_price_1500,cafe_count_1000_price_2500,cafe_count_1000_price_4000,cafe_count_1000_price_high,big_church_count_1000,church_count_1000,mosque_count_1000,leisure_count_1000,sport_count_1000,market_count_1000,green_part_1500,prom_part_1500,office_count_1500,office_sqm_1500,trc_count_1500,trc_sqm_1500,cafe_count_1500,cafe_sum_1500_min_price_avg,cafe_sum_1500_max_price_avg,cafe_avg_price_1500,cafe_count_1500_na_price,cafe_count_1500_price_500,cafe_count_1500_price_1000,cafe_count_1500_price_1500,cafe_count_1500_price_2500,cafe_count_1500_price_4000,cafe_count_1500_price_high,big_church_count_1500,church_count_1500,mosque_count_1500,leisure_count_1500,sport_count_1500,market_count_1500,green_part_2000,prom_part_2000,office_count_2000,office_sqm_2000,trc_count_2000,trc_sqm_2000,cafe_count_2000,cafe_sum_2000_min_price_avg,cafe_sum_2000_max_price_avg,cafe_avg_price_2000,cafe_count_2000_na_price,cafe_count_2000_price_500,cafe_count_2000_price_1000,cafe_count_2000_price_1500,cafe_count_2000_price_2500,cafe_count_2000_price_4000,cafe_count_2000_price_high,big_church_count_2000,church_count_2000,mosque_count_2000,leisure_count_2000,sport_count_2000,market_count_2000,green_part_3000,prom_part_3000,office_count_3000,office_sqm_3000,trc_count_3000,trc_sqm_3000,cafe_count_3000,cafe_sum_3000_min_price_avg,cafe_sum_3000_max_price_avg,cafe_avg_price_3000,cafe_count_3000_na_price,cafe_count_3000_price_500,cafe_count_3000_price_1000,cafe_count_3000_price_1500,cafe_count_3000_price_2500,cafe_count_3000_price_4000,cafe_count_3000_price_high,big_church_count_3000,church_count_3000,mosque_count_3000,leisure_count_3000,sport_count_3000,market_count_3000,green_part_5000,prom_part_5000,office_count_5000,office_sqm_5000,trc_count_5000,trc_sqm_5000,cafe_count_5000,cafe_sum_5000_min_price_avg,cafe_sum_5000_max_price_avg,cafe_avg_price_5000,cafe_count_5000_na_price,cafe_count_5000_price_500,cafe_count_5000_price_1000,cafe_count_5000_price_1500,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
0,2011-08-20,43.0,27.0,4.0,,,,,,,Investment,Bibirevo,6407578.1,155572,0.189727,7e-05,9576,5001.0,5,10309,11065.0,5,0,240.0,1,0,7,3,no,0,16,1,no,no,no,no,no,no,no,no,86206,40477,45729,21154,11007,10147,98207,52277,45930,36211,10580,25631,9576,4899,4677,10309,5463,4846,23603,12286,11317,17508,9425,8083,18654,9709,8945,211.0,25.0,0.0,0.0,0.0,2.0,184.0,0.0,0.0,0.0,211.0,0.0,0.0,0.0,206.0,5.0,1,2.590241,1.13126,13.575119,1.13126,0.1457,0.177975,2.158587,0.600973,1.080934,23.68346,1.804127,3.633334,5.419893,65.038716,1.0,5.419893,6.905893,1,0.274985,3.299822,0.992631,no,1.422391,10.918587,13.100618,13.675657,15.156211,1.422391,1,no,3.830951,5,1.305159,no,14.231961,101,24.292406,1,18.152338,5.718519,1.210027,1.062513,5.814135,4.308127,10.814172,1.676258,0.485841,3.065047,1.107594,8.148591,3.516513,2.392353,4.248036,0.974743,6.715026,0.88435,0.648488,0.637189,0.947962,0.177975,0.625783,0.628187,3.93204,14.053047,7.389498,7.023705,0.516838,good,0.0,0.0,0,0,0,0,0,,,,0,0,0,0,0,0,0,0,0,0,0,1,0,7.36,0.0,1,30500,3,55600,19,527.78,888.89,708.33,1,10,4,3,1,0,0,1,2,0,0,6,1,14.27,6.92,3,39554,9,171420,34,566.67,969.7,768.18,1,14,11,6,2,0,0,1,2,0,0,7,1,11.77,15.97,9,188854,19,1244891,36,614.29,1042.86,828.57,1,15,11,6,2,1,0,1,2,0,0,10,1,11.98,13.55,12,251554,23,1419204,68,639.68,1079.37,859.52,5,21,22,16,3,1,0,2,4,0,0,21,1,13.09,13.31,29,807385,52,4036616,152,708.57,1185.71,947.14,12,39,48,40,9,4,0,13,22,1,0,52,4,5850000.0
1,2011-08-23,34.0,19.0,3.0,,,,,,,Investment,Nagatinskij Zaton,9589336.912,115352,0.372602,0.049637,6880,3119.0,5,7759,6237.0,8,0,229.0,1,0,6,1,yes,1,3,0,no,no,no,no,no,no,no,no,76284,34200,42084,15727,7925,7802,70194,35622,34572,29431,9266,20165,6880,3466,3414,7759,3909,3850,17700,8998,8702,15164,7571,7593,13729,6929,6800,245.0,83.0,1.0,0.0,67.0,4.0,90.0,0.0,0.0,0.0,244.0,1.0,1.0,143.0,84.0,15.0,2,0.9367,0.647337,7.62063,0.635053,0.147754,0.273345,0.55069,0.065321,0.966479,1.317476,4.655004,8.648587,3.411993,40.943917,2.0,3.641773,4.679745,2,0.065263,0.78316,0.698081,no,9.503405,3.103996,6.444333,8.13264,8.698054,2.887377,2,no,3.103996,4,0.694536,no,9.242586,32,5.706113,2,9.034642,3.489954,2.724295,1.246149,3.419574,0.72556,6.910568,3.424716,0.668364,2.000154,8.972823,6.127073,1.161579,2.543747,12.649879,1.477723,1.85256,0.686252,0.519311,0.688796,1.072315,0.273345,0.967821,0.471447,4.841544,6.829889,0.70926,2.35884,0.230287,excellent,25.14,0.0,0,0,0,0,5,860.0,1500.0,1180.0,0,1,3,0,0,1,0,0,1,0,0,0,0,26.66,0.07,2,86600,5,94065,13,615.38,1076.92,846.15,0,5,6,1,0,1,0,1,2,0,4,2,0,21.53,7.71,3,102910,7,127065,17,694.12,1205.88,950.0,0,6,7,1,2,1,0,1,5,0,4,9,0,22.37,19.25,4,165510,8,179065,21,695.24,1190.48,942.86,0,7,8,3,2,1,0,1,5,0,4,11,0,18.07,27.32,12,821986,14,491565,30,631.03,1086.21,858.62,1,11,11,4,2,1,0,1,7,0,6,19,1,10.26,27.47,66,2690465,40,2034942,177,673.81,1148.81,911.31,9,49,65,36,15,3,0,15,29,1,10,66,14,6000000.0


#### 6.1- Missing Data

In [223]:
# Count missing data
missing_data = df_2.isnull().sum()

# Missing data columns
missing_data = missing_data[missing_data > 0]

missing_data

life_sq                                   7559
floor                                      167
max_floor                                 9572
material                                  9572
build_year                               14654
num_room                                  9572
kitch_sq                                  9572
state                                    14253
product_type                                33
preschool_quota                           8284
school_quota                              8280
hospital_beds_raion                      17859
raion_build_count_with_material_info      6209
build_count_block                         6209
build_count_wood                          6209
build_count_frame                         6209
build_count_brick                         6209
build_count_monolith                      6209
build_count_panel                         6209
build_count_foam                          6209
build_count_slag                          6209
build_count_m

In [224]:
# Calculate the number of missing values
missing_data = df_2.isnull().sum()

# Calculate the percentage of missing values
missing_data_percentage = (df_2.isnull().sum() / len(df_2)) * 100

# Create a DataFrame that shows both the number and the percentage of missing values
missing_data_df = pd.DataFrame({'Missing Values': missing_data, 'Percentage': missing_data_percentage})

# Filter to show only columns where the number of missing values is greater than 0
missing_data_df = missing_data_df[missing_data_df['Missing Values'] > 0]

# Sort the DataFrame by the number of missing values in descending order
missing_data_df = missing_data_df.sort_values(by='Missing Values', ascending=False)

missing_data_df


Unnamed: 0,Missing Values,Percentage
hospital_beds_raion,17859,46.833451
cafe_sum_500_min_price_avg,16440,43.112265
cafe_avg_price_500,16440,43.112265
cafe_sum_500_max_price_avg,16440,43.112265
build_year,14654,38.428658
state,14253,37.377075
max_floor,9572,25.101618
material,9572,25.101618
num_room,9572,25.101618
kitch_sq,9572,25.101618


The dataset contains a significant amount of missing data, particularly in columns such as hospital_beds_raion (46.83%) and various cafe-related pricing metrics (around 43%). Additionally, important columns like build_year and state have over 30% missing values. This high percentage of missing data could impact the reliability and accuracy of any analysis, and it may require careful handling through imputation or exclusion strategies.

#### 6.2- Statistical Analysis

In [225]:
# Statistical Distribution
df_2.describe()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,area_m,raion_popul,green_zone_part,indust_part,children_preschool,preschool_quota,preschool_education_centers_raion,children_school,school_quota,school_education_centers_raion,school_education_centers_top_20_raion,hospital_beds_raion,healthcare_centers_raion,university_top_20_raion,sport_objects_raion,additional_education_raion,culture_objects_top_25_raion,shopping_centers_raion,office_raion,full_all,male_f,female_f,young_all,young_male,young_female,work_all,work_male,work_female,ekder_all,ekder_male,ekder_female,0_6_all,0_6_male,0_6_female,7_14_all,7_14_male,7_14_female,0_17_all,0_17_male,0_17_female,16_29_all,16_29_male,16_29_female,0_13_all,0_13_male,0_13_female,raion_build_count_with_material_info,build_count_block,build_count_wood,build_count_frame,build_count_brick,build_count_monolith,build_count_panel,build_count_foam,build_count_slag,build_count_mix,raion_build_count_with_builddate_info,build_count_before_1920,build_count_1921-1945,build_count_1946-1970,build_count_1971-1995,build_count_after_1995,ID_metro,metro_min_avto,metro_km_avto,metro_min_walk,metro_km_walk,kindergarten_km,school_km,park_km,green_zone_km,industrial_km,water_treatment_km,cemetery_km,incineration_km,railroad_station_walk_km,railroad_station_walk_min,ID_railroad_station_walk,railroad_station_avto_km,railroad_station_avto_min,ID_railroad_station_avto,public_transport_station_km,public_transport_station_min_walk,water_km,mkad_km,ttk_km,sadovoe_km,bulvar_ring_km,kremlin_km,big_road1_km,ID_big_road1,big_road2_km,ID_big_road2,railroad_km,zd_vokzaly_avto_km,ID_railroad_terminal,bus_terminal_avto_km,ID_bus_terminal,oil_chemistry_km,nuclear_reactor_km,radiation_km,power_transmission_line_km,thermal_power_plant_km,ts_km,big_market_km,market_shop_km,fitness_km,swim_pool_km,ice_rink_km,stadium_km,basketball_km,hospice_morgue_km,detention_facility_km,public_healthcare_km,university_km,workplaces_km,shopping_centers_km,office_km,additional_education_km,preschool_km,big_church_km,church_synagogue_km,mosque_km,theater_km,museum_km,exhibition_km,catering_km,green_part_500,prom_part_500,office_count_500,office_sqm_500,trc_count_500,trc_sqm_500,cafe_count_500,cafe_sum_500_min_price_avg,cafe_sum_500_max_price_avg,cafe_avg_price_500,cafe_count_500_na_price,cafe_count_500_price_500,cafe_count_500_price_1000,cafe_count_500_price_1500,cafe_count_500_price_2500,cafe_count_500_price_4000,cafe_count_500_price_high,big_church_count_500,church_count_500,mosque_count_500,leisure_count_500,sport_count_500,market_count_500,green_part_1000,prom_part_1000,office_count_1000,office_sqm_1000,trc_count_1000,trc_sqm_1000,cafe_count_1000,cafe_sum_1000_min_price_avg,cafe_sum_1000_max_price_avg,cafe_avg_price_1000,cafe_count_1000_na_price,cafe_count_1000_price_500,cafe_count_1000_price_1000,cafe_count_1000_price_1500,cafe_count_1000_price_2500,cafe_count_1000_price_4000,cafe_count_1000_price_high,big_church_count_1000,church_count_1000,mosque_count_1000,leisure_count_1000,sport_count_1000,market_count_1000,green_part_1500,prom_part_1500,office_count_1500,office_sqm_1500,trc_count_1500,trc_sqm_1500,cafe_count_1500,cafe_sum_1500_min_price_avg,cafe_sum_1500_max_price_avg,cafe_avg_price_1500,cafe_count_1500_na_price,cafe_count_1500_price_500,cafe_count_1500_price_1000,cafe_count_1500_price_1500,cafe_count_1500_price_2500,cafe_count_1500_price_4000,cafe_count_1500_price_high,big_church_count_1500,church_count_1500,mosque_count_1500,leisure_count_1500,sport_count_1500,market_count_1500,green_part_2000,prom_part_2000,office_count_2000,office_sqm_2000,trc_count_2000,trc_sqm_2000,cafe_count_2000,cafe_sum_2000_min_price_avg,cafe_sum_2000_max_price_avg,cafe_avg_price_2000,cafe_count_2000_na_price,cafe_count_2000_price_500,cafe_count_2000_price_1000,cafe_count_2000_price_1500,cafe_count_2000_price_2500,cafe_count_2000_price_4000,cafe_count_2000_price_high,big_church_count_2000,church_count_2000,mosque_count_2000,leisure_count_2000,sport_count_2000,market_count_2000,green_part_3000,prom_part_3000,office_count_3000,office_sqm_3000,trc_count_3000,trc_sqm_3000,cafe_count_3000,cafe_sum_3000_min_price_avg,cafe_sum_3000_max_price_avg,cafe_avg_price_3000,cafe_count_3000_na_price,cafe_count_3000_price_500,cafe_count_3000_price_1000,cafe_count_3000_price_1500,cafe_count_3000_price_2500,cafe_count_3000_price_4000,cafe_count_3000_price_high,big_church_count_3000,church_count_3000,mosque_count_3000,leisure_count_3000,sport_count_3000,market_count_3000,green_part_5000,prom_part_5000,office_count_5000,office_sqm_5000,trc_count_5000,trc_sqm_5000,cafe_count_5000,cafe_sum_5000_min_price_avg,cafe_sum_5000_max_price_avg,cafe_avg_price_5000,cafe_count_5000_na_price,cafe_count_5000_price_500,cafe_count_5000_price_1000,cafe_count_5000_price_1500,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
count,38133.0,30574.0,37966.0,28561.0,28561.0,23479.0,28561.0,28561.0,23880.0,38133.0,38133.0,38133.0,38133.0,38133.0,29849.0,38133.0,38133.0,29853.0,38133.0,38133.0,20274.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,31924.0,31924.0,31924.0,31924.0,31924.0,31924.0,31924.0,31924.0,31924.0,31924.0,31924.0,31924.0,31924.0,31924.0,31924.0,31924.0,38133.0,38133.0,38133.0,38074.0,38074.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38074.0,38074.0,38074.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,21693.0,21693.0,21693.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,30387.0,30387.0,30387.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,33113.0,33113.0,33113.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38114.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,35984.0,35984.0,35984.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,36960.0,36960.0,36960.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,37863.0,38133.0,38133.0,38133.0,38133.0,38133.0,37708.0,37708.0,37708.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,38133.0,30471.0
mean,54.111172,34.03346,7.667123,12.567592,1.83439,2716.785,1.900844,6.543995,2.07165,17662820.0,84752.862298,0.220199,0.119531,5183.292345,3269.047841,4.058532,5397.529253,8300.673333,4.715548,0.109328,1193.585035,1.335903,0.140351,6.719141,2.897464,0.294941,4.274985,8.440826,144219.3,66271.538353,77948.036582,11270.873758,5768.770461,5502.157816,54115.166654,27484.965804,26630.291926,19366.821887,5855.727008,13511.167886,5183.292345,2652.066347,2531.356096,5397.529253,2764.596413,2633.072772,12641.8768,6473.106574,6168.926337,30855.756353,15151.324234,15704.551438,9922.728031,5076.858836,4845.878662,334.721933,50.933718,42.586299,5.403145,110.502631,12.182747,107.755764,0.166959,4.591123,0.599549,334.386856,19.318945,28.061678,143.803032,80.708714,62.494487,72.078331,4.902702,3.682997,42.6461,3.553842,1.006149,1.347122,3.064809,0.30141,0.767431,11.435917,2.293647,11.028264,4.338967,52.067598,38.688974,4.521205,5.994829,45.493982,0.424452,5.093429,0.679687,6.329938,11.217024,13.93623,14.904962,15.92346,1.867409,11.496237,3.373048,22.202974,1.861238,17.050539,52.385047,10.077436,6.614455,17.538683,10.850669,4.435348,3.485451,7.310961,4.88134,13.31579,3.937603,1.156371,4.176315,6.12906,9.354236,4.775619,2.70221,14.557084,3.347699,6.823668,3.95383,1.499329,1.990698,1.32725,1.368545,2.321087,0.985379,7.73477,9.629857,7.047513,5.463351,0.680155,13.062945,6.431998,0.754806,14313.817061,0.562164,22313.82,3.919728,743.660153,1250.900929,997.280616,0.347599,1.000236,0.995568,0.858574,0.549734,0.138594,0.029423,0.28849,0.581072,0.00535,0.07327,0.929851,0.123305,16.866921,9.052249,3.170036,63861.99,2.003252,66690.62,15.662313,713.619105,1210.736782,962.177962,1.04007,4.214145,4.004878,3.579419,1.983662,0.779115,0.061023,0.829282,1.829334,0.019353,0.478588,2.950856,0.38232,19.222098,10.66737,7.471717,143381.1,3.764351,129312.7,32.904728,714.792999,1208.249856,961.521636,2.139643,8.337372,8.884824,7.975926,3.894239,1.476412,0.196313,2.011643,3.711615,0.037894,0.960533,5.916713,0.771327,20.904551,11.296266,13.614743,253202.1,6.038261,216199.0,55.860305,721.346343,1213.447266,967.396913,3.657436,13.787507,15.460939,13.447172,6.760365,2.362888,0.383998,3.31301,6.291611,0.088401,1.949571,9.940734,1.174127,22.858064,11.117434,30.03299,559462.7,11.998322,446114.0,112.806703,762.058612,1277.182466,1019.620697,7.408518,28.273884,30.955813,27.128235,13.582383,4.739491,0.71838,6.247319,12.494873,0.197257,3.962028,20.500669,2.348386,22.848842,10.499062,73.279338,1441316.0,30.502583,1191453.0,270.951748,765.654089,1279.640664,1022.647295,18.20161,67.563213,74.815357,64.687934,32.80568,11.05882,1.819133,15.387853,30.825741,0.436394,8.847901,53.487635,6.056119,7123035.0
std,35.171162,47.581529,5.276156,6.730496,1.490923,130852.1,0.84762,27.57163,0.864795,20950340.0,57576.253327,0.175476,0.120275,3807.969978,2170.030191,2.989401,3975.654268,4303.306519,3.449838,0.332874,1059.940646,1.487142,0.447871,6.592494,3.300857,1.520118,4.731001,23.779358,280478.4,128306.694764,152221.682821,8262.762652,4262.120905,4008.752854,37298.508493,18844.927634,18553.728684,13096.829822,4049.579361,9091.026813,3807.969978,1967.933042,1843.332364,3975.654268,2047.488987,1934.508375,9223.032614,4753.942488,4477.405002,59817.648041,29031.861131,30811.873687,7268.372894,3749.139961,3526.993395,278.589058,46.682757,127.837822,15.942031,129.922862,19.353596,88.052669,1.141112,13.242279,1.55496,278.585576,61.528402,63.296361,124.39747,57.434918,115.044941,55.04902,6.473965,5.837937,70.811055,5.900921,2.270515,3.159086,3.915439,0.293917,0.742974,7.056974,1.492882,6.869213,3.847041,46.164489,28.156479,3.889924,4.678139,34.690103,1.352674,16.232088,0.431069,5.228046,8.124999,8.397892,8.528672,8.4967,1.288391,11.26566,1.878294,17.087161,1.989607,10.315603,35.186425,7.420582,3.750596,10.45315,7.35604,5.678282,4.64322,5.690906,4.862071,8.473205,3.477093,1.54205,4.745467,4.790442,7.626957,5.493552,3.077801,11.116408,4.362089,6.785484,5.058338,2.341223,2.348471,1.530176,3.154612,3.02935,0.793365,5.731418,7.155372,5.795572,5.338443,0.843987,15.931179,13.222467,2.28626,42767.053838,1.239726,82922.4,11.333708,339.947751,524.273964,430.794777,1.366758,3.075474,2.280867,2.328207,2.147558,0.685881,0.180691,1.190161,2.176353,0.072947,0.389172,1.65112,0.396877,14.491153,12.096678,8.984913,145918.2,3.25736,150743.6,47.871449,227.496747,348.767679,286.998552,3.180333,13.943593,9.247377,10.54778,7.546792,3.978683,0.333574,2.505845,5.140197,0.137765,1.8312,3.330984,0.733166,14.241852,11.091944,21.153419,303873.7,4.487177,213532.9,97.95798,194.214259,296.117863,244.095781,6.002795,25.03103,22.245893,24.273068,13.569744,6.768311,0.911171,6.522144,9.80576,0.190942,3.36983,5.751337,1.115932,14.147591,9.710904,39.031688,509297.8,6.333235,292711.3,162.642911,199.039852,304.424118,250.715632,10.283807,39.392289,39.523087,39.465051,23.35879,9.785232,1.699635,10.077526,15.615366,0.283881,7.233659,9.230319,1.42919,13.942018,8.023349,82.010526,1066396.0,10.953158,471939.8,292.211339,218.393935,344.438116,280.926974,18.377905,72.338931,70.01069,71.382501,41.579568,16.997527,2.785199,15.572601,26.024078,0.447209,13.276473,18.889658,2.259046,11.15718,5.765588,156.854309,2323485.0,24.034633,1006871.0,518.994141,150.081066,233.09048,191.403457,33.53923,127.0037,127.251385,125.214092,74.104439,28.636604,5.469808,29.452128,47.850168,0.609313,20.772155,46.584733,4.904623,4780111.0
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2081628.0,2546.0,0.001879,0.0,175.0,0.0,0.0,168.0,1012.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2546.0,1208.0,1341.0,365.0,189.0,177.0,1633.0,863.0,771.0,548.0,156.0,393.0,175.0,91.0,85.0,168.0,87.0,82.0,411.0,214.0,198.0,575.0,308.0,253.0,322.0,166.0,156.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.00047,0.0,0.003737,0.0,0.0,0.274139,0.0,0.198071,0.028153,0.337833,1.0,0.028153,0.035191,1.0,0.002804,0.033646,0.0,0.003259,0.001935,0.000315,0.001947,0.072897,0.000364,1.0,0.001935,1.0,0.002299,0.136715,5.0,0.062026,1.0,0.51071,0.309811,0.002409,0.020103,0.236246,0.0,0.661366,0.003847,0.0,0.0,0.0,0.114746,0.001054,0.002517,0.041227,0.0,0.000282,0.0,0.0,0.0,0.0,0.0,0.004072,0.0,0.005545,0.026788,0.007903,0.000638,0.000357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,300.0,500.0,400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,300.0,500.0,400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,300.0,500.0,400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,300.0,500.0,400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.0,300.0,500.0,400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.52,0.21,0.0,0.0,0.0,0.0,0.0,300.0,500.0,400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100000.0
25%,38.9,20.0,3.0,9.0,1.0,1966.0,1.0,1.0,1.0,7307411.0,26943.0,0.065409,0.017647,1792.0,1874.0,2.0,1660.0,5782.0,2.0,0.0,350.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,31167.0,14906.0,15167.0,3639.0,1782.0,1905.0,16577.0,8329.0,7915.0,6297.0,1929.0,4168.0,1792.0,862.0,914.0,1660.0,821.0,871.0,3959.0,1973.0,2067.0,5829.0,2955.0,2874.0,3255.0,1600.0,1720.0,186.0,14.0,0.0,0.0,10.0,2.0,41.0,0.0,0.0,0.0,186.0,0.0,0.0,30.0,38.0,14.0,26.0,1.719313,1.036568,11.44385,0.953654,0.201211,0.273073,0.964876,0.103223,0.286587,5.386802,1.282383,6.351479,1.94454,23.334484,18.0,2.104573,3.23068,19.0,0.102853,1.234239,0.335591,2.647396,5.155608,8.197311,9.113764,10.290821,0.768375,2.0,2.091603,4.0,0.623775,9.909891,32.0,5.211726,3.0,8.750185,5.116159,1.235218,0.980711,3.723335,1.9965,7.610654,1.540511,0.362518,1.664932,3.031389,4.018205,1.301109,1.116103,5.571882,1.26644,2.18044,1.006486,0.475924,0.549295,0.48584,0.28985,0.846876,0.541706,3.79709,4.183495,2.826543,2.182771,0.204889,1.48,0.0,0.0,0.0,0.0,0.0,0.0,500.0,1000.0,750.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.22,0.0,0.0,0.0,0.0,0.0,1.0,550.0,1000.0,750.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.47,1.52,0.0,0.0,1.0,0.0,2.0,585.71,1000.0,796.05,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,10.3,3.12,0.0,0.0,1.0,14000.0,3.0,610.71,1000.0,825.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,12.35,4.21,1.0,3342.0,2.0,41100.0,9.0,649.35,1100.0,873.68,0.0,1.0,3.0,2.0,1.0,0.0,0.0,1.0,3.0,0.0,0.0,5.0,0.0,14.81,6.07,2.0,111700.0,7.0,262000.0,20.0,672.62,1145.95,909.38,1.0,5.0,8.0,6.0,2.0,1.0,0.0,2.0,9.0,0.0,0.0,11.0,1.0,4740002.0
50%,50.0,30.0,7.0,12.0,1.0,1980.0,2.0,6.0,2.0,10207220.0,83844.0,0.167526,0.072158,4926.0,2881.0,4.0,5285.0,7327.0,5.0,0.0,990.0,1.0,0.0,5.0,2.0,0.0,3.0,2.0,85083.0,39227.0,45410.0,10988.0,5470.0,5347.0,52560.0,26382.0,26333.0,20184.0,6180.0,13540.0,4926.0,2549.0,2390.0,5285.0,2693.0,2592.0,12508.0,6096.0,6321.0,17662.0,8896.0,9174.0,9633.0,4835.0,4702.0,282.0,43.0,1.0,0.0,68.0,6.0,92.0,0.0,0.0,0.0,282.0,0.0,2.0,139.0,71.0,24.0,53.0,2.769542,1.770572,20.16632,1.680527,0.358988,0.480208,1.787349,0.216846,0.571575,10.77952,1.954585,10.36799,3.213866,38.566389,33.0,3.363535,4.875792,34.0,0.162834,1.95401,0.602777,5.470993,9.692994,12.611656,13.490592,14.726279,1.721834,9.0,3.191018,21.0,1.218105,14.565736,50.0,7.568892,8.0,16.735588,8.840886,2.405544,1.887114,5.827496,3.855476,11.910442,2.887683,0.64543,2.799889,5.505685,6.947652,2.785516,1.921314,11.290192,2.287201,4.231661,1.986217,0.829028,1.052999,0.889735,0.502807,1.460192,0.870802,6.543563,8.6064,5.556948,4.064161,0.408914,8.07,0.0,0.0,0.0,0.0,0.0,1.0,668.75,1166.67,916.67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.86,4.06,0.0,0.0,1.0,8499.0,4.0,675.0,1150.0,916.05,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,15.12,7.78,1.0,19000.0,3.0,53350.0,10.0,693.33,1166.67,928.0,1.0,2.0,3.0,2.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,5.0,0.0,17.77,8.92,2.0,63808.0,5.0,117300.0,18.0,686.96,1162.16,923.53,1.0,4.0,6.0,4.0,1.0,0.0,0.0,1.0,3.0,0.0,0.0,9.0,1.0,20.41,9.91,5.0,154757.0,11.0,299547.0,42.0,707.765,1205.88,961.11,3.0,10.0,14.0,10.0,3.0,1.0,0.0,2.0,6.0,0.0,1.0,19.0,2.0,19.89,9.49,16.0,467146.0,31.0,1100073.0,110.0,725.2,1220.085,973.25,9.0,29.0,37.0,25.0,9.0,2.0,0.0,7.0,16.0,0.0,2.0,48.0,5.0,6274411.0
75%,63.0,43.0,11.0,17.0,2.0,2006.0,2.0,9.0,3.0,18036440.0,122862.0,0.338151,0.195781,7103.0,3953.0,6.0,7287.0,9891.0,7.0,0.0,1786.0,2.0,0.0,10.0,4.0,0.0,6.0,5.0,125111.0,58226.0,66338.0,15541.0,7789.0,7629.0,77062.0,38782.0,37556.0,29431.0,8775.0,20165.0,7103.0,3589.0,3484.0,7287.0,3623.0,3680.0,17425.0,8746.0,8619.0,27006.0,13683.0,14145.0,13682.0,6880.0,6721.0,400.0,72.0,11.0,1.0,156.0,13.0,157.0,0.0,2.0,0.0,400.0,4.0,22.0,216.0,125.0,57.0,108.0,4.788853,3.776836,43.598508,3.633209,0.96494,0.899203,3.30426,0.420417,1.040324,16.95335,3.018004,13.57905,5.053466,60.641589,53.0,5.301995,7.222037,73.0,0.279753,3.357039,0.942503,8.398551,15.524863,18.465068,19.603297,20.549464,2.806196,14.0,4.292973,38.0,2.416356,24.061214,83.0,13.780356,9.0,23.463447,16.37251,4.72201,4.79895,9.797933,5.513545,16.658272,5.439734,1.31318,5.246802,7.935177,13.430703,6.364524,3.340248,24.536255,3.934432,9.358961,5.372054,1.477304,2.781995,1.555985,0.936324,2.818066,1.253933,9.962969,13.459593,10.225384,6.759573,0.7993,18.98,5.99,0.0,0.0,1.0,1465.0,3.0,977.78,1500.0,1250.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,23.97,13.16,2.0,57636.0,3.0,67000.0,11.0,841.67,1400.0,1125.0,1.0,3.0,4.0,3.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,4.0,1.0,26.63,15.67,4.0,122709.0,5.0,157492.0,23.0,824.07,1375.0,1097.22,2.0,6.0,8.0,6.0,2.0,0.0,0.0,1.0,3.0,0.0,1.0,9.0,1.0,28.37,16.38,7.0,207193.0,9.0,291320.0,38.0,796.43,1330.19,1063.16,3.0,10.0,13.0,9.0,3.0,1.0,0.0,2.0,5.0,0.0,1.0,15.0,2.0,30.13,16.09,17.0,519477.0,17.0,670420.0,79.0,813.57,1333.33,1082.86,6.0,22.0,26.0,18.0,7.0,2.0,0.0,5.0,11.0,0.0,2.0,29.0,4.0,31.51,14.24,55.0,1505344.0,43.0,1723051.0,226.0,820.7375,1354.17,1091.67,15.0,61.0,70.0,51.0,22.0,5.0,1.0,12.0,28.0,1.0,7.0,76.0,11.0,8300000.0
max,5326.0,7478.0,77.0,117.0,6.0,20052010.0,19.0,2014.0,33.0,206071800.0,247469.0,0.852923,0.521867,19223.0,11926.0,13.0,19083.0,24750.0,14.0,2.0,4849.0,6.0,3.0,29.0,16.0,10.0,23.0,141.0,1716730.0,774585.0,942145.0,40692.0,20977.0,19715.0,161290.0,79622.0,81668.0,57086.0,19275.0,37811.0,19223.0,9987.0,9236.0,19083.0,9761.0,9322.0,45170.0,23233.0,21937.0,367659.0,172958.0,194701.0,36035.0,18574.0,17461.0,1681.0,223.0,793.0,97.0,664.0,127.0,431.0,11.0,84.0,9.0,1680.0,371.0,382.0,845.0,246.0,799.0,224.0,65.101125,74.905763,711.215806,59.267984,29.085774,47.394706,47.351538,2.036755,14.048162,47.59124,15.77871,58.63205,24.653041,295.836488,135.0,24.653983,38.691921,138.0,17.413002,208.956021,2.827709,53.277832,66.0332,68.853047,69.984874,70.738769,6.995416,48.0,13.798346,58.0,17.387119,91.215063,121.0,74.796111,14.0,70.413434,64.256957,53.890157,43.324371,56.856147,54.080913,59.501648,41.103651,26.652505,53.359294,46.037198,83.398514,56.703794,43.69464,89.371374,76.055135,84.862148,55.278225,26.259543,19.413195,24.268209,47.394706,45.66906,15.615728,44.849832,87.60069,59.203148,54.431244,12.162697,100.0,99.17,34.0,611015.0,8.0,1500000.0,120.0,4000.0,6000.0,5000.0,13.0,38.0,39.0,29.0,22.0,14.0,3.0,11.0,17.0,1.0,9.0,11.0,4.0,100.0,72.2,92.0,2244723.0,20.0,1500000.0,449.0,3250.0,5000.0,4125.0,28.0,112.0,107.0,104.0,79.0,40.0,7.0,27.0,38.0,1.0,30.0,25.0,6.0,91.41,63.0,173.0,2951861.0,27.0,1533000.0,784.0,2500.0,4000.0,3250.0,54.0,195.0,177.0,183.0,127.0,58.0,12.0,44.0,75.0,1.0,44.0,37.0,7.0,80.18,56.1,250.0,3602982.0,37.0,2448300.0,1168.0,2166.67,3500.0,2833.33,71.0,283.0,276.0,280.0,173.0,83.0,17.0,70.0,108.0,1.0,55.0,54.0,8.0,74.02,45.1,493.0,6106112.0,66.0,2654102.0,1815.0,1833.33,3000.0,2416.67,120.0,449.0,442.0,446.0,266.0,113.0,23.0,102.0,164.0,2.0,85.0,100.0,10.0,75.46,28.59,789.0,12702110.0,120.0,4585477.0,2645.0,1875.0,3000.0,2437.5,174.0,650.0,648.0,643.0,377.0,147.0,30.0,151.0,250.0,2.0,106.0,218.0,21.0,111111100.0


### Observations:

Standard Deviation Issues: 
The data has high standard deviations, indicating that the data is not normally distributed.

Unit Differences: 
Different columns are measured in different units, leading to inconsistencies.

Scaling and Transformation Needed: 
The data needs to be scaled and transformed to be suitable for modeling.

#### 6.3- Histograms

In [226]:
# List all the numeric columns
numeric_columns = df_2.select_dtypes(include=['float64', 'int64']).columns

In [227]:
'''
# Define a function to plot histograms in groups
def plot_histograms(df_2, cols, n_cols=3, figsize=(20, 15)):
    n_rows = (len(cols) + n_cols - 1) // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
    axes = axes.flatten()
    
    for i, col in enumerate(cols):
        df[col].hist(bins=30, ax=axes[i])
        axes[i].set_title(col)
    
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])
    
    plt.tight_layout()
    plt.show()

# Plot histograms in groups of 10 columns
for i in range(0, len(numeric_columns), 10):
    plot_histograms(df_2, numeric_columns[i:i+10])
'''
print('')





#### 6.3- Check Distributions 1

In [228]:
'''
# Check the visualization
df_sampled = df_2.sample(frac=0.1)  # Verinin %10'unu alıyoruz
num_plots = len(numeric_columns)
plots_per_page = 16  # 4x4 yerleşim
num_pages = math.ceil(num_plots / plots_per_page)

for page in range(num_pages):
    plt.figure(figsize=(15, 15))
    
    start_idx = page * plots_per_page
    end_idx = min(start_idx + plots_per_page, num_plots)
    
    for i, column in enumerate(numeric_columns[start_idx:end_idx], 1):
        plt.subplot(4, 4, i)
        sns.histplot(df_sampled[column], kde=False)
        plt.title(column)
    
    plt.tight_layout()
    plt.show()
'''
print('')





The histograms above indicate that the distributions of the variables are heavily skewed and not normally distributed.

#### 6.4- Outliers Checking

In [229]:
'''
# Sütunları küçük gruplar halinde bölmek için adım boyutunu belirleyin
step = 10  # Aynı anda kaç sütun gösterileceğini belirler
num_columns = len(df_2.columns)

for i in range(0, num_columns, step):
    plt.figure(figsize=(12, 6))  # Grafik boyutunu ayarlayın
    sns.boxplot(data=df_2.iloc[:, i:i+step])
    plt.xticks(rotation=45, fontsize=8)
    plt.show()
'''
print('')




### 7- Data Cleaning

In [230]:
# Copy the dataframe
df_3 = df_2.copy()

#### 7.1- Handling with ID Columns

In [231]:
# List the date columns
date_columns = ['timestamp']  # Add any other date columns here

# Convert all date columns to retain only the year information
for col in date_columns:
    df_3[col + '_year'] = pd.to_datetime(df_3[col]).dt.year

# Optionally, drop the original date columns
df_3.drop(columns=date_columns, inplace=True)

#### 7.2- Drop Unnecessary Columns

In [232]:
# Drop all columns that contain 'ID' in their column name
df_4 = df_3.loc[:, ~df_3.columns.str.contains('ID')]

#### 7.3- Encoding

In [233]:
# Copy the dataset
df_5 = df_4.copy()

# Identify categorical columns
categorical_columns = df_5.select_dtypes(include=['object']).columns

# Apply Label Encoding to each categorical column
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df_5[col] = le.fit_transform(df_5[col])
    label_encoders[col] = le  # Store the label encoder for inverse transform later if needed

df_5.head(1)
df_5.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 38133 entries, 0 to 7661
Data columns (total 284 columns):
 #    Column                                 Non-Null Count  Dtype  
---   ------                                 --------------  -----  
 0    full_sq                                38133 non-null  float64
 1    life_sq                                30574 non-null  float64
 2    floor                                  37966 non-null  float64
 3    max_floor                              28561 non-null  float64
 4    material                               28561 non-null  float64
 5    build_year                             23479 non-null  float64
 6    num_room                               28561 non-null  float64
 7    kitch_sq                               28561 non-null  float64
 8    state                                  23880 non-null  float64
 9    product_type                           38133 non-null  int32  
 10   sub_area                               38133 non-null  int32  

#### 7.4 Handling Missing Data

In [234]:
# Copy the dataset
df_6 = df_5.copy()

In [235]:
# Eksik değerleri kolonların ortalaması ile doldurun
df_6 = df_6.fillna(df_6.mean())

#### 7.5 Drop Outliers 1

In [236]:
# Copy the dataframe
df_7 = df_6.copy()

In [237]:
import scipy.stats as stats

def winsorize_outliers(df, limits):
    df_winsorized = df.copy()
    for column in df.select_dtypes(include=['float64', 'int64']).columns:
        df_winsorized[column] = stats.mstats.winsorize(df[column], limits=[limits, limits])
    return df_winsorized

# Outlier'ları sınırlandırın (alt ve üst %5'lik dilimi)
df_7 = winsorize_outliers(df_7, limits=0.2)

#### 7.6 Check Outliers Test 1

In [238]:
'''
# Sütunları küçük gruplar halinde bölmek için adım boyutunu belirleyin
step = 10  # Aynı anda kaç sütun gösterileceğini belirler
num_columns = len(df_7.columns)

for i in range(0, num_columns, step):
    plt.figure(figsize=(12, 6))  # Grafik boyutunu ayarlayın
    sns.boxplot(data=df_2.iloc[:, i:i+step])
    plt.xticks(rotation=45, fontsize=8)
    plt.show()
'''
print('')




#### 7.7 Drop Outliers 2

In [239]:

import pandas as pd
from scipy import stats

def remove_outliers_zscore(df, threshold):
    df_out = df.copy()  # Orijinal veri setini korumak için bir kopya alın
    for column in df_out.select_dtypes(include=['float64', 'int64']).columns:
        # Z-score'u hesapla ve doğrudan DataFrame üzerinde filtreleme yap
        df_out = df_out[(abs(stats.zscore(df_out[column].dropna())) < threshold) | df_out[column].isna()]
    return df_out

In [240]:
# Outlier'ları Z-score yöntemiyle temizleyin
df_8 = remove_outliers_zscore(df_6, threshold=5)

#### 7.8 Check Outliers Test 2

In [241]:
'''
# Sütunları küçük gruplar halinde bölmek için adım boyutunu belirleyin
step = 10  # Aynı anda kaç sütun gösterileceğini belirler
num_columns = len(df_8.columns)

for i in range(0, num_columns, step):
    plt.figure(figsize=(12, 6))  # Grafik boyutunu ayarlayın
    sns.boxplot(data=df_2.iloc[:, i:i+step])
    plt.xticks(rotation=45, fontsize=8)
    plt.show()
'''
print('')




#### 7.9 Drop Outliers 3

In [242]:
# Cleaning Function 
def boxplot_outliers_treatment(df):
    df_out = df.copy()
    
    for column in df_out.select_dtypes(include=['float64', 'int64']).columns:
        # IQR hesaplama
        Q1 = df_out[column].quantile(0.25)
        Q3 = df_out[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Aykırı değerleri sınırlandırma (Winsorization)
        df_out[column] = df_out[column].apply(lambda x: lower_bound if x < lower_bound else upper_bound if x > upper_bound else x)
        
        # Alternatif: Aykırı değerleri medyan ile doldurma
        # median_value = df_out[column].median()
        # df_out[column] = df_out[column].apply(lambda x: median_value if x < lower_bound or x > upper_bound else x)
        
    return df_out

In [243]:
# Outlier'ları Boxplot ve IQR kullanarak düzeltin
df_9 = boxplot_outliers_treatment(df_6)

In [244]:
df_9.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38133 entries, 0 to 7661
Columns: 284 entries, full_sq to timestamp_year
dtypes: float64(258), int32(16), int64(10)
memory usage: 80.6 MB


#### 7.10 Check Outliers Test 3

In [245]:
'''
# Sütunları küçük gruplar halinde bölmek için adım boyutunu belirleyin
step = 10  # Aynı anda kaç sütun gösterileceğini belirler
num_columns = len(df_9.columns)

for i in range(0, num_columns, step):
    plt.figure(figsize=(12, 6))  # Grafik boyutunu ayarlayın
    sns.boxplot(data=df_2.iloc[:, i:i+step])
    plt.xticks(rotation=45, fontsize=8)
    plt.show()
'''
print('')




#### 7.11 Drop Outliers 4

In [246]:
def mark_outliers_zscore(df, threshold=3):
    df_out = df.copy()  # Orijinal veri setini korumak için bir kopya alın
    for column in df.select_dtypes(include=['float64', 'int64']).columns:
        z_scores = stats.zscore(df[column].dropna())  # NaN değerleri hesaplamadan önce çıkar
        abs_z_scores = abs(z_scores)
        # Sadece outlier olan değerleri NaN olarak işaretleyin
        df_out.loc[abs_z_scores > threshold, column] = None
    return df_out

# Outlier'ları NaN olarak işaretleyin
df_outliers_marked = mark_outliers_zscore(df_6)

# Outlier'ları içeren satırları düşürmek için tüm satırdaki NaN değerlerini temizleyin
df_10 = df_outliers_marked.dropna()

In [247]:
df_10.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16692 entries, 1 to 7660
Columns: 284 entries, full_sq to timestamp_year
dtypes: float64(268), int32(16)
memory usage: 35.3 MB


#### 7.12 Check Outliers Test 4

In [248]:
'''
# Sütunları küçük gruplar halinde bölmek için adım boyutunu belirleyin
step = 10  # Aynı anda kaç sütun gösterileceğini belirler
num_columns = len(df_10.columns)

for i in range(0, num_columns, step):
    plt.figure(figsize=(12, 6))  # Grafik boyutunu ayarlayın
    sns.boxplot(data=df_2.iloc[:, i:i+step])
    plt.xticks(rotation=45, fontsize=8)
    plt.show()
'''
print('')




#### 7.13 Data Normalization

In [249]:
# Copy the dataframe
df_11 = df_10.copy()

In [250]:
from sklearn.preprocessing import MinMaxScaler

def normalize_data(df, target_column):
    scaler = MinMaxScaler()
    df_normalized = df.copy()
    # Hedef değişkeni hariç tutarak bağımsız değişkenleri normalleştir
    columns_to_normalize = df_normalized.select_dtypes(include=['float64', 'int64']).columns.difference([target_column])
    df_normalized[columns_to_normalize] = scaler.fit_transform(df_normalized[columns_to_normalize])
    return df_normalized

# Hedef değişkeni hariç tutarak verileri normalize edin
target_column = 'price_doc'  # Buraya hedef kolonun adını yazın
df_11 = normalize_data(df_11, target_column)


#### 7.14 Check Distributions 2

In [251]:
'''
# Check the visualization
df_sampled = df_11.sample(frac=0.2)  # Verinin %10'unu alıyoruz

num_plots = len(numeric_columns)
plots_per_page = 16  # 4x4 yerleşim
num_pages = math.ceil(num_plots / plots_per_page)

for page in range(num_pages):
    plt.figure(figsize=(15, 15))
    
    start_idx = page * plots_per_page
    end_idx = min(start_idx + plots_per_page, num_plots)
    
    for i, column in enumerate(numeric_columns[start_idx:end_idx], 1):
        plt.subplot(4, 4, i)
        sns.histplot(df_sampled[column], kde=False)
        plt.title(column)
    
    plt.tight_layout()
    plt.show()
'''
print('')





#### 7.15 Correlation Test

In [252]:
# Copy Dataset
df_12 = df_11.copy()

In [253]:
# Hedef değişkeni çıkartarak korelasyon matrisini hesapla
target_column = 'price_doc'  # Buraya hedef kolonun adını yazın
corr_matrix = df_12.drop(columns=[target_column]).corr().abs()

In [254]:
# Üst üçgenin maskelenmesi
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Yüksek korelasyonlu kolonları bulma ve çıkarma (örneğin, 0.8 ve üzeri)
to_drop = [column for column in upper.columns if any(upper[column] > 0.80)]

In [255]:
print(f"drop the most correlated {len(to_drop)} columns")

drop the most correlated 128 columns


In [256]:
# Gereksiz kolonları çıkarma
df_12 = df_12.drop(columns=to_drop)

In [257]:
df_12.head(3)

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,sub_area,area_m,raion_popul,green_zone_part,indust_part,preschool_quota,school_education_centers_top_20_raion,hospital_beds_raion,healthcare_centers_raion,university_top_20_raion,sport_objects_raion,additional_education_raion,culture_objects_top_25,culture_objects_top_25_raion,shopping_centers_raion,office_raion,thermal_power_plant_raion,incineration_raion,oil_chemistry_raion,radiation_raion,railroad_terminal_raion,big_market_raion,nuclear_reactor_raion,detention_facility_raion,full_all,raion_build_count_with_material_info,build_count_block,build_count_wood,build_count_frame,build_count_brick,build_count_monolith,build_count_panel,build_count_foam,build_count_mix,build_count_1921-1945,build_count_1971-1995,build_count_after_1995,metro_min_avto,kindergarten_km,park_km,green_zone_km,industrial_km,water_treatment_km,cemetery_km,incineration_km,railroad_station_walk_km,public_transport_station_km,water_km,water_1line,mkad_km,ttk_km,big_road1_km,big_road1_1line,big_road2_km,railroad_1line,bus_terminal_avto_km,oil_chemistry_km,nuclear_reactor_km,power_transmission_line_km,ts_km,big_market_km,market_shop_km,fitness_km,swim_pool_km,ice_rink_km,detention_facility_km,shopping_centers_km,office_km,additional_education_km,big_church_km,church_synagogue_km,mosque_km,theater_km,catering_km,ecology,green_part_500,prom_part_500,office_count_500,office_sqm_500,trc_count_500,trc_sqm_500,cafe_count_500,cafe_sum_500_min_price_avg,cafe_count_500_na_price,cafe_count_500_price_500,cafe_count_500_price_1500,cafe_count_500_price_2500,cafe_count_500_price_4000,cafe_count_500_price_high,big_church_count_500,church_count_500,mosque_count_500,leisure_count_500,sport_count_500,market_count_500,green_part_1000,prom_part_1000,office_count_1000,office_sqm_1000,trc_count_1000,trc_sqm_1000,cafe_count_1000,cafe_sum_1000_min_price_avg,cafe_count_1000_na_price,cafe_count_1000_price_2500,cafe_count_1000_price_4000,cafe_count_1000_price_high,big_church_count_1000,church_count_1000,mosque_count_1000,leisure_count_1000,sport_count_1000,market_count_1000,trc_count_1500,trc_sqm_1500,cafe_sum_1500_min_price_avg,cafe_count_1500_na_price,cafe_count_1500_price_2500,cafe_count_1500_price_4000,cafe_count_1500_price_high,big_church_count_1500,church_count_1500,mosque_count_1500,leisure_count_1500,market_count_1500,trc_sqm_2000,cafe_sum_2000_min_price_avg,cafe_count_2000_price_4000,cafe_count_2000_price_high,church_count_2000,mosque_count_2000,leisure_count_2000,prom_part_3000,trc_sqm_3000,cafe_sum_3000_min_price_avg,cafe_count_3000_price_4000,cafe_count_3000_price_high,mosque_count_3000,leisure_count_3000,market_count_3000,prom_part_5000,cafe_sum_5000_min_price_avg,mosque_count_5000,market_count_5000,price_doc,timestamp_year
1,0.224422,0.125413,0.130435,0.41892,0.208597,0.547187,0.475211,0.090889,0.357217,0,70,0.115744,0.641209,0.53739,0.107777,0.399613,0.0,0.060856,0.2,0.0,0.333333,0.1,1,0.25,0.1875,0.0,0,0,0,0,0,0,0,0,0.300646,0.331522,0.535484,0.004902,0.0,0.211356,0.068966,0.288462,0.0,0.0,0.009091,0.407767,0.059524,0.051379,0.027974,0.044141,0.055534,0.329877,0.038904,0.793471,0.349541,0.216251,0.01436,0.356293,0,0.693861,0.110439,0.555971,0,0.437445,0,0.233007,0.248396,0.136027,0.119361,0.044044,0.213706,0.238748,0.116086,0.140713,0.492645,0.332047,0.100471,0.092275,0.20795,0.095426,0.164291,0.18814,0.229659,0.071817,0,0.417469,0.0,0.0,0.0,0.0,0.0,0.185185,0.466667,0.0,0.1,0.0,0.0,0.5,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.445596,0.001545,0.095238,0.178739,0.454545,0.187571,0.288889,0.309749,0.0,0.0,0.333333,0.0,0.166667,0.222222,0.0,0.8,0.166667,0.0,0.5,0.168973,0.414863,0.0,0.133333,0.25,0.0,0.083333,0.25,0.0,0.5,0.0,0.176775,0.408306,0.166667,0.0,0.2,0.0,0.4,0.779937,0.278155,0.303697,0.058824,0.0,0.0,0.315789,0.111111,0.99028,0.351038,0.5,0.7,6000000.0,2011
3,0.587459,0.330033,0.391304,0.41892,0.208597,0.547187,0.475211,0.090889,0.357217,0,65,0.162045,1.0,0.272972,0.151455,1.0,0.0,0.355836,0.2,0.0,0.944444,0.6,0,0.0,0.6875,0.083333,0,0,0,0,0,0,0,0,0.075424,0.620924,0.058065,0.25,0.255319,0.391167,0.862069,0.644231,0.0,1.0,0.218182,0.631068,1.0,0.081676,0.034008,0.006001,0.090224,0.153994,0.637559,0.343822,0.602604,0.272453,0.02984,0.612639,0,0.195342,0.52985,0.515616,0,0.391392,0,0.275487,0.802297,0.399647,0.170704,0.035391,0.933646,0.054971,0.108291,0.137205,0.355116,0.230395,0.116065,0.125161,0.173113,0.101765,0.544159,0.881804,0.572666,0.145734,1,0.288276,0.012429,0.0,0.0,0.0,0.0,0.074074,0.583333,0.0,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.321745,0.228376,0.047619,0.022704,0.545455,0.16108,0.266667,0.351932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.5,0.5,0.119008,0.393589,0.0,0.066667,0.0,0.0,0.083333,0.0,0.0,0.0,0.5,0.088347,0.371901,0.0,0.0,0.04,0.0,0.0,0.101172,0.116428,0.384174,0.0,0.0,0.0,0.0,0.333333,0.108785,0.681513,0.0,0.15,13100000.0,2011
7,0.290429,0.290429,0.217391,0.41892,0.208597,0.547187,0.475211,0.090889,0.357217,0,9,0.066542,0.869827,0.265577,0.000152,0.703357,0.0,0.06422,0.2,0.0,0.388889,0.3,0,0.0,1.0,0.020833,0,0,0,0,0,0,0,0,0.341181,0.285326,0.16129,0.0,0.0,0.0,0.034483,0.589744,0.0,0.0,0.0,1.0,0.019841,0.198741,0.025424,0.128991,0.140384,0.398316,0.95118,0.192921,0.143931,0.273612,0.058596,0.289394,0,0.075888,0.417824,0.200651,0,0.465849,0,0.963472,0.540116,0.216564,0.090495,0.29569,0.31674,0.194477,0.059496,0.147053,0.127318,0.109875,0.127223,0.116849,0.072113,0.106522,0.194537,0.163466,0.44309,0.116829,1,0.292594,0.0,0.0,0.0,0.0,0.0,0.148148,0.583333,0.0,0.0,0.285714,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.439077,0.0,0.047619,0.062951,0.272727,0.026322,0.288889,0.475967,0.0,0.181818,0.0,0.0,0.0,0.111111,0.0,0.0,0.25,0.0,0.785714,0.276163,0.300747,0.1,0.133333,0.0,0.0,0.083333,0.15,0.0,0.0,0.25,0.223397,0.32468,0.166667,0.0,0.16,0.0,0.0,0.386682,0.779316,0.296817,0.058824,0.0,0.0,0.0,0.222222,0.391028,0.396269,0.5,0.2,2000000.0,2011


### 8- Building Model-0 // Benchmark

In [258]:
# Copy dataset
df_13 = df_12.copy()

In [259]:
df_13.info(1)

<class 'pandas.core.frame.DataFrame'>
Index: 16692 entries, 1 to 7660
Data columns (total 156 columns):
 #    Column                                 Dtype  
---   ------                                 -----  
 0    full_sq                                float64
 1    life_sq                                float64
 2    floor                                  float64
 3    max_floor                              float64
 4    material                               float64
 5    build_year                             float64
 6    num_room                               float64
 7    kitch_sq                               float64
 8    state                                  float64
 9    product_type                           int32  
 10   sub_area                               int32  
 11   area_m                                 float64
 12   raion_popul                            float64
 13   green_zone_part                        float64
 14   indust_part                            flo

#### 8.1- Logistic Regression

In [260]:
# Separate the target variable and independent variables
target_column = 'price_doc'  

X = df_13.drop(columns=[target_column])
y = df_13[target_column]


In [261]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [262]:
# Create the Linear Regression model
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)

In [263]:
# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 4249879544852.8145
R^2 Score: 0.44098415551818104


#### 8.2- Ridge Regression

In [264]:
from sklearn.linear_model import Ridge

# Ridge Regression modelini oluşturma
ridge_model = Ridge(alpha=1.0)  # Alpha düzenleme parametresidir
ridge_model.fit(X_train, y_train)

# Tahmin yapma
y_pred_ridge = ridge_model.predict(X_test)

# Performansı değerlendirme
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print(f"Ridge Regression - Mean Squared Error: {mse_ridge}")
print(f"Ridge Regression - R^2 Score: {r2_ridge}")


Ridge Regression - Mean Squared Error: 4251866078411.1675
Ridge Regression - R^2 Score: 0.4407228531158437


#### 8.3- Lasso Regression

In [265]:
from sklearn.linear_model import Lasso

# Lasso Regression modelini oluşturma
lasso_model = Lasso(alpha=0.1)  # Alpha düzenleme parametresidir
lasso_model.fit(X_train, y_train)

# Tahmin yapma
y_pred_lasso = lasso_model.predict(X_test)

# Performansı değerlendirme
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

print(f"Lasso Regression - Mean Squared Error: {mse_lasso}")
print(f"Lasso Regression - R^2 Score: {r2_lasso}")


Lasso Regression - Mean Squared Error: 4247317501678.2085
Lasso Regression - R^2 Score: 0.44132115865291355


  model = cd_fast.enet_coordinate_descent(


### 9- Feature Engineering 1

In [275]:
# Copy dataset
df_14 = df_13.copy()

In [276]:
# Clean the column names
df_14.columns = df_14.columns.str.replace(' ', '_').str.replace('-', '_').str.lower()

In [274]:
df_14.head(1)

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,sub_area,area_m,raion_popul,green_zone_part,indust_part,preschool_quota,school_education_centers_top_20_raion,hospital_beds_raion,healthcare_centers_raion,university_top_20_raion,sport_objects_raion,additional_education_raion,culture_objects_top_25,culture_objects_top_25_raion,shopping_centers_raion,office_raion,thermal_power_plant_raion,incineration_raion,oil_chemistry_raion,radiation_raion,railroad_terminal_raion,big_market_raion,nuclear_reactor_raion,detention_facility_raion,full_all,raion_build_count_with_material_info,build_count_block,build_count_wood,build_count_frame,build_count_brick,build_count_monolith,build_count_panel,build_count_foam,build_count_mix,build_count_1921_1945,build_count_1971_1995,build_count_after_1995,metro_min_avto,kindergarten_km,park_km,green_zone_km,industrial_km,water_treatment_km,cemetery_km,incineration_km,railroad_station_walk_km,public_transport_station_km,water_km,water_1line,mkad_km,ttk_km,big_road1_km,big_road1_1line,big_road2_km,railroad_1line,bus_terminal_avto_km,oil_chemistry_km,nuclear_reactor_km,power_transmission_line_km,ts_km,big_market_km,market_shop_km,fitness_km,swim_pool_km,ice_rink_km,detention_facility_km,shopping_centers_km,office_km,additional_education_km,big_church_km,church_synagogue_km,mosque_km,theater_km,catering_km,ecology,green_part_500,prom_part_500,office_count_500,office_sqm_500,trc_count_500,trc_sqm_500,cafe_count_500,cafe_sum_500_min_price_avg,cafe_count_500_na_price,cafe_count_500_price_500,cafe_count_500_price_1500,cafe_count_500_price_2500,cafe_count_500_price_4000,cafe_count_500_price_high,big_church_count_500,church_count_500,mosque_count_500,leisure_count_500,sport_count_500,market_count_500,green_part_1000,prom_part_1000,office_count_1000,office_sqm_1000,trc_count_1000,trc_sqm_1000,cafe_count_1000,cafe_sum_1000_min_price_avg,cafe_count_1000_na_price,cafe_count_1000_price_2500,cafe_count_1000_price_4000,cafe_count_1000_price_high,big_church_count_1000,church_count_1000,mosque_count_1000,leisure_count_1000,sport_count_1000,market_count_1000,trc_count_1500,trc_sqm_1500,cafe_sum_1500_min_price_avg,cafe_count_1500_na_price,cafe_count_1500_price_2500,cafe_count_1500_price_4000,cafe_count_1500_price_high,big_church_count_1500,church_count_1500,mosque_count_1500,leisure_count_1500,market_count_1500,trc_sqm_2000,cafe_sum_2000_min_price_avg,cafe_count_2000_price_4000,cafe_count_2000_price_high,church_count_2000,mosque_count_2000,leisure_count_2000,prom_part_3000,trc_sqm_3000,cafe_sum_3000_min_price_avg,cafe_count_3000_price_4000,cafe_count_3000_price_high,mosque_count_3000,leisure_count_3000,market_count_3000,prom_part_5000,cafe_sum_5000_min_price_avg,mosque_count_5000,market_count_5000,price_doc,timestamp_year
1,0.224422,0.125413,0.130435,0.41892,0.208597,0.547187,0.475211,0.090889,0.357217,0,70,0.115744,0.641209,0.53739,0.107777,0.399613,0.0,0.060856,0.2,0.0,0.333333,0.1,1,0.25,0.1875,0.0,0,0,0,0,0,0,0,0,0.300646,0.331522,0.535484,0.004902,0.0,0.211356,0.068966,0.288462,0.0,0.0,0.009091,0.407767,0.059524,0.051379,0.027974,0.044141,0.055534,0.329877,0.038904,0.793471,0.349541,0.216251,0.01436,0.356293,0,0.693861,0.110439,0.555971,0,0.437445,0,0.233007,0.248396,0.136027,0.119361,0.044044,0.213706,0.238748,0.116086,0.140713,0.492645,0.332047,0.100471,0.092275,0.20795,0.095426,0.164291,0.18814,0.229659,0.071817,0,0.417469,0.0,0.0,0.0,0.0,0.0,0.185185,0.466667,0.0,0.1,0.0,0.0,0.5,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.445596,0.001545,0.095238,0.178739,0.454545,0.187571,0.288889,0.309749,0.0,0.0,0.333333,0.0,0.166667,0.222222,0.0,0.8,0.166667,0.0,0.5,0.168973,0.414863,0.0,0.133333,0.25,0.0,0.083333,0.25,0.0,0.5,0.0,0.176775,0.408306,0.166667,0.0,0.2,0.0,0.4,0.779937,0.278155,0.303697,0.058824,0.0,0.0,0.315789,0.111111,0.99028,0.351038,0.5,0.7,6000000.0,2011


#### 9.1 Combine the columns

In [279]:
# Combine cafe counts
cafe_count_columns = [col for col in df_14.columns if 'cafe_count_' in col and 'price' not in col]
df_14['total_cafe_count'] = df_14[cafe_count_columns].sum(axis=1)
df_14 = df_14.drop(columns=cafe_count_columns)

In [281]:
# Combine cafe prices
cafe_price_columns = [col for col in df_14.columns if 'cafe_sum_' in col or 'cafe_avg_price_' in col or ('cafe_count_' in col and 'price' in col)]
df_14['avg_cafe_price'] = df_14[cafe_price_columns].mean(axis=1)
df_14 = df_14.drop(columns=cafe_price_columns)

In [283]:
# Combine church counts
church_count_columns = [col for col in df_14.columns if 'church_count_' in col or 'big_church_count_' in col]
df_14['total_church_count'] = df_14[church_count_columns].sum(axis=1)
df_14 = df_14.drop(columns=church_count_columns)

In [284]:
# Combine mosque counts
mosque_count_columns = [col for col in df_14.columns if 'mosque_count_' in col]
df_14['total_mosque_count'] = df_14[mosque_count_columns].sum(axis=1)
df_14 = df_14.drop(columns=mosque_count_columns)

In [285]:
# Combine leisure counts
leisure_count_columns = [col for col in df_14.columns if 'leisure_count_' in col]
df_14['total_leisure_count'] = df_14[leisure_count_columns].sum(axis=1)
df_14 = df_14.drop(columns=leisure_count_columns)

In [286]:
# Combine market counts
market_count_columns = [col for col in df_14.columns if 'market_count_' in col]
df_14['total_market_count'] = df_14[market_count_columns].sum(axis=1)
df_14 = df_14.drop(columns=market_count_columns)

In [287]:
# Combine shopping center counts
shopping_center_columns = [col for col in df_14.columns if 'trc_count_' in col]
df_14['total_shopping_centers'] = df_14[shopping_center_columns].sum(axis=1)
df_14 = df_14.drop(columns=shopping_center_columns)

In [288]:
df_14.info(1)

<class 'pandas.core.frame.DataFrame'>
Index: 16692 entries, 1 to 7660
Data columns (total 111 columns):
 #    Column                                 Dtype  
---   ------                                 -----  
 0    full_sq                                float64
 1    life_sq                                float64
 2    floor                                  float64
 3    max_floor                              float64
 4    material                               float64
 5    build_year                             float64
 6    num_room                               float64
 7    kitch_sq                               float64
 8    state                                  float64
 9    product_type                           int32  
 10   sub_area                               int32  
 11   area_m                                 float64
 12   raion_popul                            float64
 13   green_zone_part                        float64
 14   indust_part                            flo

### 10- Building Model-1 vs. Benchmark

In [201]:
# Separate the target variable and independent variables
target_column = 'price_doc'  

X = df_14.drop(columns=[target_column])
y = df_14[target_column]


In [202]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [203]:
# Create the Linear Regression model
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)

In [204]:
# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 4288706974657.939
R^2 Score: 0.43587691701118414


### 11- Feature Engineering 2

In [205]:
# Copy dataset
df_15 = df_14.copy()

In [207]:
df_15.info(1)

<class 'pandas.core.frame.DataFrame'>
Index: 16692 entries, 1 to 7660
Data columns (total 125 columns):
 #    Column                                 Dtype  
---   ------                                 -----  
 0    full_sq                                float64
 1    life_sq                                float64
 2    floor                                  float64
 3    max_floor                              float64
 4    material                               float64
 5    build_year                             float64
 6    num_room                               float64
 7    kitch_sq                               float64
 8    state                                  float64
 9    product_type                           int32  
 10   sub_area                               int32  
 11   area_m                                 float64
 12   raion_popul                            float64
 13   green_zone_part                        float64
 14   indust_part                            flo

#### 11.1 Combine the columns

In [None]:
# Combining all cafe count averages into a single column
df_15['avg_cafe_count_total'] = df_15[['cafe_count_500', 'cafe_count_1000', 'cafe_count_1500', 'cafe_count_2000', 'cafe_count_3000', 'cafe_count_5000']].mean(axis=1)

In [None]:
# Dropping the original columns after combining them
df_15 = df_15.drop(columns=['cafe_count_500', 'cafe_count_1000', 'cafe_count_1500', 
                            'cafe_count_2000', 'cafe_count_3000', 'cafe_count_5000'])

In [133]:
# Combining all church counts into a single column
df_15['total_church_count'] = df_15[['big_church_count_500', 'church_count_500', 'big_church_count_1000', 
                                     'church_count_1000', 'big_church_count_1500', 'church_count_1500', 
                                     'big_church_count_2000', 'church_count_2000', 'big_church_count_3000', 
                                     'church_count_3000', 'big_church_count_5000', 'church_count_5000']].sum(axis=1)

# Dropping the original columns after combining them
df_15 = df_15.drop(columns=['big_church_count_500', 'church_count_500', 'big_church_count_1000', 
                            'church_count_1000', 'big_church_count_1500', 'church_count_1500', 
                            'big_church_count_2000', 'church_count_2000', 'big_church_count_3000', 
                            'church_count_3000', 'big_church_count_5000', 'church_count_5000'])


KeyError: "['big_church_count_500', 'church_count_500', 'big_church_count_1000', 'church_count_1000', 'big_church_count_1500', 'church_count_1500', 'church_count_2000'] not in index"

In [None]:
# Combining all mosque counts into a single column
df_15['total_mosque_count'] = df_15[['mosque_count_500', 'mosque_count_1000', 'mosque_count_1500', 
                                     'mosque_count_2000', 'mosque_count_3000', 'mosque_count_5000']].sum(axis=1)

# Dropping the original columns after combining them
df_15 = df_15.drop(columns=['mosque_count_500', 'mosque_count_1000', 'mosque_count_1500', 
                            'mosque_count_2000', 'mosque_count_3000', 'mosque_count_5000'])


In [None]:
# Combining all leisure counts into a single column
df_15['total_leisure_count'] = df_15[['leisure_count_500', 'leisure_count_1000', 'leisure_count_1500', 
                                      'leisure_count_2000', 'leisure_count_3000', 'leisure_count_5000']].sum(axis=1)

# Dropping the original columns after combining them
df_15 = df_15.drop(columns=['leisure_count_500', 'leisure_count_1000', 'leisure_count_1500', 
                            'leisure_count_2000', 'leisure_count_3000', 'leisure_count_5000'])


In [None]:
# Combining all market counts into a single column
df_15['total_market_count'] = df_15[['market_count_500', 'market_count_1000', 'market_count_1500', 
                                     'market_count_2000', 'market_count_3000', 'market_count_5000']].sum(axis=1)

# Dropping the original columns after combining them
df_15 = df_15.drop(columns=['market_count_500', 'market_count_1000', 'market_count_1500', 
                            'market_count_2000', 'market_count_3000', 'market_count_5000'])


In [None]:
# Combining all shopping center counts into a single column
df_15['total_shopping_centers'] = df_15[['trc_count_500', 'trc_count_1000', 'trc_count_1500', 
                                         'trc_count_2000', 'trc_count_3000', 'trc_count_5000']].sum(axis=1)

# Dropping the original columns after combining them
df_15 = df_15.drop(columns=['trc_count_500', 'trc_count_1000', 'trc_count_1500', 
                            'trc_count_2000', 'trc_count_3000', 'trc_count_5000'])


### 8- Data Cleaning

#### 8.1- Drop Outliers

##### 8.1.1- Balance

##### 8.1.2- EstimatedSalary

#### 8.2- Standart Scaler

### 9- Building Alternative Models

#### 9.1- Model 1 - Decision Trees

#### 9.2- Model 2 - Random Forest

#### 9.3- Model 3 - Gradient Boosting Machines

#### 9.4- Model 4 - xgboost

#### 10- Handling Imbalanced Data

#### 11- Re-Train Models with Balanced Dataset 

### 12- Feature Engineering

#### 12.1 Correlation Test

#### 12.2 Recursive Feature Elimination

### 13- Hyperparameter Tuning

#### 13.1- Grid Search

#### 13.2- Random Search

### 14- Cross Validation