In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.decomposition import PCA
from sklearn.ensemble import  RandomForestRegressor
from datetime import datetime

In [2]:
# Veri Setini Yükleme
df = pd.read_csv('housing.csv')

In [3]:
q_75 = df['median_house_value'].quantile(q=0.75)  #  Uç değerlerle ilgilenilmesi
q_25 = df['median_house_value'].quantile(q=0.25)
iqr = q_75 - q_25
upper_bound = q_75 + 1.5 * iqr
df.loc[df['median_house_value'] > upper_bound, 'median_house_value'] = upper_bound

In [4]:
X= df[['longitude', 'latitude', 'housing_median_age', 'total_rooms',
                    'total_bedrooms', 'population', 'households', 'median_income', 'ocean_proximity']]
y = df['median_house_value']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, shuffle=True, test_size=0.3)

In [5]:
class DataPreprocessing:
    quantitative = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population',
                    'households', 'median_income']
    ssc = StandardScaler()

    def __init__(self, y=None):

        self.q_25 = None
        self.medians = None
        self.q_75 = None
        self.means = None

    def fit(self, X:pd.DataFrame, y=None) -> None:

        self.q_25 = X[DataPreprocessing.quantitative].quantile(q=0.25)
        self.medians = X[DataPreprocessing.quantitative].quantile(q=0.5)
        self.q_75 = X[DataPreprocessing.quantitative].quantile(q=0.75)
        self.means = X[DataPreprocessing.quantitative].mean()

    def transform(self, X:pd.DataFrame, y=None):
        for column in X[DataPreprocessing.quantitative].columns:  # Uç Değerlerin silinmesi
            q_3 = self.q_75[column]
            q_1 = self.q_25[column]
            iqr = q_3 - q_1
            upper_bound = q_3 + 1.5 * iqr
            lower_bound = q_1 - 1.5 * iqr
            X.loc[X[column] > upper_bound, column] = q_3
            X.loc[X[column] < lower_bound, column] = q_1

        for column in X[DataPreprocessing.quantitative].columns:
            X[column].fillna(self.means[column], inplace=True)

        # Veri Dönüştürme
        X['population_per_room'] = X['population'] / X['total_rooms']

        X['bedroom_share'] = X['total_bedrooms'] / X['total_rooms'] * 100

        X['diag_coord'] = X['longitude'] + X['latitude']

        dummy = pd.get_dummies(X['ocean_proximity'])
        X['_1H OCEAN'] = dummy['<1H OCEAN']
        X['INLAND'] = dummy['INLAND']
        X['ISLAND'] = dummy['ISLAND']
        X['NEAR BAY'] = dummy['NEAR BAY']
        X['NEAR OCEAN'] = dummy['NEAR OCEAN']
        X.drop(['ocean_proximity'],axis=1, inplace=True)

        X['age_cat'] = 0
        X.loc[X['housing_median_age'] <= 5, 'age_cat'] = 1
        X.loc[(X['housing_median_age'] > 5) & (X['housing_median_age'] <= 10), 'age_cat'] = 2
        X.loc[(X['housing_median_age'] > 10) & (X['housing_median_age'] <= 25), 'age_cat'] = 3
        X.loc[X['housing_median_age'] > 25, 'age_cat'] = 4


    def scaling(X:pd.DataFrame, data_type: str, y=None) -> pd.DataFrame:
        """Scaling (Data standardization) for linear models.
        ssc = StandardScaler(), ssc was define as a class argument"""
        if data_type.lower() == 'train':
            return pd.DataFrame(DataPreprocessing.ssc.fit_transform(X), columns=X.columns, index=X.index)
        if data_type.lower() == 'test':
            return pd.DataFrame(DataPreprocessing.ssc.transform(X), columns=X.columns, index=X.index)

preprocessing = DataPreprocessing()
preprocessing.fit(X)
preprocessing.transform(X_train)
preprocessing.transform(X_test)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(self.means[column], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(self.means[column], inplace=True)


In [6]:
X_train_scale = DataPreprocessing.scaling(X_train, 'train')
X_test_scale = DataPreprocessing.scaling(X_test, 'test')

In [7]:
df.head()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [9]:




# Random Forest Modeli Tanımlama ve Eğitme
rf_model = RandomForestRegressor(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

# Tahmin Yapma
y_pred = rf_model.predict(X_test)

# Performans Değerlendirme
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


print(f"R-squared (R2): {r2}")

# Test setinde median_income ve median_house_value arasındaki ilişkiyi görselleştirme
results_df = X_test.copy()
results_df['median_house_value_actual'] = y_test
results_df['median_house_value_predicted'] = y_pred

# Grafiği Oluşturma
fig = px.scatter(
    results_df,
    x="median_income",
    y="median_house_value_actual",
    trendline="ols",
    color='median_house_value_actual',
    color_discrete_sequence=['steelblue'],
    color_continuous_scale=px.colors.sequential.Blues,
    template='plotly_dark',
    title='<b>OLS Regresyon Çizgisi</b>',
)

fig.update_layout(
    title_x=0.5,
    xaxis_title="Median Income",
    yaxis_title="Median House Value",
)

fig.show()


R-squared (R2): 0.8286757796724149
