In [20]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from datetime import datetime

In [21]:
df = pd.read_csv('housing.csv')#Veri setini okuyoruz

In [22]:
q_75 = df['median_house_value'].quantile(q=0.75)  #  1. ve 3. çeyrek değerleri hesaplanır. IQR yardımıyla uç değer sınırları belirlenir ve buranın dışında kalan değerler atılır
q_25 = df['median_house_value'].quantile(q=0.25)  # Bu sayede uç değerlerden yani veride diğer değerlerden daha anormal büyük ve küçük olan verilerden kurtulunur.
iqr = q_75 - q_25
upper_bound = q_75 + 1.5 * iqr
df.loc[df['median_house_value'] > upper_bound, 'median_house_value'] = upper_bound

In [23]:
X = df[['longitude', 'latitude', 'housing_median_age', 'total_rooms', # Bu kısımda bağımsız ve bağımlı değişkenler ayrılır
                    'total_bedrooms', 'population', 'households', 'median_income', 'ocean_proximity']]
y = df['median_house_value']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, shuffle=True, test_size=0.3) # Eğitim ve test verisi ayrılır

In [24]:
class DataPreprocessing:
    quantitative = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population',
                    'households', 'median_income']
    ssc = StandardScaler()

    def __init__(self, y=None):
        """Ana istatistiksel özelliklerin başlangıçta tanımlanması"""
        self.q_25 = None
        self.medians = None
        self.q_75 = None
        self.means = None

    def fit(self, X:pd.DataFrame, y=None) -> None:  # Verideki her nicel değişkenin çeyrek, ortalama ve medyan  gibi değerleri hesaplanır. Bu veriler daha sonra eksik değerleri doldurmak
        """Eğitim verisi üzerinde istatistiksel özelliklerin hesaplanması"""# ve uç değerleri işlememiz için kullanılır
        self.q_25 = X[DataPreprocessing.quantitative].quantile(q=0.25)
        self.medians = X[DataPreprocessing.quantitative].quantile(q=0.5)
        self.q_75 = X[DataPreprocessing.quantitative].quantile(q=0.75)
        self.means = X[DataPreprocessing.quantitative].mean()

    def transform(self, X:pd.DataFrame, y=None): # Yine IQR yardımıyla her sütun için uç değerler düzeltilir. IQR (Interquartile Range):
        for column in X[DataPreprocessing.quantitative].columns:  
            q_3 = self.q_75[column]
            q_1 = self.q_25[column]
            iqr = q_3 - q_1
            upper_bound = q_3 + 1.5 * iqr
            lower_bound = q_1 - 1.5 * iqr
            X.loc[X[column] > upper_bound, column] = q_3
            X.loc[X[column] < lower_bound, column] = q_1 #Sınırların dışında kalan değerler uç değer olarak atanırlar.

        # Eksik verilerin ortalama ile doldurulması.
        for column in X[DataPreprocessing.quantitative].columns:  
            X[column].fillna(self.means[column], inplace=True)

        # Özellik çıkarımı. Elimizdeki özellikleri kullanarak öğrenim ve yorum kalitesini arrttıracak anlamlı özellikler oluşturma.
        X['population_per_room'] = X['population'] / X['total_rooms']

        X['bedroom_share'] = X['total_bedrooms'] / X['total_rooms'] * 100

        X['diag_coord'] = X['longitude'] + X['latitude']

        dummy = pd.get_dummies(X['ocean_proximity']) # Kategorik değişkenin one-hot encoding ile dönüştürülmesi
        X['_1H OCEAN'] = dummy['<1H OCEAN']
        X['INLAND'] = dummy['INLAND']
        X['ISLAND'] = dummy['ISLAND']
        X['NEAR BAY'] = dummy['NEAR BAY']
        X['NEAR OCEAN'] = dummy['NEAR OCEAN']
        X.drop(['ocean_proximity'],axis=1, inplace=True) # Orijinal sütun kaldırılır

        # Yaş kategorisi oluşturulması
        X['age_cat'] = 0
        X.loc[X['housing_median_age'] <= 5, 'age_cat'] = 1  # 5 yaş ve altı
        X.loc[(X['housing_median_age'] > 5) & (X['housing_median_age'] <= 10), 'age_cat'] = 2  # 6-10 yaş arası
        X.loc[(X['housing_median_age'] > 10) & (X['housing_median_age'] <= 25), 'age_cat'] = 3  # 11-25 yaş arası
        X.loc[X['housing_median_age'] > 25, 'age_cat'] = 4  # 25 yaş üstü


    def scaling(X:pd.DataFrame, data_type: str, y=None) -> pd.DataFrame:
        """Scaling (Data standardization) for linear models.
        ssc = StandardScaler(), ssc was define as a class argument"""
        if data_type.lower() == 'train':
            return pd.DataFrame(DataPreprocessing.ssc.fit_transform(X), columns=X.columns, index=X.index)
        if data_type.lower() == 'test':
            return pd.DataFrame(DataPreprocessing.ssc.transform(X), columns=X.columns, index=X.index)

preprocessing = DataPreprocessing()
preprocessing.fit(X)
preprocessing.transform(X_train)
preprocessing.transform(X_test)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [25]:
X_train_scale = DataPreprocessing.scaling(X_train, 'train')
X_test_scale = DataPreprocessing.scaling(X_test, 'test')

In [26]:
# Model Oluşturma
mlp = Sequential()
mlp.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
mlp.add(Dense(128, activation='relu'))
mlp.add(Dense(64, activation='relu'))
mlp.add(Dense(1, activation='linear'))  # Regresyon için linear aktivasyon
mlp.summary()


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



In [27]:
# Model Derleme
mlp.compile(loss='mean_squared_error', optimizer='Adam', metrics=['mean_squared_error'])

# Model Eğitimi
history = mlp.fit(X_train, y_train, epochs=50, batch_size=128, validation_split=0.1)

# Model Performansı
y_pred_train = mlp.predict(X_train)
y_pred_test = mlp.predict(X_test)

train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

print(f"Train RMSE: {train_rmse}, Train R²: {train_r2}")
print(f"Test RMSE: {test_rmse}, Test R²: {test_r2}")


Epoch 1/50
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 50794074112.0000 - mean_squared_error: 50794074112.0000 - val_loss: 17928112128.0000 - val_mean_squared_error: 17928112128.0000
Epoch 2/50
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 16917576704.0000 - mean_squared_error: 16917576704.0000 - val_loss: 16248873984.0000 - val_mean_squared_error: 16248873984.0000
Epoch 3/50
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 15354663936.0000 - mean_squared_error: 15354663936.0000 - val_loss: 13182801920.0000 - val_mean_squared_error: 13182801920.0000
Epoch 4/50
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 12323742720.0000 - mean_squared_error: 12323742720.0000 - val_loss: 11251017728.0000 - val_mean_squared_error: 11251017728.0000
Epoch 5/50
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 11115075584.0000 -

In [28]:
# Eğitim ve Doğrulama Kaybı Grafiği
fig_loss = go.Figure()
fig_loss.add_trace(go.Scatter(
    y=history.history['loss'],
    mode='lines',
    name='Train Loss',
    line=dict(color='blue')
))
fig_loss.add_trace(go.Scatter(
    y=history.history['val_loss'],
    mode='lines',
    name='Validation Loss',
    line=dict(color='orange')
))
fig_loss.update_layout(
    title='<b>Epok Başına Kayıp Miktarı</b>',
    xaxis_title='Epoklar',
    yaxis_title='Kayıp (MSE)',
    template='plotly_dark',
    title_x=0.5
)
fig_loss.show()

In [32]:
# Gerçek ve Tahmin Karşılaştırması Grafiği
y_pred_test = mlp.predict(X_test).flatten()  # Test setine göre tahmin
fig_compare = go.Figure()
fig_compare.add_trace(go.Scatter(
    x=y_test,
    y=y_pred_test,
    mode='markers',
    name='Tahminler',
    marker=dict(color='red', opacity=0.7)
))
fig_compare.add_trace(go.Scatter(
    x=[y_test.min(), y_test.max()],
    y=[y_test.min(), y_test.max()],
    mode='lines',
    name='Trend Çizgisi',
    line=dict(color='green', dash='dash')
))
fig_compare.update_layout(
    title='<b>Gerçek ve Tahmin Edilen Değerler</b>',
    xaxis_title='Gerçek Değerler',
    yaxis_title='Tahmin Edilen Değerler',
    template='plotly_dark',
    title_x=0.5
)
fig_compare.show()

[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 461us/step
