In [80]:
# VERİ SETİNİ GETİRME
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

data = sns.load_dataset("tips")
df = pd.DataFrame(data)

# Eksik Verileri Ortalama ile Doldurma
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputer = imputer.fit(df[["total_bill"]])
df[["total_bill"]] = imputer.transform(df[["total_bill"]])

# Kategorik Olmayan Değişkenler
total_bill = df[["total_bill"]]
tip = df[["tip"]]
size = df[["size"]]

# Ktegorik Değişkenler İçin 0-1 Dönüşümü
smoker = pd.get_dummies(df[["smoker"]])
time = pd.get_dummies(df[["day"]])

# Kukla Değişkeni Silme
smoker = smoker.drop("smoker_Yes", axis=1)

# Verileri Birleştirme
new_data = pd.concat([total_bill, size, smoker, time, tip], axis=1)

# Train ve Test Belirleme
training = new_data.iloc[:, 0:7]
testing = new_data.iloc[:, 7:8]

# Train Test Ayırma
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(training, testing, test_size=0.30, random_state=42)

In [81]:
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor(random_state = 0)

dtr.fit(x_train, y_train)


DecisionTreeRegressor(random_state=0)

In [82]:
y_pred = dtr.predict(x_test)

In [84]:
y_tahmin = pd.DataFrame(data = y_pred)
y_tahmin.index = y_test.index

result = pd.concat([y_tahmin, y_test],axis=1)
result.columns = [["Tahmin","Gerçek"]]

print(result)

    Tahmin Gerçek
24    2.75   3.18
6     1.32   2.00
153   3.61   2.00
211   2.00   5.16
198   2.00   2.00
..     ...    ...
165   4.00   3.48
154   3.00   2.00
216   3.09   3.00
79    3.50   2.71
29    2.75   3.00

[74 rows x 2 columns]


### Stats Model - Normal

In [85]:
import statsmodels.api as sm

decision_tree = sm.OLS(dtr.predict(x_train), x_train)
model = decision_tree.fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.494
Model:,OLS,Adj. R-squared:,0.475
Method:,Least Squares,F-statistic:,26.52
Date:,"Sat, 17 Oct 2020",Prob (F-statistic):,6.66e-22
Time:,11:10:48,Log-Likelihood:,-247.39
No. Observations:,170,AIC:,508.8
Df Residuals:,163,BIC:,530.7
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
total_bill,0.0962,0.011,8.511,0.000,0.074,0.119
size,0.2751,0.109,2.517,0.013,0.059,0.491
smoker_No,0.2959,0.180,1.645,0.102,-0.059,0.651
day_Thur,0.2817,0.300,0.938,0.350,-0.311,0.874
day_Fri,0.4295,0.342,1.257,0.210,-0.245,1.104
day_Sat,0.2260,0.296,0.763,0.447,-0.359,0.811
day_Sun,0.1750,0.325,0.539,0.591,-0.466,0.816

0,1,2,3
Omnibus:,14.568,Durbin-Watson:,2.0
Prob(Omnibus):,0.001,Jarque-Bera (JB):,25.635
Skew:,0.422,Prob(JB):,2.71e-06
Kurtosis:,4.705,Cond. No.,150.0


### MSE - Normal

In [86]:
from sklearn.metrics import mean_squared_error 

mse = np.sqrt(mean_squared_error(y_train, model.predict(x_train)))
print("Train RMSE: ", mse)
mse = np.sqrt(mean_squared_error(y_test, model.predict(x_test)))
print("Test RMSE: ", mse)

Train RMSE:  1.0420494266419704
Test RMSE:  0.9584301084717695


### Model Tuning

In [88]:
# Train ve Test Belirleme
training = new_data.iloc[:, 0:2]
testing = new_data.iloc[:, 7:8]

# Train Test Ayırma
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(training, testing, test_size=0.30, random_state=42)

### Stats Model - Tuned

In [89]:
import statsmodels.api as sm
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor(random_state = 0)
dtr.fit(x_train, y_train)

decision_tree = sm.OLS(dtr.predict(x_train), x_train)
model = decision_tree.fit()
model.summary()


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.905
Model:,OLS,Adj. R-squared (uncentered):,0.904
Method:,Least Squares,F-statistic:,797.8
Date:,"Sat, 17 Oct 2020",Prob (F-statistic):,1.6900000000000002e-86
Time:,11:14:58,Log-Likelihood:,-250.15
No. Observations:,170,AIC:,504.3
Df Residuals:,168,BIC:,510.6
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
total_bill,0.0963,0.010,9.219,0.000,0.076,0.117
size,0.4203,0.085,4.966,0.000,0.253,0.587

0,1,2,3
Omnibus:,8.731,Durbin-Watson:,1.96
Prob(Omnibus):,0.013,Jarque-Bera (JB):,17.772
Skew:,0.005,Prob(JB):,0.000138
Kurtosis:,4.584,Cond. No.,23.5


### MSE - Tuned

In [90]:
from sklearn.metrics import mean_squared_error 

mse = np.sqrt(mean_squared_error(y_train, model.predict(x_train)))
print("Train RMSE: ", mse)
mse = np.sqrt(mean_squared_error(y_test, model.predict(x_test)))
print("Test RMSE: ", mse)

Train RMSE:  1.0617040982230845
Test RMSE:  0.9700308638353677
