In [1]:
# VERİ SETİNİ GETİRME
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

data = sns.load_dataset("tips")
df = pd.DataFrame(data)

# Eksik Verileri Ortalama ile Doldurma
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputer = imputer.fit(df[["total_bill"]])
df[["total_bill"]] = imputer.transform(df[["total_bill"]])

# Kategorik Olmayan Değişkenler
total_bill = df[["total_bill"]]
tip = df[["tip"]]
size = df[["size"]]

# Ktegorik Değişkenler İçin 0-1 Dönüşümü
smoker = pd.get_dummies(df[["smoker"]])
time = pd.get_dummies(df[["day"]])

# Kukla Değişkeni Silme
smoker = smoker.drop("smoker_Yes", axis=1)

# Verileri Birleştirme
new_data = pd.concat([total_bill, size, smoker, time, tip], axis=1)

# Train ve Test Belirleme
training = new_data.iloc[:, 0:7]
testing = new_data.iloc[:, 7:8]

# Train Test Ayırma
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(training, testing, test_size=0.30, random_state=42)

In [2]:
from warnings import filterwarnings
filterwarnings('ignore')

### Random Forest Model

In [3]:
from sklearn.ensemble import RandomForestRegressor

randomforest_regressor = RandomForestRegressor()
randomforest_regressor.fit(x_train, y_train)

RandomForestRegressor()

#### Stats Model

In [4]:
import statsmodels.api as sm

model = sm.OLS(randomforest_regressor.predict(x_train), x_train)
model = model.fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.669
Model:,OLS,Adj. R-squared:,0.657
Method:,Least Squares,F-statistic:,54.95
Date:,"Sun, 18 Oct 2020",Prob (F-statistic):,1.1e-36
Time:,21:17:46,Log-Likelihood:,-180.99
No. Observations:,170,AIC:,376.0
Df Residuals:,163,BIC:,397.9
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
total_bill,0.0924,0.008,12.078,0.000,0.077,0.108
size,0.2791,0.074,3.774,0.000,0.133,0.425
smoker_No,0.2964,0.122,2.435,0.016,0.056,0.537
day_Thur,0.2950,0.203,1.452,0.148,-0.106,0.696
day_Fri,0.5303,0.231,2.295,0.023,0.074,0.987
day_Sat,0.2838,0.201,1.415,0.159,-0.112,0.680
day_Sun,0.2730,0.220,1.242,0.216,-0.161,0.707

0,1,2,3
Omnibus:,6.557,Durbin-Watson:,1.856
Prob(Omnibus):,0.038,Jarque-Bera (JB):,9.722
Skew:,0.157,Prob(JB):,0.00774
Kurtosis:,4.129,Cond. No.,150.0


### Model Tuning

In [5]:
training = new_data.iloc[:, 0:3]
testing = new_data.iloc[:, 7:8]

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(training, testing, test_size=0.30, random_state=42)

#### Grid Search

In [6]:
randomforest_params = {"n_estimators": [1,10,20,30,40,50,100],
                       "criterion": ["mse", "mae"],
                       "max_features": ["sqrt", "auto"],
                       "min_samples_split":np.arange(1,10,1)}

In [7]:
from sklearn.model_selection import GridSearchCV

randomforest_model = GridSearchCV(randomforest_regressor, randomforest_params, cv=10)
randomforest_model.fit(x_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid={'criterion': ['mse', 'mae'],
                         'max_features': ['sqrt', 'auto'],
                         'min_samples_split': array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
                         'n_estimators': [1, 10, 20, 30, 40, 50, 100]})

In [8]:
print("Best Parameter: ", randomforest_model.best_params_)

Best Parameter:  {'criterion': 'mae', 'max_features': 'sqrt', 'min_samples_split': 9, 'n_estimators': 50}


#### Random Forest Tuned

In [9]:
from sklearn.ensemble import RandomForestRegressor

randomforest_tuned = RandomForestRegressor(criterion="mae", max_features="sqrt", min_samples_split = 9, n_estimators=50)
randomforest_tuned.fit(x_train, y_train)

RandomForestRegressor(criterion='mae', max_features='sqrt', min_samples_split=9,
                      n_estimators=50)

#### Stats Model

In [10]:
import statsmodels.api as sm

model = sm.OLS(randomforest_tuned.predict(x_train), x_train)
model = model.fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.977
Model:,OLS,Adj. R-squared (uncentered):,0.976
Method:,Least Squares,F-statistic:,2336.0
Date:,"Sun, 18 Oct 2020",Prob (F-statistic):,4.43e-136
Time:,21:32:45,Log-Likelihood:,-116.91
No. Observations:,170,AIC:,239.8
Df Residuals:,167,BIC:,249.2
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
total_bill,0.0801,0.005,16.597,0.000,0.071,0.090
size,0.4135,0.044,9.379,0.000,0.326,0.500
smoker_No,0.4175,0.074,5.605,0.000,0.270,0.565

0,1,2,3
Omnibus:,3.163,Durbin-Watson:,1.775
Prob(Omnibus):,0.206,Jarque-Bera (JB):,3.499
Skew:,-0.033,Prob(JB):,0.174
Kurtosis:,3.7,Cond. No.,47.1
