In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

from warnings import filterwarnings
filterwarnings("ignore")

In [2]:
url = "https://bilkav.com/odev_tenis.csv"

df = pd.read_csv(url)
df

Unnamed: 0,outlook,temperature,humidity,windy,play
0,sunny,85,85,False,no
1,sunny,80,90,True,no
2,overcast,83,86,False,yes
3,rainy,70,96,False,yes
4,rainy,68,80,False,yes
5,rainy,65,70,True,no
6,overcast,64,65,True,yes
7,sunny,72,95,False,no
8,sunny,69,70,False,yes
9,rainy,75,80,False,yes


In [3]:
label_encoder = preprocessing.LabelEncoder()

columns = ["windy","play"]

for col in columns:
    df[col] = label_encoder.fit_transform(df[col])
    
df.head()

Unnamed: 0,outlook,temperature,humidity,windy,play
0,sunny,85,85,0,0
1,sunny,80,90,1,0
2,overcast,83,86,0,1
3,rainy,70,96,0,1
4,rainy,68,80,0,1


In [4]:
dms = pd.get_dummies(df["outlook"])
dms.head()

Unnamed: 0,overcast,rainy,sunny
0,0,0,1
1,0,0,1
2,1,0,0
3,0,1,0
4,0,1,0


In [5]:
df.drop(["outlook"], axis=1, inplace=True)
df.head()

Unnamed: 0,temperature,humidity,windy,play
0,85,85,0,0
1,80,90,1,0
2,83,86,0,1
3,70,96,0,1
4,68,80,0,1


In [6]:
df = pd.concat([dms, df], axis=1)
df.head()

Unnamed: 0,overcast,rainy,sunny,temperature,humidity,windy,play
0,0,0,1,85,85,0,0
1,0,0,1,80,90,1,0
2,1,0,0,83,86,0,1
3,0,1,0,70,96,0,1
4,0,1,0,68,80,0,1


In [7]:
X = df.drop(["play"], axis=1)
y = df[["play"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [8]:
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

print(
    "MSE >>>", mean_squared_error(y_pred, y_test), "\n"
    "R²  >>>", r2_score(y_pred, y_test),
)

MSE >>> 0.14946437379855324 
R²  >>> -2.7634149688383376


In [9]:
X = np.append(arr = np.ones(14).astype(int), values = X.iloc[:,:])

X_l = df.iloc[:,[0,1,2,3,4,5]].values
X_l = np.array(X_l, dtype=float)

model = sm.OLS(y, X_l).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   play   R-squared:                       0.483
Model:                            OLS   Adj. R-squared:                  0.160
Method:                 Least Squares   F-statistic:                     1.493
Date:                Sun, 25 Apr 2021   Prob (F-statistic):              0.292
Time:                        06:29:00   Log-Likelihood:                -4.9501
No. Observations:                  14   AIC:                             21.90
Df Residuals:                       8   BIC:                             25.73
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1             3.4007      1.859      1.829      0.1

In [10]:
X_l = df.iloc[:,[0,1,2,4,5]].values
X_l = np.array(X_l, dtype=float)

model = sm.OLS(y, X_l).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   play   R-squared:                       0.462
Model:                            OLS   Adj. R-squared:                  0.223
Method:                 Least Squares   F-statistic:                     1.931
Date:                Sun, 25 Apr 2021   Prob (F-statistic):              0.189
Time:                        06:29:00   Log-Likelihood:                -5.2280
No. Observations:                  14   AIC:                             20.46
Df Residuals:                       9   BIC:                             23.65
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1             2.5317      1.021      2.480      0.0

In [11]:
X_l = df.iloc[:,[0,1,2,5]].values
X_l = np.array(X_l, dtype=float)

model = sm.OLS(y, X_l).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   play   R-squared:                       0.345
Model:                            OLS   Adj. R-squared:                  0.148
Method:                 Least Squares   F-statistic:                     1.754
Date:                Sun, 25 Apr 2021   Prob (F-statistic):              0.219
Time:                        06:29:00   Log-Likelihood:                -6.6049
No. Observations:                  14   AIC:                             21.21
Df Residuals:                      10   BIC:                             23.77
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1             1.1471      0.261      4.395      0.0

In [12]:
X = df.drop(["temperature","humidity","play"], axis=1)
y = df[["play"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [13]:
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

print(
    "MSE >>>", mean_squared_error(y_pred, y_test), "\n"
    "R²  >>>", r2_score(y_pred, y_test),
)

MSE >>> 0.2037760416666667 
R²  >>> -3.3271889400921673
