# Task 1-I : Linear Models

* We will learn linear regression

In [None]:
###################
## Run this cell ##
###################
import pandas as pd
from sklearn.datasets import load_boston

# Load the Boston housing bunch (feature matrix, target, feature names, description).
# NOTE(review): load_boston is deprecated/removed in recent scikit-learn releases —
# confirm the installed version before a fresh Restart & Run All.
boston = load_boston()

# Build the feature table and attach the target as the 'MEDV' column in one chain.
df = pd.DataFrame(data=boston.data, columns=boston.feature_names).assign(MEDV=boston.target)

print("1 row is not about one house but one town ")
print(boston.DESCR)

1 row is not about one house but one town 
.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property

# Q1. Split the df into training set & test set

1. x : all columns in df except 'MEDV'
2. y : the column 'MEDV' in df (df['MEDV'])
3. variable names :
    * x_train, y_train
    * x_test, y_test
4. train : test = 8 : 2
5. random_state : 2021

Question : Why do we need to prepare a test set?

**Your Answer : to estimate generalization error**

In [None]:
####################
## Your code here ##
####################
from sklearn.model_selection import train_test_split

# Predictors are every column except the target 'MEDV'; the target is 'MEDV' itself.
x_all = df.drop(columns=['MEDV'])
y_all = df['MEDV']

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=2021
)

# Q2. Train linear regression model

1. declare your model as lr

In [None]:
####################
## Your code here ##
####################
from sklearn.linear_model import LinearRegression
# Ordinary least-squares model with default settings, fitted on the training split only.
lr = LinearRegression()
# Left as a bare expression so the cell still displays the fitted estimator.
lr.fit(X=x_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Q3. Make a prediction
1. y_pred_train : prediction on training set
2. y_pred_test : prediction on test set

In [None]:
####################
## Your code here ##
####################
# Predict on both partitions with the fitted model; the order matches the names on the left.
y_pred_train, y_pred_test = [lr.predict(part) for part in (x_train, x_test)]

# Q4. Evaluate the model on the training set & test set

* Use RMSE

Question : If the RMSE is 4, can we say the error of our model is \$4?

**Your Answer : No. An RMSE of 4 means an error of about \$4,000, because the unit of y is \$1,000.**

In [None]:
####################
## Your code here ##
####################
from sklearn.metrics import mean_squared_error as MSE

# RMSE is the square root of the mean squared error, computed per partition.
rmse_train = pow(MSE(y_train, y_pred_train), 0.5)
rmse_test = pow(MSE(y_test, y_pred_test), 0.5)

# Left-pad the split label to 12 characters so the two report lines align.
for split_label, rmse_value in (("training set", rmse_train), ("test set", rmse_test)):
    print(f"RMSE on {split_label:<12} : {rmse_value:.3f}")

RMSE on training set : 4.675
RMSE on test set     : 4.827


# Q5. Complete the equation of the linear regression model

\begin{align}
MEDV = \beta_0 &\ + \beta_1*CRIM + \beta_2*ZN + \beta_3*INDUS + \beta_4*CHAS \\
&+ \beta_5*NOX + \beta_6*RM + \beta_7*AGE + \beta_8*DIS + \beta_9*RAD \\
& + \beta_{10}*TAX + \beta_{11}*PTRATIO + \beta_{12}*B + \beta_{13}*LSTAT
\end{align}

* print $ \beta_0 $ ~ $ \beta_{13} $ with feature(column) name
* example
```
beta_0 for intercept : 21
beta_1 for CRIM : - 12
~~~
beta_13 for LSTAT : -5 
```



In [None]:
####################
## Your code here ##
####################

# Print every fitted parameter as "beta_k for <name> : <value>".
# beta_0 is the intercept; beta_1..beta_p follow the column order of x_train,
# which pairs each coefficient in lr.coef_ with its feature name.
# Building the two lists up front avoids a hard-coded parameter count and
# avoids rewriting the loop index inside the loop body.
param_names = ['Intercept', *x_train.columns]
param_values = [lr.intercept_, *lr.coef_]

for k, (feature_name, coef) in enumerate(zip(param_names, param_values)):
    print(f"beta_{k} for {feature_name} : {coef:.4f}")

beta_0 for Intercept : 35.0744
beta_1 for CRIM : -0.1146
beta_2 for ZN : 0.0532
beta_3 for INDUS : 0.0033
beta_4 for CHAS : 3.5085
beta_5 for NOX : -18.1357
beta_6 for RM : 3.8252
beta_7 for AGE : 0.0111
beta_8 for DIS : -1.5300
beta_9 for RAD : 0.3392
beta_10 for TAX : -0.0119
beta_11 for PTRATIO : -0.8842
beta_12 for B : 0.0095
beta_13 for LSTAT : -0.5782


# Q6. Analyze the effect of 'RM' (average number of rooms per dwelling)

**assumption1 : all other features are held fixed.**<br>
**assumption2 : use the training set to analyze.**
1. How does the 'MEDV(house price)' change when 'RM' increases by 1 ?
2. What is the change in the 'MEDV' due to the standard deviation(std) of 'RM'
    * hint : beta_6 * std('RM')
    * you can regard std('RM') as the typical variability of 'RM' ( roughly )
3. What is the change in the 'MEDV' due to the maximum change of 'RM'
    * hint : maximum change of 'RM' = max('RM') - min('RM')


In [None]:
####################
## Your code here ##
####################
# Effect of 'RM' on the predicted 'MEDV', with the other features held fixed.
# The coefficient is looked up by column name rather than by a hard-coded
# position, so it stays correct if the column order of x_train changes.
rm_coef = lr.coef_[x_train.columns.get_loc('RM')]

std_rm = x_train['RM'].std()
max_ch_rm = x_train['RM'].max() - x_train['RM'].min()

print(f"A1 : {rm_coef:.3f}")            # change in MEDV when RM increases by 1
print(f"A2 : {rm_coef*std_rm:.3f}")     # change in MEDV over one std of RM
# The third answer was previously mislabelled "A2".
print(f"A3 : {rm_coef*max_ch_rm:.3f}")  # change in MEDV over the observed range of RM

A1 : 3.825
A2 : 2.696
A2 : 19.754


# Q7. Analyze the effect of 'NOX' ( nitric oxides concentration (parts per 10 million) )

**assumption1 : all other features are held fixed.**<br>
**assumption2 : use the training set to analyze.**
1. How does the 'MEDV(house price)' change when 'NOX' increases by 1 ?
2. Can 'NOX' change by 1 in the data?
3. What is the change in the 'MEDV' due to the standard deviation(std) of 'NOX'
4. What is the change in the 'MEDV' due to the maximum change of 'NOX'


In [None]:
####################
## Your code here ##
####################
# Effect of 'NOX' on the predicted 'MEDV', with the other features held fixed.
nox_values = x_train['NOX']
# Position 4 in lr.coef_ is used as the NOX coefficient (beta_5 in the Q5 listing).
nox_coef = lr.coef_[4]

std_nx = nox_values.std()
max_ch_nx = nox_values.max() - nox_values.min()

# One print call, one answer per line.
print(
    f"A1 : {nox_coef:.3f}",
    f"A2 : No, maximum change(range) of NOX : {max_ch_nx}",
    f"A3 : {nox_coef*std_nx:.3f}",
    f"A4 : {nox_coef*max_ch_nx:.3f}",
    sep="\n",
)

A1 : -18.136
A2 : No, maximum change(range) of NOX : 0.486
A3 : -2.168
A4 : -8.814


# Q8. Analyze the intercept
**assumption1 : use the training set to analyze.**
1. What is the expected mean value of 'MEDV' in \$ when all features(x) have no effect?
    * you can regard the intercept as the default value of 'MEDV' ( roughly )
    * be careful : in \$, not in \$1,000
2. Can all features(x) be zero in the data?
    * Can all features(x) have no effect?


In [None]:
####################
## Your code here ##
####################

# The question asks for dollars, so scale the intercept by 1000 before printing.
intercept_usd = lr.intercept_ * 1000
print("A1 : ${:.3f}".format(intercept_usd))

print("A2 : Impossible, just two features can be zero")

# Per-feature minimum on the training set, shown as evidence for A2.
print(x_train.min())

A1 : $35074.446
A2 : Impossible, just two features can be zero
CRIM         0.00906
ZN           0.00000
INDUS        0.46000
CHAS         0.00000
NOX          0.38500
RM           3.56100
AGE          2.90000
DIS          1.12960
RAD          1.00000
TAX        187.00000
PTRATIO     12.60000
B            0.32000
LSTAT        1.92000
dtype: float64
