### Multiple linear regression (MLR) 
- https://archive.ics.uci.edu/ml/datasets/student+performance

#### Import libraries  

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.preprocessing import scale
from sklearn.metrics import mean_squared_error

#### Read CSV file

In [None]:
# Load the dataset from a comma-separated file whose first row holds the
# column names, then show its dimensions, column summary and first rows.
csv_in = 'student-por-gp.csv'
df = pd.read_csv(csv_in, sep=',', skiprows=0, header=0)
print(df.shape)
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would emit a stray "None" line after the summary.
df.info()
display(df.head())

#### Separate explanatory variables and objective variable  
説明変数と目的変数を分ける  

In [None]:
# The G3 column is the objective variable (1-D Series); every other column
# is kept as an explanatory variable (2-D DataFrame).
# Alternative: select the explanatory variables by column range, e.g.
#   X = df.loc[:, 'age':'absences']
y = df['G3']
X = df.drop(columns=['G3'])
print(f'X: {X.shape}')
display(X.head())
print(f'y: {y.shape}')
print(y.head())

#### Encoding of categorical variables  

##### Assign integers  
整数を割り当てる  

In [None]:
# Tally how often each distinct 'studytime' value occurs.
studytime_counts = X['studytime'].value_counts()
print(studytime_counts)

In [None]:
# Ordinal encoding of 'studytime': the four labels are replaced by the digit
# strings '1'..'4' in increasing order of study time and the column is then
# cast to an integer dtype. Values not listed in the mapping are left
# untouched by replace(), so re-running this cell is harmless.
studytime_codes = {
    '<2 hours': '1',
    '2 to 5 hours': '2',
    '5 to 10 hours': '3',
    '>10 hours': '4',
}
X['studytime'] = X['studytime'].replace(studytime_codes).astype('int')
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would emit a stray "None" line after the summary.
X.info()
display(X.head())

##### Apply get_dummies()  
ダミー変数化  

In [None]:
# Tally how often each distinct 'reason' value occurs.
reason_counts = X['reason'].value_counts()
print(reason_counts)

In [None]:
# Dummy-encode the categorical columns of X as uint8 indicator columns.
# drop_first=True omits the first level of each variable, which avoids
# perfectly collinear dummy columns.
X_dumm = pd.get_dummies(data=X, dtype='uint8', drop_first=True)
print(f'X_dumm: {X_dumm.shape}')
display(X_dumm.head())

#### MLR calculation without standardization    
標準化なしで線形重回帰分析  

In [None]:
# Ordinary least squares on the unscaled explanatory variables.
# sm.OLS does not add an intercept on its own, so a constant column is
# added to the design matrix first.
X_dumm_c = sm.add_constant(X_dumm)
model = sm.OLS(endog=y, exog=X_dumm_c)
results = model.fit()
print(results.summary())

#### R2 and Adjusted R2    
決定係数と自由度調整済み決定係数  

In [None]:
# Report the coefficient of determination and its degrees-of-freedom
# adjusted counterpart for the unscaled fit.
for label, value in (('R2:', results.rsquared),
                     ('Adj R2:', results.rsquared_adj)):
    print(label, value)

#### Partial regression coefficients  
偏回帰係数  

In [None]:
# Estimated partial regression coefficients of the fitted model.
coefficients = results.params
print(coefficients)

#### MLR calculation with standardization  
全説明変数と目的変数を標準化して線形重回帰分析  

In [None]:
# Standardize all explanatory variables and the objective variable.
# The results are kept as arrays here (hence the "_ar" suffix).
X_scaled_ar, y_scaled_ar = scale(X_dumm), scale(y)

In [None]:
# Wrap the standardized arrays back into labelled pandas objects, reusing
# the original column names and the name of the objective variable.
y_scaled = pd.Series(y_scaled_ar, name=y.name)
X_scaled = pd.DataFrame(data=X_scaled_ar, columns=X_dumm.columns)
# No constant column is added for this fit.
model = sm.OLS(endog=y_scaled, exog=X_scaled)
results_scaled = model.fit()
print(results_scaled.summary())

#### Compare standardized partial regression coefficients    
標準化偏回帰係数を比較  

In [None]:
# Rank the standardized coefficients by absolute value, largest first.
std_coefs = results_scaled.params
print(std_coefs.sort_values(ascending=False, key=np.abs))

#### Do prediction with obtained model    
得られたモデルを用いて、予測を行う。  

In [None]:
# Two hand-written example rows; the values are listed in the column order
# of X_dumm, so each row must have exactly one entry per X_dumm column.
example_rows = [
    [18, 4, 0, 0, 1, 1, 2, 0, 0, 0, 1],
    [15, 1, 3, 1, 0, 4, 3, 10, 0, 1, 0],
]
X_test = pd.DataFrame(data=example_rows, columns=X_dumm.columns)
print('X for prediction:')
display(X_test)

In [None]:
# has_constant='add' makes add_constant insert the intercept column even
# when the example rows already contain a column that looks constant.
X_test_c = sm.add_constant(X_test, has_constant='add')
y_test = results.predict(exog=X_test_c)
print('Predicted y:', y_test, sep='\n')

#### (Advanced) Visualizing the fit of the regression model  

In [None]:
# Predictions of the unscaled model for the rows it was fitted on.
y_pred = results.predict(exog=X_dumm_c)
print('Predicted y:', y_pred.head(), sep='\n')

In [None]:
# Common axis range covering both predicted and true values, widened by
# 10% of the span on each side.
y_all = np.append(y_pred, y)
y_lo, y_hi = y_all.min(), y_all.max()
y_margin = (y_hi - y_lo) * 0.1
y_min = y_lo - y_margin
y_max = y_hi + y_margin

In [None]:
# Predicted-vs-true scatter plot with the line y = x as a reference;
# both axes share the same limits and an equal aspect ratio.
ax = plt.gca()
ax.scatter(y_pred, y, alpha=0.3)
ax.plot([y_min, y_max], [y_min, y_max])
ax.set_aspect('equal', adjustable='box')
ax.set_xlim(y_min, y_max)
ax.set_ylim(y_min, y_max)
ax.set_xlabel('Pred')
ax.set_ylabel('True')
plt.show()

In [None]:
# Mean squared error between true and predicted values, and its square root.
mse = mean_squared_error(y_true=y, y_pred=y_pred)
rmse = np.sqrt(mse)
print('MSE, RMSE:', mse, rmse)