In [1]:
import numpy as np
import pandas as pd

$$ \hat{\beta} = (X^T X)^{-1}X^T y$$ 

$$ \hat{y} = X\hat{\beta} $$

$$ R^2 = 1 - \frac{SSE}{SST} $$

$$ MSE = \frac{\displaystyle\sum_{i=1}^n (y_i - \hat{y}_i)^2}{n-p-1} $$

$$ R^2_{pred} = 1 - \frac{SSE_{pred}}{SST_{pred}}$$ 

$$ MAE = \frac{1}{n}\displaystyle\sum_{i=1}^n|y_i - \hat{y}_i| $$

$$ MAPE = \frac{1}{n}\displaystyle\sum_{i=1}^n\frac{|y_i - \hat{y}_i|}{|y_i|} $$

$$ RMSE = \sqrt{\frac{1}{n}\displaystyle\sum_{i=1}^n (y_i - \hat{y}_i)^2} $$

In [2]:
def _with_intercept(predictors):
    """Return the predictors as a float matrix with a leading column of ones."""
    values = predictors.to_numpy(dtype=float)
    return np.column_stack([np.ones(len(values)), values])


def _fit_ols(design, response):
    """Return the OLS coefficients b solving the normal equations (X'X) b = X'y."""
    # solve() gives the same b as inv(X'X) @ X'y without forming the explicit inverse.
    return np.linalg.solve(design.T @ design, design.T @ response)


def multi_reg_model_pred_performance():
    """Interactively fit a multiple linear regression and report its performance.

    Prompts (in this order) for: the training file, the test file, the field
    separator (1 = space, anything else = comma), whether the files have a
    header row, the response column (name with a header, 1-based position
    without), and the output file name.

    The model is fitted on the training data by ordinary least squares with an
    intercept. The report written to the output file contains the coefficients,
    the training R-square and MSE (SSE / (n - p - 1)), and the test-set
    predictive R-square, MAE, MAPE and RMSE. The saved report is then echoed
    to standard output.

    Returns:
        None.

    Raises:
        ValueError: if the separator choice or column position is not an
            integer, or the column position is outside 1..number of columns.
        KeyError: if the given response column name is not in the data.
        numpy.linalg.LinAlgError: if X'X is exactly singular.
    """
    # input data file names
    train_data = input("Enter the name of training data file [(ex) boston_tr.csv] : ")
    test_data = input("Enter the name of test data file [(ex) boston_tst.csv] : ")

    # data separator shared by both files
    coding_fm = int(input("Select the data Seperator(1 = ' ' or 2 = ','): "))
    separator_fm = " " if coding_fm == 1 else ","

    # header flag; tolerate upper case and surrounding blanks
    header = input("Does the data have column header? (y/n) : ")
    if header.strip().lower() in ("y", "yes"):
        trdata = pd.read_csv(train_data, sep=separator_fm)
        tstdata = pd.read_csv(test_data, sep=separator_fm)
        res_pos = input(f'Enter the column name of response variable among {list(trdata.columns)} : ')

        y_tr = trdata[res_pos]
        X_tr = trdata.drop(res_pos, axis=1)
        y_tst = tstdata[res_pos]
        X_tst = tstdata.drop(res_pos, axis=1)
    else:
        trdata = pd.read_csv(train_data, sep=separator_fm, header=None)
        tstdata = pd.read_csv(test_data, sep=separator_fm, header=None)
        n_columns = trdata.shape[1]
        res_pos = int(input(f'Enter the column position of the response variable : \n [from 1 to {trdata.shape[1]}] : '))
        # Position 0 or a negative value would index from the end for the
        # response while leaving that same column among the predictors.
        if not 1 <= res_pos <= n_columns:
            raise ValueError(
                f'response column position must be between 1 and {n_columns}, got {res_pos}'
            )
        X_index = [i for i in range(n_columns) if i != res_pos - 1]

        y_tr = trdata.iloc[:, res_pos - 1]
        X_tr = trdata.iloc[:, X_index]
        y_tst = tstdata.iloc[:, res_pos - 1]
        X_tst = tstdata.iloc[:, X_index]

    # name of the output file
    out_name = input("Enter the output file name to export [(ex) result.txt] : ")

    # Design matrices with the intercept column first. Test predictors are
    # matched to the training predictors by position, not by name.
    design_tr = _with_intercept(X_tr)
    design_tst = _with_intercept(X_tst)
    y_tr = y_tr.to_numpy(dtype=float)
    y_tst = y_tst.to_numpy(dtype=float)

    beta_hat = _fit_ols(design_tr, y_tr)

    # training fit: R-square and MSE
    resid = y_tr - design_tr @ beta_hat
    sse = (resid ** 2).sum()
    sst = ((y_tr - y_tr.mean()) ** 2).sum()
    r_sq = 1 - sse / sst
    # design_tr.shape[1] counts the intercept, so this is n - p - 1
    MSE = sse / (len(resid) - design_tr.shape[1])

    # test-set prediction performance
    resid_pred = y_tst - design_tst @ beta_hat
    sse_pred = (resid_pred ** 2).sum()
    sst_pred = ((y_tst - y_tst.mean()) ** 2).sum()
    pred_r_sq = 1 - sse_pred / sst_pred
    MAE = np.abs(resid_pred).sum() / len(resid_pred)
    # NOTE: a zero in the test response makes MAPE infinite or NaN.
    MAPE = (np.abs(resid_pred) / np.abs(y_tst)).sum() / len(resid_pred)
    RMSE = np.sqrt(sse_pred / len(resid_pred))

    # assemble the report; a blank line separates the three sections
    report_lines = [
        'Coefficients',
        '-------------',
        f'Constant: {round(beta_hat[0],3)}',
    ]
    report_lines += [
        f'Beta {i} : {round(beta_hat[i],3)}' for i in range(1, len(beta_hat))
    ]
    report_lines += [
        '',
        'Model Summary',
        '-------------',
        f'R-square = {round(r_sq,4)}',
        f'MSE = {round(MSE,3)}',
        '',
        'Prediction Performance',
        '------------------------',
        f'Predictive R_square = {round(pred_r_sq,4)}',
        f'MAE = {round(MAE,3)}',
        f'MAPE = {round(MAPE,3)}',
        f'RMSE = {round(RMSE,3)}',
    ]

    # the context manager closes the file even if the write fails
    with open(out_name, 'w') as f:
        f.write('\n'.join(report_lines) + '\n')

    print('------------------------')
    print('file has been saved')
    print('result is\n\n')

    # Echo the saved file; lines already end in '\n', so suppress print's own.
    with open(out_name, 'r') as r:
        for line in r:
            print(line, end='')

In [4]:
# Run the interactive workflow: it prompts for the data files, separator,
# header flag, response column and output file name, then prints the saved report.
multi_reg_model_pred_performance()

Enter the name of training data file [(ex) boston_tr.csv] :  boston_tr.csv
Enter the name of test data file [(ex) boston_tst.csv] :  boston_tst.csv
Select the data Seperator(1 = ' ' or 2 = ','):  2
Does the data have column header? (y/n) :  y
Enter the column name of response variable among ['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'black', 'lstat', 'medv'] :  medv
Enter the output file name to export [(ex) result.txt] :  result.txt


------------------------
file has been saved
result is


Coefficients

-------------

Constant: 23.685

Beta 1 : -0.074

Beta 2 : 0.03

Beta 3 : -0.075

Beta 4 : 1.109

Beta 5 : -5.275

Beta 6 : 4.001

Beta 7 : -0.036

Beta 8 : -1.08

Beta 9 : -0.005

Beta 10 : -0.677

Beta 11 : 0.007

Beta 12 : -0.365



Model Summary

-------------

R-square = 0.7648

MSE = 14.56

Prediction Performance

------------------------

Predictive R_square = 0.7636

MAE = 2.905

MAPE = 0.158

RMSE = 3.972

