In [1]:
#imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error

import statsmodels.api as sm

from scipy.stats.stats import pearsonr

# import datasets
df = pd.read_csv('../Data/train.csv')
test = pd.read_csv('../Data/test.csv')

df['TotalSF'] = df['GrLivArea'] + df['TotalBsmtSF']
df_reg = df[['TotalSF','YearRemodAdd','GarageArea','FullBath']].fillna(0)
y = df['SalePrice']

reg = LinearRegression()
poly = PolynomialFeatures(4)

x_train, x_test, y_train, y_test = train_test_split(df_reg, y, test_size = 0.3, random_state = 0)

In [2]:
x = poly.fit_transform(x_train)
x = sm.add_constant(x)
model = sm.OLS(y_train,x)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.853
Model:                            OLS   Adj. R-squared:                  0.845
Method:                 Least Squares   F-statistic:                     104.3
Date:                Sun, 05 Dec 2021   Prob (F-statistic):               0.00
Time:                        20:24:20   Log-Likelihood:                -11982.
No. Observations:                1022   AIC:                         2.407e+04
Df Residuals:                     967   BIC:                         2.435e+04
Df Model:                          54                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.3408      0.305      1.117      0.2

In [3]:
coefficients = results.pvalues
print(coefficients)

const    0.264248
x1       0.272108
x2       0.271897
x3       0.270516
x4       0.271439
           ...   
x65      0.278452
x66      0.662723
x67      0.510133
x68      0.767481
x69      0.310159
Length: 70, dtype: float64


In [4]:
coeff = []
index = []
for j,i in enumerate(results.pvalues):
    if i < 0.05:
        coeff.append(results.params[j])
        index.append(j)
print(coeff, index)

[59.554939169212744, -0.06223418223186346, -7.3728477437553765e-09, 2.1901825468849578e-05, 1.624187346383936e-05] [5, 16, 35, 38, 39]


In [5]:
x_test = poly.fit_transform(x_test)
print(x_test)

[[1.000000e+00 4.550000e+03 1.975000e+03 ... 2.108304e+06 1.306800e+04
  8.100000e+01]
 [1.000000e+00 2.384000e+03 1.950000e+03 ... 5.760000e+04 2.400000e+02
  1.000000e+00]
 [1.000000e+00 1.912000e+03 1.950000e+03 ... 1.239040e+05 3.520000e+02
  1.000000e+00]
 ...
 [1.000000e+00 4.222000e+03 1.995000e+03 ... 2.815684e+06 6.712000e+03
  1.600000e+01]
 [1.000000e+00 3.146000e+03 2.002000e+03 ... 1.183744e+06 4.352000e+03
  1.600000e+01]
 [1.000000e+00 4.694000e+03 2.010000e+03 ... 2.689600e+06 6.560000e+03
  1.600000e+01]]


In [6]:
def predictions (n, index, coeff):
    prediction = 0
    for i, j in enumerate(coeff):
        prediction += n[index[i]]*j
    return prediction

In [7]:
x_test = poly.fit_transform(x_test)
pred = []
for n in x_test:
    pred.append(predictions(n,index,coeff))
print(pred)

[-4859117290.372242, -771791789.4777335, -381002485.948647, -1647676924.5273867, -238608047.3626954, -478952746.82106054, -1171719827.1466105, -503429495.7472635, -14402226139.127632, -233461341.0843738, -1027981583.9740729, -1920836317.9919183, -1336188402.4227562, -306892883.326597, -577235668.2149428, -1012383952.6315792, -952037100.3333905, -482959982.9395792, -833794044.3494742, -1200711927.654025, -561266292.9112245, -258648758.6726893, -221319524.4854012, -568936838.812831, -1078798102.3426454, -993984585.9361434, -664055214.3354913, -124459215.10743482, -1777370636.2838256, -471234535.1698039, -1621747176.0675635, -841425321.2525202, -302821188.932112, -1639805039.137912, -3010392912.2040925, -548507747.1413692, -1522227415.3928792, -286094855.28833604, -1323676324.0749333, -3177114705.5693274, -1880957765.7353377, -263578013.85714203, -961121159.7754109, -2144962722.0206902, -3427569082.743422, -2249344452.343397, -208652974.60887244, -317086447.660435, -924946906.4425318, -44

In [8]:
print("r^2: %.3f" % r2_score(pred,y_test))

r^2: -0.203


In [9]:
print(np.corrcoef(pred,y_test))

[[ 1.         -0.34126211]
 [-0.34126211  1.        ]]
