In [None]:
import pandas as pd
import numpy as np

import statsmodels.api as sm 
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg

import matplotlib.pyplot as plt
%matplotlib inline

import patsy
import seaborn as sns

from sklearn import datasets
from sklearn import model_selection 
from sklearn import linear_model

import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [None]:
# Read house.csv (path is relative to the notebook's working directory)
# into a DataFrame and preview its first rows as the cell output.
df = pd.read_csv("house.csv")
df.head()

In [None]:
# Preview the first rows of df. The commented-out calls are alternative
# views (column info / last rows); only the last expression of a cell is shown.
#df.info()
df.head()
#df.tail()

In [None]:
# Summary statistics for the columns of df.
df.describe()

In [None]:
# Correlation heatmap of the columns of df.
# The figure must be created *before* drawing: calling plt.figure() after
# sns.heatmap() opens a second, empty figure and leaves the heatmap at the
# default size. Drawing onto an explicit Axes avoids both problems.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm", linewidths=0.5, ax=ax)
ax.set_title("correlation map")
#fig.savefig("Correlation_map")  # uncomment to export the heatmap
plt.show()

In [None]:
# Interaction features: element-wise products of two column pairs, stored as
# new columns on df (the column names spell out the product they hold).
df["bedroom_count*net_sqm"] = df["net_sqm"].mul(df["bedroom_count"])
df["center_distance*metro_distance"] = df["center_distance"].mul(df["metro_distance"])

In [None]:
# Preview df again to check the frame after the preceding cell.
df.head()

In [None]:
# Ordinary least squares with statsmodels: price regressed on the raw
# features plus the two interaction columns.
feature_columns = [
    "bedroom_count",
    "net_sqm",
    "center_distance",
    "metro_distance",
    "floor",
    "age",
    "bedroom_count*net_sqm",
    "center_distance*metro_distance",
]
y = df["price"]
# add_constant adds the intercept column to the design matrix.
X = sm.add_constant(df[feature_columns])
model = sm.OLS(y, X)
result = model.fit()
print(result.summary())

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, train_size=0.8, random_state=42
)

# Print the split sizes explicitly: a bare expression in the middle of a cell
# is evaluated and then discarded, so the shapes were never actually shown.
print("train:", X_train.shape, y_train.shape)
print("test: ", X_test.shape, y_test.shape)

# NOTE(review): X still carries the constant column added by sm.add_constant.
# LinearRegression fits its own intercept by default, so that column looks
# redundant here (it only adds an extra entry to model.coef_) - confirm
# whether it should be dropped before fitting.
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Print model.score for the training split, then for the test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(model.score(features, target))

In [None]:
def sse(resid):
    """Return the sum of squared errors for the given residuals.

    `resid` must support `** 2` (e.g. a NumPy array or pandas Series);
    the squared values are summed with np.sum.
    """
    squared = resid ** 2
    return np.sum(squared)
# Residuals (actual minus predicted) and their sum of squares, per split.
resid_train = y_train - model.predict(X_train)
sse_train = sse(resid_train)

resid_test = y_test - model.predict(X_test)
sse_test = sse(resid_test)

# Test-set error metrics, computed from the predictions stored in y_pred.
mse, mae = (
    mean_squared_error(y_test, y_pred),
    mean_absolute_error(y_test, y_pred),
)

print("Sum of squared error:", sse_train, sse_test)
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)

In [None]:
def plot_residuals_and_coeff(resid_train, resid_test, coeff):
    """Draw three side-by-side bar charts and return the figure and axes.

    Panels, left to right: training residuals, testing residuals, and
    model coefficients. Each value is drawn as a bar at its positional
    index (0 .. len - 1).

    Parameters
    ----------
    resid_train, resid_test, coeff
        Sized sequences of numbers accepted by ``Axes.bar``.

    Returns
    -------
    (fig, axes)
        The objects returned by ``plt.subplots(1, 3, figsize=(12, 3))``.
    """
    fig, axes = plt.subplots(1, 3, figsize=(12, 3))

    # (values, x label, y label, title); the coefficient panel has no title.
    panels = (
        (resid_train, "sample number", "residual", "training data"),
        (resid_test, "sample number", "residual", "testing data"),
        (coeff, "coefficient number", "coefficient", None),
    )
    for axis, (values, x_label, y_label, title) in zip(axes, panels):
        axis.bar(np.arange(len(values)), values)
        axis.set_xlabel(x_label)
        axis.set_ylabel(y_label)
        if title is not None:
            axis.set_title(title)

    fig.tight_layout()
    #plt.savefig("residualandcoefficient")
    return fig, axes
# Draw the residual and coefficient panels for the estimator currently bound
# to `model`, using the residuals computed in the same cell group.
fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)
plt.show() 

In [None]:
# Scatter of predicted against actual test targets, with a dashed red
# diagonal spanning the range of y_test as the perfect-prediction reference.
pred_fig, pred_ax = plt.subplots(figsize=(8, 6))
pred_ax.scatter(y_test, y_pred, alpha=0.5)

actual_lo, actual_hi = y_test.min(), y_test.max()
pred_ax.plot([actual_lo, actual_hi], [actual_lo, actual_hi], 'r--', lw=2)

pred_ax.set_xlabel('Actual')
pred_ax.set_ylabel('Predicted')
# Legend labels are matched to the artists in drawing order (scatter, line).
pred_ax.legend(['Data Points', 'Prediction Line'], loc='upper left', fontsize=14)
pred_ax.set_title('Actual vs Predicted')
#plt.savefig("actualvpredictedval")
plt.show()

In [None]:
# QQ plot of the residuals of the statsmodels fit held in `result`,
# drawn onto an explicitly created Axes.
fig, ax = plt.subplots(figsize=(8, 4))
smg.qqplot(
    result.resid,
    ax=ax,
    line="s",
)
ax.set_title("QQ Plot", fontweight="bold", fontsize=14)
fig.tight_layout()
#plt.savefig("qqplot")
plt.show()

In [None]:
# Residuals (actual minus predicted) against fitted values on the test split.
residuals = y_test - y_pred

# Keep the Figure/Axes handles from plt.subplots and draw onto that Axes
# explicitly. Previously the handles were discarded and the Axes returned by
# seaborn was stored under the misleading name `fig`.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_pred, y=residuals, ax=ax)
ax.set_xlabel("Fitted Values", fontsize=12, fontweight="bold")
ax.set_ylabel("Residuals", fontsize=12, fontweight="bold")
ax.set_title("Fitted Values v. Residuals", fontsize=14, fontweight="bold")
ax.axhline(0)  # zero-residual reference line
#fig.savefig("fitvresiduals")
plt.show()

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with alpha=150.0 fitted on the training split.
# This rebinds the notebook-level name `model`.
model = Ridge(alpha=150.0).fit(X_train, y_train)

# Cell output: (training score, test score).
(model.score(X_train, y_train), model.score(X_test, y_test))

In [None]:
from sklearn.linear_model import Lasso

# Lasso regression with alpha=100.0 fitted on the training split.
# This rebinds the notebook-level name `model`.
model = Lasso(alpha=100.0).fit(X_train, y_train)

# Cell output: (training score, test score).
(model.score(X_train, y_train), model.score(X_test, y_test))

In [None]:
from sklearn.linear_model import ElasticNetCV

# Elastic net with the regularisation strength chosen by 2-fold
# cross-validation on the training split. Rebinds the notebook-level `model`.
model = ElasticNetCV(cv=2)
model.fit(X_train, y_train)

# Print the selected hyper-parameters: bare expressions in the middle of a
# cell are evaluated and discarded, so these values were never displayed.
print("alpha:", model.alpha_)
print("l1_ratio:", model.l1_ratio_)

# Cell output: (training score, test score).
model.score(X_train, y_train), model.score(X_test, y_test)