### Multiple Linear Regression with many models - Housing Dataset

In [1]:
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Read the housing data from the CSV next to the notebook and preview the first rows.
data = pd.read_csv("./Housing.csv")
data.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

In [None]:
# Print pandas' summary of the frame (column names, non-null counts, dtypes).
data.info()

In [None]:
# Descriptive statistics for the frame's columns.
data.describe()

In [None]:
# Share of missing values per column, expressed as a percentage.
data.isna().mean().mul(100)

In [None]:
# One histogram per column, each shown as its own figure.
for column_name in data:
    column_values = data[column_name]
    sns.histplot(x=column_values)
    plt.show()

In [None]:
# Box plots describe numeric distributions, so only the numeric columns are
# plotted; the yes/no flags and furnishingstatus are text and are skipped.
for column_name in data.select_dtypes(include="number").columns:
    sns.boxplot(y=data[column_name])
    plt.show()

In [None]:
# A kernel density estimate needs numeric input, so only the numeric columns
# are plotted; the yes/no flags and furnishingstatus are text and are skipped.
for column_name in data.select_dtypes(include="number").columns:
    sns.kdeplot(x=data[column_name])
    plt.show()

### Correlation 

In [None]:
# Correlation of each numeric column with the target ("price"), strongest first.
# Text columns (yes/no flags, furnishingstatus) are excluded up front because a
# correlation cannot be computed for them.
price_corr = (
    data.select_dtypes(include="number")
    .corr()[["price"]]
    .sort_values(by="price", ascending=False)
)
sns.heatmap(price_corr, vmin=-1, vmax=1, annot=True, cmap="Blues");

In [None]:
# List the column labels of the frame.
data.columns

In [None]:
# Target: the sale price. Features: every other column.
# The yes/no flags and furnishingstatus are text, so they are one-hot encoded
# (dropping the first level of each to avoid redundant, perfectly collinear
# columns) so that the scaler and the regressors receive purely numeric input.
# NOTE(review): the preview shows an "Unnamed: 0" header, which may just be the
# displayed index; it is dropped only if it exists as a real column.
feature_frame = pd.get_dummies(
    data.drop(columns=["price", "Unnamed: 0"], errors="ignore"),
    drop_first=True,
    dtype=float,
)
x = feature_frame.values
y = data["price"].values

In [None]:
# Scaler used below to standardise the feature matrix.
sc = StandardScaler()

In [None]:
# Fit the scaler on the feature matrix and replace x with the scaled values.
# NOTE(review): the scaler is fitted on every row before the train/test split
# further down, so test-set statistics influence the scaling — consider fitting
# on the training split only.
x = sc.fit_transform(x)

#### Splitting the data into training and testing data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Size of the training feature matrix.
x_train.shape

In [None]:
# Size of the held-out test feature matrix.
x_test.shape

#### Model building

In [None]:
# Baseline model: ordinary least-squares linear regression.
model = LinearRegression()

In [None]:
# Train the baseline model on the training split only.
model.fit(x_train, y_train)

#### Predicting

In [None]:
# Predictions for the held-out test rows (one value per row of x_test).
y_pred = model.predict(x_test)

#### Visualizing the results

In [None]:
# Pair each held-out target with its prediction. y_pred was computed from
# x_test, so it has to be compared against y_test; the full y has a different
# length and row order, which would misalign every pair.
OUTPUT = pd.DataFrame({'ACTUAL': y_test, 'PREDICTION': y_pred}, dtype=float)
OUTPUT.head()

In [None]:
# Predicted vs. actual values with a fitted regression line.
sns.regplot(x='ACTUAL', y='PREDICTION', data=OUTPUT)
plt.show()

#### These are very bad results. How can we improve them?
* Let's choose only the columns with good correlation
* Let's also compare several regression models using 10-fold cross-validation

In [None]:
# Candidate regressors to compare, keyed by display name.
# Every estimator with a stochastic component gets the same fixed seed so that
# repeated runs of the notebook produce the same scores.
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "LinearSVR": LinearSVR(random_state=0),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=0),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0),
    "AdaBoostRegressor": AdaBoostRegressor(random_state=0),
    "RandomForestRegressor": RandomForestRegressor(random_state=0),
}

In [None]:
# Compare the candidate models by average RMSE over 10 cross-validation folds.
# The data preview shows rows ordered by descending price — presumably the whole
# file is sorted that way (confirm). Passing cv=10 alone would use contiguous,
# unshuffled folds, so each fold would be scored on a price range missing from
# its training data. Shuffled folds with a fixed seed avoid that.
cv_folds = KFold(n_splits=10, shuffle=True, random_state=0)

# The loop variable is named `estimator` so it does not overwrite the fitted
# baseline stored in `model`.
for name, estimator in models.items():
    scores = cross_val_score(
        estimator, x, y, scoring="neg_mean_squared_error", cv=cv_folds, n_jobs=-1
    )
    print("cross validation model : {}".format(name))
    rmse = np.sqrt(-scores)  # scores are negated MSE values, one per fold
    rmse_average = np.mean(rmse)
    print("AVERAGE RMSE: ",rmse_average)
    print("*"*100)

#### Selecting GradientBoostingRegressor and doing regression

In [None]:
# Final model. The fixed seed makes the fitted trees, and therefore the
# predictions, reproducible across runs.
GBR = GradientBoostingRegressor(random_state=0)

In [None]:
# Fit on the training split only, then predict the held-out rows so the plot
# reflects performance on data the model has not seen (predicting on the full x
# would mix in the rows the model was trained on).
GBR.fit(x_train, y_train)
y_pred = GBR.predict(x_test)

# Actual vs. predicted house prices for the test split.
sns.scatterplot(x=y_test, y=y_pred)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual VS Prediction')
plt.show()