In [1]:
#import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Loading the datasets

In [2]:
from sklearn.datasets import load_boston

In [3]:
# Load the Boston housing data; the result is the Bunch object inspected in the next cells.
# NOTE(review): this loader prints the ethics warning captured below, and per the
# scikit-learn release notes it was removed in version 1.2 — confirm the pinned version.
boston = load_boston()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [4]:
# Show which container type the loader returned.
type(boston)

sklearn.utils.Bunch

In [5]:
# List the fields exposed by the loaded dataset object.
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename', 'data_module'])

## Checking the description of datasets

In [None]:
# Print the description text shipped with the dataset (its DESCR field).
print(boston.DESCR)

: 

## Preparation of datasets

In [None]:
# Wrap the raw feature matrix in a DataFrame, using the dataset's
# feature names as column labels.
dataset = pd.DataFrame(
    data=boston.data,
    columns=boston.feature_names,
)

: 

In [None]:
# Preview the first rows of the feature frame.
dataset.head()

: 

## Defining the target

In [None]:
# Append the regression target as a new 'Price' column (it becomes the last column).
dataset['Price'] = boston.target

: 

In [None]:
# Preview the frame again, now including the 'Price' column.
dataset.head()

: 

In [None]:
# Column dtypes and non-null counts for the assembled frame.
dataset.info()

: 

In [None]:
## Summary statistics for every column of the dataset
dataset.describe()

: 

In [None]:
## Count the missing values in each column
dataset.isnull().sum()

: 

In [None]:
### Exploratory Data Analysis
## Pairwise correlation between all columns (features and Price)
dataset.corr()

: 

In [None]:
# Draw a grid of pairwise plots over every column of the dataset.
import seaborn as sns
sns.pairplot(dataset)

: 

In [None]:
# Show the correlation matrix again, directly above the insights that refer to it.
dataset.corr()

: 

## Insights:
### As the correlation matrix above shows, the feature CRIM (crime rate) is negatively correlated with the target Price: as the crime rate in an area increases, the price tends to decrease.

In [None]:
# Scatter of the crime-rate feature (CRIM) against Price.
plt.scatter(
    x=dataset['CRIM'],
    y=dataset['Price'],
)
plt.xlabel("Crime Rate")
plt.ylabel("Price")

: 

In [None]:
# Scatter of the average-rooms feature (RM) against Price.
plt.scatter(dataset['RM'], dataset['Price'])
# The x-axis label previously had an unbalanced parenthesis ("Rooms(Avg.").
plt.xlabel("Rooms (Avg.)")
plt.ylabel("Price")

: 

In [None]:
# Regression plot of the average-rooms feature (RM) against Price.
# x and y are passed by keyword: seaborn 0.12+ accepts only `data` positionally
# (the positional form raises TypeError there and warns on 0.11). This also
# matches the keyword form used by the other regplot cells.
import seaborn as sns
sns.regplot(x='RM', y='Price', data=dataset)
plt.xlabel("Rooms (Avg.)")
plt.ylabel("Price")

: 

In [None]:
# Regression plot of LSTAT against Price.
sns.regplot(data=dataset, x='LSTAT', y='Price')

: 

In [None]:
# Regression plot of CHAS against Price.
sns.regplot(data=dataset, x="CHAS", y="Price")

: 

In [None]:
# Regression plot of PTRATIO against Price.
sns.regplot(data=dataset, x="PTRATIO", y="Price")

: 

In [None]:
## Independent (X) and dependent (y) features:
## every column except the last is a predictor; the last column is the target.
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

: 

In [None]:
# Preview the predictor columns.
X.head()

: 

In [None]:
# Preview the target values.
y.head()

: 

In [None]:
# Train/test split: hold out 30% of the rows for testing.
# The fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

: 

In [None]:
# Display the training predictors produced by the split.
X_train

: 

In [None]:
# Display the held-out test predictors produced by the split.
X_test

: 

In [None]:
# Standardize the datasets: create the scaler here; the following cells fit it
# on the training split and reuse it for the test split.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

: 

In [None]:
# Fit the scaler on the training predictors and replace X_train with the scaled values.
# NOTE: X_train is rebound in place, so re-running this cell without first re-running
# the split would refit the scaler on already-scaled data.
X_train = scaler.fit_transform(X_train)

: 

In [None]:
# Apply the already-fitted scaler to the test predictors (transform only, no refit).
# NOTE: X_test is rebound in place, so re-running this cell alone would scale it twice.
X_test = scaler.transform(X_test)

: 

In [None]:
# Persist the fitted scaler so the same preprocessing can be reapplied at prediction time.
# A context manager guarantees the file is flushed and closed once the dump finishes.
import pickle
with open('scaling.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

## Model Training

In [None]:
from sklearn.linear_model import LinearRegression

: 

In [None]:
# Instantiate the linear regression estimator with its default settings.
regression = LinearRegression()

: 

In [None]:
# Fit the model on the scaled training predictors and their target values.
regression.fit(X_train,y_train)

: 

In [None]:
# Report the fitted parameters: the per-feature coefficients, then the intercept.
print("Coefficient", regression.coef_)
print("Intercept", regression.intercept_)

: 

In [None]:
## Show the estimator settings (constructor parameters) the model was trained with
regression.get_params()

: 

In [None]:
# Predict on the scaled test predictors; reg_pred is reused by the diagnostics below.
reg_pred = regression.predict(X_test)

: 

In [None]:
# Display the predicted values for the test split.
reg_pred

: 

## Assumption

In [None]:
# Actual vs. predicted values on the test split.
plt.scatter(y_test,reg_pred)
plt.xlabel("Actual Testing Values")
plt.ylabel("Predicted Testing Values")

: 

In [None]:
# Residuals: actual test target minus the model's prediction, element-wise.
residual = y_test - reg_pred

: 

In [None]:
# Display the residuals computed above.
residual

: 

In [None]:
# Plot the distribution of the residuals as a kernel density estimate.
sns.displot(residual,kind='kde')

: 

In [None]:
## Scatter plot of the residuals against the predicted values.
## NOTE(review): the original note here read "uniform distribution" — presumably the
## expectation is an evenly scattered cloud with no visible pattern; confirm on the plot.
plt.scatter(reg_pred,residual)

: 

## Performance metrics

In [None]:
# Error metrics on the test split: mean absolute error and mean squared error.
from sklearn.metrics import mean_absolute_error, mean_squared_error

print("MAE", mean_absolute_error(y_test, reg_pred))
print("MSE", mean_squared_error(y_test, reg_pred))

: 

In [None]:
# Root mean squared error: the square root of the MSE on the test split.
print("RMSE",np.sqrt(mean_squared_error(y_test,reg_pred)))

: 

## R-Square & Adjusted R-Square scores

## Formula:
#### R^2 Score = 1 - (SSR/SST)
#### Where
#### R^2 Score=> Coefficient of determination
#### SSR => Sum of squared residuals
#### SST => Total sum of squares

In [None]:
# Coefficient of determination on the test split; `score` is reused for the adjusted R2 below.
from sklearn.metrics import r2_score
score = r2_score(y_test,reg_pred)
print("R2_Score",score)

: 

## Adjusted R2 = 1 - [(1 - R2) * (n - 1) / (n - k - 1)]

#### where:

#### R2: The R2 of the model
#### n: The number of observations
#### k: The number of predictor variables

In [None]:
# Display the adjusted R2 score for the test split, using
# n = len(y_test) observations and k = X_test.shape[1] predictor columns
# in 1 - (1 - R2) * (n - 1) / (n - k - 1).
1-(1-score)*(len(y_test)-1)/(len(y_test)-(X_test.shape[1])-1)

: 

## New Data Prediction

In [None]:
# Take the first row of the raw feature matrix as a sample "new" observation.
boston.data[0]

: 

In [None]:
# Shape of the single row before reshaping.
boston.data[0].shape

: 

In [None]:
# Shape after reshaping to one row with the column count inferred (-1),
# the 2-D layout passed to predict/transform in the following cells.
boston.data[0].reshape(1,-1).shape

: 

In [None]:
# Deliberately predict on the raw, unscaled row to demonstrate the problem discussed
# in the next markdown cell: the model was trained on scaled inputs.
regression.predict(boston.data[0].reshape(1,-1))

: 

## As shown above, the predicted price is negative because the new input values were not scaled. The input must be scaled with the fitted scaler before the prediction is made.

In [None]:
# Scale the sample with the fitted scaler before predicting.
# The row is wrapped in a DataFrame carrying the training column names because the
# scaler was fitted on a DataFrame; a bare ndarray makes scikit-learn emit a
# "X does not have valid feature names" UserWarning. The values are unchanged.
regression.predict(
    scaler.transform(
        pd.DataFrame(boston.data[0].reshape(1, -1), columns=boston.feature_names)
    )
)

: 

## Pickle the model for deployment

In [None]:
import pickle

: 

In [None]:
# Serialize the trained model for deployment.
# A context manager guarantees the file is flushed and closed once the dump finishes.
with open("regmodel.pkl", "wb") as model_file:
    pickle.dump(regression, model_file)

: 

In [None]:
# Reload the pickled model to check that it round-trips.
# A context manager closes the file handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
with open("regmodel.pkl", 'rb') as model_file:
    pickle_model = pickle.load(model_file)

: 

In [None]:
# Predict with the reloaded model on the scaled sample row.
# The row is wrapped in a DataFrame carrying the training column names because the
# scaler was fitted on a DataFrame; a bare ndarray makes scikit-learn emit a
# "X does not have valid feature names" UserWarning. The values are unchanged.
pickle_model.predict(
    scaler.transform(
        pd.DataFrame(boston.data[0].reshape(1, -1), columns=boston.feature_names)
    )
)

: 

: 