In [1]:
# Importing the packages
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression

In [2]:
# Reading the dataset from a csv
dataset = pd.read_csv('kc_house_data.csv')
dataset.head(5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [3]:
# Having a summary about the dataset
dataset.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,21613.0,4580302000.0,2876566000.0,1000102.0,2123049000.0,3904930000.0,7308900000.0,9900000000.0
price,21613.0,540088.1,367127.2,75000.0,321950.0,450000.0,645000.0,7700000.0
bedrooms,21613.0,3.370842,0.9300618,0.0,3.0,3.0,4.0,33.0
bathrooms,21613.0,2.114757,0.7701632,0.0,1.75,2.25,2.5,8.0
sqft_living,21613.0,2079.9,918.4409,290.0,1427.0,1910.0,2550.0,13540.0
sqft_lot,21613.0,15106.97,41420.51,520.0,5040.0,7618.0,10688.0,1651359.0
floors,21613.0,1.494309,0.5399889,1.0,1.0,1.5,2.0,3.5
waterfront,21613.0,0.007541757,0.0865172,0.0,0.0,0.0,0.0,1.0
view,21613.0,0.2343034,0.7663176,0.0,0.0,0.0,0.0,4.0
condition,21613.0,3.40943,0.650743,1.0,3.0,3.0,4.0,5.0


In [4]:
# Separating only the predictive labels (X)
col = 'price'
X = dataset.loc[:, dataset.columns !=col]
X.head(5)

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [5]:
# Separating the target label (Y)
Y = dataset[col]
Y.head(5)

0    221900.0
1    538000.0
2    180000.0
3    604000.0
4    510000.0
Name: price, dtype: float64

In [6]:
# Droping the labels that doesn't matter to the predictive model
X = X.drop(['id','date'], axis = 1)
X.head(5)

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [7]:
# There is no missing values present in the dataset
X.isna().sum()

bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [8]:
# Getting the train and test subsets
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

In [9]:
# Making the first predictive model with linear regression
lnr = LinearRegression()

lnr.fit(x_train, y_train)

y_pred = lnr.predict(x_test)


In [10]:
# Getting the predictive model score
score = r2_score(y_test, y_pred)
score

0.6965104310298178

In [11]:
# The yr_renovated label has more 0 values than others, then we will remove it from the predictive model
dataset.yr_renovated.value_counts().head()

0       20699
2014       91
2013       37
2003       36
2005       35
Name: yr_renovated, dtype: int64

In [12]:
# sqft_basemente have the same problem, so we'll drop it
dataset.sqft_basement.value_counts().head()

0      13126
600      221
700      218
500      214
800      206
Name: sqft_basement, dtype: int64

In [13]:
# Dropping the sqft_basement and yr_renovated labels
X = X.drop(['sqft_basement','yr_renovated'], axis=1)
X.head(5)

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,yr_built,zipcode,lat,long,sqft_living15,sqft_lot15
0,3,1.0,1180,5650,1.0,0,0,3,7,1180,1955,98178,47.5112,-122.257,1340,5650
1,3,2.25,2570,7242,2.0,0,0,3,7,2170,1951,98125,47.721,-122.319,1690,7639
2,2,1.0,770,10000,1.0,0,0,3,6,770,1933,98028,47.7379,-122.233,2720,8062
3,4,3.0,1960,5000,1.0,0,0,5,7,1050,1965,98136,47.5208,-122.393,1360,5000
4,3,2.0,1680,8080,1.0,0,0,3,8,1680,1987,98074,47.6168,-122.045,1800,7503


In [14]:
# Splitting the values into train and test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3)

# Making the regression model and the predictions
lnr_v2 = LinearRegression()
lnr_v2.fit(x_train, y_train)
y_predv2 = lnr_v2.predict(x_test)

score_v2 = r2_score(y_test, y_predv2)


In [15]:
# An overview about the coeficients
lnr_v2.coef_

array([-3.71812923e+04,  4.19498868e+04,  1.48091232e+02,  1.53253888e-01,
        1.02884245e+04,  5.44352928e+05,  4.87616517e+04,  2.35165472e+04,
        9.98761319e+04,  2.21961313e+01, -2.75720645e+03, -5.71370678e+02,
        5.94382793e+05, -1.98557518e+05,  2.48033346e+01, -3.76334136e-01])

In [16]:
# Normalize
x_normalized = normalize(X, axis = 0)
x_normalized

# Get the train and test subsets
x_train,x_test,y_train,y_test = train_test_split(x_normalized, Y, test_size=0.3)

# Construct the model, fit and predict
lnr_v3 = LinearRegression()
lnr_v3.fit(x_train, y_train)
y_predv3 = lnr_v3.predict(x_test)

# Getting the score by R2
score_v3 = r2_score(y_test, y_predv3)
score_v3

0.6890911033763046

In [17]:
# There is a bit difference between the second and third model coeficients
lnr_v3.coef_

array([-1.99078558e+07,  1.66868342e+07,  5.29930280e+07,  7.14103007e+05,
        9.79486865e+04,  7.85095701e+06,  6.61235858e+06,  1.16697405e+07,
        1.07446498e+08,  1.10343583e+07, -8.24868162e+08, -8.67508259e+09,
        4.18747390e+09, -3.81788773e+09,  3.77917919e+06, -1.93500099e+06])

In [18]:
# Dropping more labels
X = X.drop(['zipcode','lat','long'], axis=1)
X

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,yr_built,sqft_living15,sqft_lot15
0,3,1.00,1180,5650,1.0,0,0,3,7,1180,1955,1340,5650
1,3,2.25,2570,7242,2.0,0,0,3,7,2170,1951,1690,7639
2,2,1.00,770,10000,1.0,0,0,3,6,770,1933,2720,8062
3,4,3.00,1960,5000,1.0,0,0,5,7,1050,1965,1360,5000
4,3,2.00,1680,8080,1.0,0,0,3,8,1680,1987,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,3,2.50,1530,1131,3.0,0,0,3,8,1530,2009,1530,1509
21609,4,2.50,2310,5813,2.0,0,0,3,8,2310,2014,1830,7200
21610,2,0.75,1020,1350,2.0,0,0,3,7,1020,2009,1020,2007
21611,3,2.50,1600,2388,2.0,0,0,3,8,1600,2004,1410,1287


In [19]:
# Get the train and test subsets
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

# Constructing the model and make predictions
lnr_v4 = LinearRegression()
lnr_v4.fit(x_train,y_train)
y_predv4 = lnr_v4.predict(x_test)

# Saving the score R2
score_v4 = r2_score(y_test, y_predv4)

In [20]:
# Excluding the id and date labels to use a scaler
sx = dataset.loc[:, dataset.columns !=col]
sx = sx.drop(['id','date'], axis=1)

# Instantiating and scalling the dataset
scaler = StandardScaler()
X_scalled = scaler.fit_transform(sx)
X_scalled

array([[-0.39873715, -1.44746357, -0.97983502, ..., -0.30607896,
        -0.9433552 , -0.26071541],
       [-0.39873715,  0.1756067 ,  0.53363434, ..., -0.74634143,
        -0.43268619, -0.18786773],
       [-1.47395936, -1.44746357, -1.42625404, ..., -0.13565477,
         1.07013975, -0.17237524],
       ...,
       [-1.47395936, -1.77207762, -1.15404732, ..., -0.60432128,
        -1.41025258, -0.39414129],
       [-0.39873715,  0.50022075, -0.52252773, ...,  1.02891048,
        -0.8412214 , -0.42051149],
       [-1.47395936, -1.77207762, -1.15404732, ..., -0.60432128,
        -1.41025258, -0.41794772]])

In [21]:
# Get the train and test subsets
x_train, x_test, y_train, y_test = train_test_split(X_scalled, Y, test_size=0.3)

# Constructing the model and make predictions
lnr_v5 = LinearRegression()
lnr_v5.fit(x_train, y_train)
y_predv5 = lnr_v5.predict(x_test)

# Saving the score R2
score_v5 = r2_score(y_test, y_predv5)


In [22]:
# All the scores
print(f"""
Score V1 with all variables: {score*100:.2f}%
Score V2 with less 2 variables: {score_v2*100:.2f}%
Score V3 with normalized variables: {score_v3*100:.2f}%
Score V4 with less 3 variables: {score_v4*100:.2f}%
Score V5 with scalled variables: {score_v5*100:.2f}%
""")


Score V1 with all variables: 69.65%
Score V2 with less 2 variables: 69.98%
Score V3 with normalized variables: 68.91%
Score V4 with less 3 variables: 67.22%
Score V5 with scalled variables: 70.06%



In [23]:
(score+score_v2+score_v3+score_v4+score_v4)/5

0.6859657309670272

**Conclusion**: Using only the Linear Regression function we had an accuracy average nearby 68%, the objective of this experiment was check how good we can predict values only using the base variables and only the basic Linear Regression from Scikit Learn package. 