## Multiple linear regression analysis

### This script contains the following:

#### 01. Import libraries and data
#### 02. Clean data
#### 03. Data prep for regression analysis
#### 04. Regression analysis
#### 05. Evaluate performance after removing outliers 

### Note: The top-performing model, seen below, incorporated kitchen, district pop, state pop, picture count, popdensity, balcony, lift and living space to predict total rent with an R-squared of 72 and mean absolute error of 182. 
### The persistence of the large error and relatively low r-squared score suggests that multiple linear regression isn't the best approach for this dataset.

### 1. Import libraries and data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# This option ensures that the graphs you create are displayed within the notebook without the need to "call" them specifically.

%matplotlib inline

In [3]:
path = r'C:\Users\jacym\Desktop\Career Foundry projects\german rent\02 data'

In [4]:
df = pd.read_csv(os.path.join(path, 'cleaned data', 'rent_district_pop_merged.csv'))

### 2. Data cleaning

In [5]:
df.columns

Index(['scoutId', 'regio1', 'baseRent', 'totalRent', 'pricetrend',
       'livingSpace', 'plz', 'picturecount', 'newlyConst', 'yearConstructed',
       'firingTypes', 'heatingType', 'hasKitchen', 'cellar', 'condition',
       'street', 'lift', 'typeOfFlat', 'noRooms', 'floor', 'balcony', 'garden',
       'baseRentRange', 'noRoomsRange', 'livingSpaceRange',
       'yearConstructedRange', 'regio2', 'regio3', 'description', 'date',
       'population', 'populationTrend', 'popTrendCat', 'surfaceArea',
       'populationDensity', 'space/person', 'people/apt', 'districtPopTrend',
       'districtPop'],
      dtype='object')

In [7]:
df.head(20)

Unnamed: 0,scoutId,regio1,baseRent,totalRent,pricetrend,livingSpace,plz,picturecount,newlyConst,yearConstructed,...,population,populationTrend,popTrendCat,surfaceArea,populationDensity,space/person,people/apt,districtPopTrend,districtPop,price/unit
0,96107057,Nordrhein_Westfalen,595.0,840.0,4.62,86.0,44269,6,False,1965.0,...,17935147,-0.02,stable,34098,526,41.41,2.12,0.33,587010,9.767442
1,92798563,Nordrhein_Westfalen,972.6,1320.65,3.76,87.0,44229,12,True,2018.0,...,17935147,-0.02,stable,34098,526,41.41,2.12,0.33,587010,15.179885
2,114894763,Nordrhein_Westfalen,396.8,493.8,4.1,62.0,44137,0,False,1958.0,...,17935147,-0.02,stable,34098,526,41.41,2.12,0.33,587010,7.964516
3,90046012,Nordrhein_Westfalen,310.0,460.0,3.28,55.0,44329,14,False,1930.0,...,17935147,-0.02,stable,34098,526,41.41,2.12,0.33,587010,8.363636
4,87928570,Nordrhein_Westfalen,301.0,,4.41,30.07,44137,31,False,1979.0,...,17935147,-0.02,stable,34098,526,41.41,2.12,0.33,587010,
5,79343710,Nordrhein_Westfalen,1550.0,2205.0,3.85,148.0,44229,32,False,2015.0,...,17935147,-0.02,stable,34098,526,41.41,2.12,0.33,587010,14.898649
6,62267211,Nordrhein_Westfalen,440.0,525.0,4.41,67.0,44137,6,False,1965.0,...,17935147,-0.02,stable,34098,526,41.41,2.12,0.33,587010,7.835821
7,93403649,Nordrhein_Westfalen,370.0,500.0,4.55,50.0,44287,13,False,,...,17935147,-0.02,stable,34098,526,41.41,2.12,0.33,587010,10.0
8,113557346,Nordrhein_Westfalen,403.75,653.75,3.71,85.0,44143,8,False,1981.0,...,17935147,-0.02,stable,34098,526,41.41,2.12,0.33,587010,7.691176
9,111370962,Nordrhein_Westfalen,512.0,672.0,4.44,64.0,44225,5,False,1964.0,...,17935147,-0.02,stable,34098,526,41.41,2.12,0.33,587010,10.5


In [8]:
df.shape

(257607, 40)

In [9]:
df.columns

Index(['scoutId', 'regio1', 'baseRent', 'totalRent', 'pricetrend',
       'livingSpace', 'plz', 'picturecount', 'newlyConst', 'yearConstructed',
       'firingTypes', 'heatingType', 'hasKitchen', 'cellar', 'condition',
       'street', 'lift', 'typeOfFlat', 'noRooms', 'floor', 'balcony', 'garden',
       'baseRentRange', 'noRoomsRange', 'livingSpaceRange',
       'yearConstructedRange', 'regio2', 'regio3', 'description', 'date',
       'population', 'populationTrend', 'popTrendCat', 'surfaceArea',
       'populationDensity', 'space/person', 'people/apt', 'districtPopTrend',
       'districtPop', 'price/unit'],
      dtype='object')

In [10]:
df['hasKitchen'] = df['hasKitchen'].astype(int)

In [11]:
df['balcony'] = df['balcony'].astype(int)

In [12]:
df['lift'] = df['lift'].astype(int)

In [27]:
# Create subset with just the factors that interest me for regression
reg_sub = df[['livingSpace', 'totalRent', 'pricetrend', 'populationTrend', 'districtPopTrend', 'districtPop', 'hasKitchen', 'picturecount', 'populationDensity', 'balcony', 'lift']]

##### Missing values

In [28]:
# Check for missing values

reg_sub.isnull().sum()


livingSpace             70
totalRent            39303
pricetrend            1708
populationTrend          0
districtPopTrend         0
districtPop              0
hasKitchen               0
picturecount             0
populationDensity        0
balcony                  0
lift                     0
dtype: int64

In [29]:
# drop blank values 
reg_sub_trim = reg_sub.dropna(inplace=False)

In [30]:
reg_sub_trim.isnull().sum()

livingSpace          0
totalRent            0
pricetrend           0
populationTrend      0
districtPopTrend     0
districtPop          0
hasKitchen           0
picturecount         0
populationDensity    0
balcony              0
lift                 0
dtype: int64

##### Duplicates check

In [38]:
dups = reg_sub_trim.duplicated()

In [39]:
dups.shape # No dups

(216760,)

### 3. Data prep for regression analysis 

In [18]:
from sklearn import linear_model

In [31]:
X = reg_sub_trim[['populationTrend', 'livingSpace', 'pricetrend', 'districtPopTrend', 'districtPop', 'hasKitchen', 'picturecount', 'populationDensity', 'balcony', 'lift']]
y = reg_sub_trim['totalRent']

In [20]:
y

0          840.00
1         1320.65
2          493.80
3          460.00
5         2205.00
           ...   
257601     590.00
257602    1000.00
257604     700.00
257605     300.00
257606     840.00
Name: totalRent, Length: 218251, dtype: float64

In [32]:
#Splitting the dataset
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)

In [33]:
mlr = LinearRegression()  
mlr.fit(x_train, y_train)

### 4. Regression analysis

In [34]:
print('Intercept:', mlr.intercept_)
print('Coefficients:')
list(zip(X, mlr.coef_))

Intercept: -275.4105102805373
Coefficients:


[('populationTrend', 236.16518128908652),
 ('livingSpace', 11.667524304827383),
 ('pricetrend', 24.978416951516532),
 ('districtPopTrend', -2.7801659552113875),
 ('districtPop', 0.0003991258900415823),
 ('hasKitchen', 100.59463407298603),
 ('picturecount', 3.636129686058994),
 ('populationDensity', -0.28343951714445265),
 ('balcony', 12.46540899543645),
 ('lift', 181.7351344668637)]

In [35]:
# Prediction of test set
y_pred_mlr= mlr.predict(x_test)
# Predicted values
print('Prediction for test set: {}'.format(y_pred_mlr))

Prediction for test set: [ 282.12573586  422.59624585 1088.11806101 ...  500.7991107   164.14383556
  899.02549286]


In [36]:
mlr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_mlr})
mlr_diff.head()

Unnamed: 0,Actual value,Predicted value
109587,392.81,282.125736
203583,710.0,422.596246
104682,698.0,1088.118061
201752,1130.0,1231.008947
249071,1030.0,1144.704398


# This model performed the best

In [37]:
from sklearn import metrics
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_mlr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_mlr)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_mlr))
print('R squared: {:.2f}'.format(mlr.score(X,y)*100))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)
# kitchen, district pop, state pop, picture count, popdensity, balcony, lift, living space, district pop, pricetrend

R squared: 72.47
Mean Absolute Error: 178.57121047262856
Mean Square Error: 84669.75586375213
Root Mean Square Error: 290.9806795368932


# Other model results below

In [26]:
from sklearn import metrics
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_mlr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_mlr)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_mlr))
print('R squared: {:.2f}'.format(mlr.score(X,y)*100))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)
# kitchen, district pop, state pop, picture count, popdensity, balcony, lift, living space, district pop

R squared: 72.02
Mean Absolute Error: 182.53619731711117
Mean Square Error: 87635.94427378477
Root Mean Square Error: 296.03368773466434


In [22]:
from sklearn import metrics
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_mlr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_mlr)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_mlr))
print('R squared: {:.2f}'.format(mlr.score(X,y)*100))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)
# kitchen, district pop, state pop, picture count, popdensity, balcony, lift, living space

R squared: 68.22
Mean Absolute Error: 192.66423514943415
Mean Square Error: 100922.41120483697
Root Mean Square Error: 317.6828783627424


In [199]:
from sklearn import metrics
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_mlr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_mlr)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_mlr))
print('R squared: {:.2f}'.format(mlr.score(X,y)*100))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)
# kitchen, district pop, state pop, picture count, popdensity, balcony with PRICEPERUNIT

R squared: 30.06
Mean Absolute Error: 2.7915192246075753
Mean Square Error: 21.144783600099927
Root Mean Square Error: 4.5983457460373645


In [176]:
from sklearn import metrics
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_mlr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_mlr)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_mlr))
print('R squared: {:.2f}'.format(mlr.score(X,y)*100))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)
# kitchen, district pop, state pop, livingspace, picture count, popdensity, balcony

R squared: 65.90
Mean Absolute Error: 200.41431779783144
Mean Square Error: 108240.0376943712
Root Mean Square Error: 328.998537526189


In [187]:
#note- worse with range
from sklearn import metrics
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_mlr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_mlr)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_mlr))
print('R squared: {:.2f}'.format(mlr.score(X,y)*100))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)
# kitchen, district pop, state pop, livingspaceRange, picture count, popdensity, balcony

R squared: 58.63
Mean Absolute Error: 212.49381327641376
Mean Square Error: 124941.64928145382
Root Mean Square Error: 353.4708605832365


In [159]:
from sklearn import metrics
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_mlr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_mlr)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_mlr))
print('R squared: {:.2f}'.format(mlr.score(X,y)*100))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)
# kitchen, district pop, state pop, livingspace, picturecount

R squared: 65.11
Mean Absolute Error: 204.27879864101092
Mean Square Error: 110624.24597058757
Root Mean Square Error: 332.6022338628945
