## Multiple linear regression analysis

### This script contains the following:

#### 01. Import libraries and data
#### 02. Clean data
#### 03. Data prep for regression analysis
#### 04. Regression analysis
#### 05. Model performance stats
#### 06. Other model results

### Note: The top-performing model, seen below, incorporated kitchen, district population, district population trend, picture count, popdensity, balcony, lift and living space to predict total rent with an R-squared of 72 and mean absolute error of 182. 
### The persistence of the large error and relatively low r-squared score suggests that multiple linear regression isn't the best approach for this dataset.

### 1. Import libraries and data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# This option ensures that the graphs you create are displayed within the notebook without the need to "call" them specifically.

%matplotlib inline

In [3]:
path = r'C:\Users\jacym\Desktop\Career Foundry projects\german rent\02 data'

In [4]:
df = pd.read_csv(os.path.join(path, 'cleaned data', 'rent_district_pop_merged.csv'))

### 2. Data cleaning

In [5]:
df.columns

Index(['scoutId', 'regio1', 'baseRent', 'totalRent', 'price/unit',
       'pricetrend', 'livingSpace', 'plz', 'picturecount', 'newlyConst',
       'yearConstructed', 'firingTypes', 'heatingType', 'hasKitchen', 'cellar',
       'condition', 'street', 'lift', 'typeOfFlat', 'noRooms', 'floor',
       'balcony', 'garden', 'baseRentRange', 'noRoomsRange',
       'livingSpaceRange', 'yearConstructedRange', 'regio2', 'regio3',
       'description', 'date', 'population', 'populationTrend', 'popTrendCat',
       'surfaceArea', 'populationDensity', 'space/person', 'people/apt',
       'districtPopTrend', 'districtPop'],
      dtype='object')

In [6]:
df.head(10)

Unnamed: 0,scoutId,regio1,baseRent,totalRent,price/unit,pricetrend,livingSpace,plz,picturecount,newlyConst,...,date,population,populationTrend,popTrendCat,surfaceArea,populationDensity,space/person,people/apt,districtPopTrend,districtPop
0,96107057,Nordrhein_Westfalen,595.0,840.0,9.767442,4.62,86.0,44269,6,False,...,2019-05-10,17935147,-0.02,stable,34098,526,41.41,2.12,0.33,587010
1,92798563,Nordrhein_Westfalen,972.6,1320.65,15.179885,3.76,87.0,44229,12,True,...,2019-05-10,17935147,-0.02,stable,34098,526,41.41,2.12,0.33,587010
2,114894763,Nordrhein_Westfalen,396.8,493.8,7.964516,4.1,62.0,44137,0,False,...,2020-02-01,17935147,-0.02,stable,34098,526,41.41,2.12,0.33,587010
3,90046012,Nordrhein_Westfalen,310.0,460.0,8.363636,3.28,55.0,44329,14,False,...,2019-05-10,17935147,-0.02,stable,34098,526,41.41,2.12,0.33,587010
4,87928570,Nordrhein_Westfalen,301.0,,,4.41,30.07,44137,31,False,...,2018-09-22,17935147,-0.02,stable,34098,526,41.41,2.12,0.33,587010
5,79343710,Nordrhein_Westfalen,1550.0,2205.0,14.898649,3.85,148.0,44229,32,False,...,2019-05-10,17935147,-0.02,stable,34098,526,41.41,2.12,0.33,587010
6,62267211,Nordrhein_Westfalen,440.0,525.0,7.835821,4.41,67.0,44137,6,False,...,2019-05-10,17935147,-0.02,stable,34098,526,41.41,2.12,0.33,587010
7,93403649,Nordrhein_Westfalen,370.0,500.0,10.0,4.55,50.0,44287,13,False,...,2019-05-10,17935147,-0.02,stable,34098,526,41.41,2.12,0.33,587010
8,113557346,Nordrhein_Westfalen,403.75,653.75,7.691176,3.71,85.0,44143,8,False,...,2020-02-01,17935147,-0.02,stable,34098,526,41.41,2.12,0.33,587010
9,111370962,Nordrhein_Westfalen,512.0,672.0,10.5,4.44,64.0,44225,5,False,...,2019-05-10,17935147,-0.02,stable,34098,526,41.41,2.12,0.33,587010


In [7]:
df.shape

(257607, 40)

In [8]:
df.columns

Index(['scoutId', 'regio1', 'baseRent', 'totalRent', 'price/unit',
       'pricetrend', 'livingSpace', 'plz', 'picturecount', 'newlyConst',
       'yearConstructed', 'firingTypes', 'heatingType', 'hasKitchen', 'cellar',
       'condition', 'street', 'lift', 'typeOfFlat', 'noRooms', 'floor',
       'balcony', 'garden', 'baseRentRange', 'noRoomsRange',
       'livingSpaceRange', 'yearConstructedRange', 'regio2', 'regio3',
       'description', 'date', 'population', 'populationTrend', 'popTrendCat',
       'surfaceArea', 'populationDensity', 'space/person', 'people/apt',
       'districtPopTrend', 'districtPop'],
      dtype='object')

In [9]:
df['hasKitchen'] = df['hasKitchen'].astype(int)

In [10]:
df['balcony'] = df['balcony'].astype(int)

In [11]:
df['lift'] = df['lift'].astype(int)

In [12]:
# Create subset with just the factors that interest me for regression
reg_sub = df[['livingSpace', 'totalRent', 'pricetrend', 'populationTrend', 'districtPopTrend', 'districtPop', 'hasKitchen', 'picturecount', 'populationDensity', 'balcony', 'lift']]

##### Missing values

In [13]:
# Check for missing values

reg_sub.isnull().sum()


livingSpace             70
totalRent            39303
pricetrend            1708
populationTrend          0
districtPopTrend         0
districtPop              0
hasKitchen               0
picturecount             0
populationDensity        0
balcony                  0
lift                     0
dtype: int64

In [14]:
# drop blank values 
reg_sub_trim = reg_sub.dropna(inplace=False)

In [15]:
reg_sub_trim.isnull().sum()

livingSpace          0
totalRent            0
pricetrend           0
populationTrend      0
districtPopTrend     0
districtPop          0
hasKitchen           0
picturecount         0
populationDensity    0
balcony              0
lift                 0
dtype: int64

##### Duplicates check

In [16]:
dups = reg_sub_trim.duplicated()

In [17]:
dups.shape # No dups

(216760,)

### 3. Data prep for regression analysis 

In [18]:
from sklearn import linear_model

In [19]:
X = reg_sub_trim[['populationTrend', 'livingSpace', 'pricetrend', 'districtPopTrend', 'districtPop', 'hasKitchen', 'picturecount', 'populationDensity', 'balcony', 'lift']]
y = reg_sub_trim['totalRent']

In [20]:
y

0          840.00
1         1320.65
2          493.80
3          460.00
5         2205.00
           ...   
257601     590.00
257602    1000.00
257604     700.00
257605     300.00
257606     840.00
Name: totalRent, Length: 216760, dtype: float64

In [21]:
#Splitting the dataset
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)

In [22]:
mlr = LinearRegression()  
mlr.fit(x_train, y_train)

### 4. Regression analysis

In [23]:
print('Intercept:', mlr.intercept_)
print('Coefficients:')
list(zip(X, mlr.coef_))

Intercept: -275.4105102805373
Coefficients:


[('populationTrend', 236.16518128908652),
 ('livingSpace', 11.667524304827383),
 ('pricetrend', 24.978416951516532),
 ('districtPopTrend', -2.7801659552113875),
 ('districtPop', 0.0003991258900415823),
 ('hasKitchen', 100.59463407298603),
 ('picturecount', 3.636129686058994),
 ('populationDensity', -0.28343951714445265),
 ('balcony', 12.46540899543645),
 ('lift', 181.7351344668637)]

In [24]:
# Prediction of test set
y_pred_mlr= mlr.predict(x_test)
# Predicted values
print('Prediction for test set: {}'.format(y_pred_mlr))

Prediction for test set: [ 282.12573586  422.59624585 1088.11806101 ...  500.7991107   164.14383556
  899.02549286]


In [25]:
mlr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_mlr})
mlr_diff.head()

Unnamed: 0,Actual value,Predicted value
109587,392.81,282.125736
203583,710.0,422.596246
104682,698.0,1088.118061
201752,1130.0,1231.008947
249071,1030.0,1144.704398


## 05. Model performance stats

In [26]:
from sklearn import metrics
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_mlr)
rootMeanSqErr = metrics.mean_squared_error(y_test, y_pred_mlr, squared=False)
print('R squared: {:.2f}'.format(mlr.score(X,y)*100))
print('Mean Absolute Error:', meanAbErr)
print('Root Mean Square Error:', rootMeanSqErr)
# kitchen, district pop, picture count, popdensity, balcony, lift, living space, district pop, pricetrend

R squared: 72.47
Mean Absolute Error: 178.57121047262856
Root Mean Square Error: 290.9806795368932


# 06. Other model results 

## Living space, district population, picture count, population density, kitchen, balcony, lift

In [27]:
X2 = reg_sub_trim[['livingSpace', 'districtPop', 'hasKitchen', 'picturecount', 'populationDensity', 'balcony', 'lift']]
y2 = reg_sub_trim['totalRent']

In [28]:
#Splitting the dataset
from sklearn.model_selection import train_test_split
x2_train, x2_test, y2_train, y2_test = train_test_split(X2, y2, test_size = 0.3, random_state = 100)

In [29]:
mlr = LinearRegression()  
mlr.fit(x2_train, y2_train)

### Regression analysis

In [30]:
print('Intercept:', mlr.intercept_)
print('Coefficients:')
list(zip(X2, mlr.coef_))

Intercept: -318.71320710337443
Coefficients:


[('livingSpace', 12.181414790597183),
 ('districtPop', 0.00043627986259962305),
 ('hasKitchen', 162.0002540169191),
 ('picturecount', 3.2030733439862327),
 ('populationDensity', -0.25123165994201213),
 ('balcony', 25.145541432626725),
 ('lift', 196.13976284918564)]

In [31]:
# Prediction of test set
y_pred_mlr= mlr.predict(x2_test)
# Predicted values
print('Prediction for test set: {}'.format(y_pred_mlr))

Prediction for test set: [ 365.78195707  286.69994155 1176.53019063 ...  523.07684623  254.69365457
  840.13263554]


In [32]:
mlr_diff = pd.DataFrame({'Actual value': y2_test, 'Predicted value': y_pred_mlr})
mlr_diff.head()

Unnamed: 0,Actual value,Predicted value
109587,392.81,365.781957
203583,710.0,286.699942
104682,698.0,1176.530191
201752,1130.0,1084.131017
249071,1030.0,1148.861006


In [36]:
from sklearn import metrics
meanAbErr = metrics.mean_absolute_error(y2_test, y_pred_mlr)
rootMeanSqErr = metrics.mean_squared_error(y2_test, y_pred_mlr, squared=False)
print('R squared: {:.2f}'.format(mlr.score(X2,y2)*100))
print('Mean Absolute Error:', meanAbErr)
print('Root Mean Square Error:', rootMeanSqErr)
# kitchen, district pop, picture count, popdensity, balcony, lift, living space

R squared: 69.31
Mean Absolute Error: 195.73629144684992
Root Mean Square Error: 307.54326135735874


## Living space, district population trend and district population

In [59]:
X3 = reg_sub_trim[['livingSpace', 'districtPop', 'districtPopTrend']]
y3 = reg_sub_trim['totalRent']

In [60]:
#Splitting the dataset
from sklearn.model_selection import train_test_split
x3_train, x3_test, y3_train, y3_test = train_test_split(X3, y3, test_size = 0.3, random_state = 100)

In [61]:
mlr = LinearRegression()  
mlr.fit(x3_train, y3_train)

### Regression analysis

In [62]:
print('Intercept:', mlr.intercept_)
print('Coefficients:')
list(zip(X3, mlr.coef_))

Intercept: -259.6530514553625
Coefficients:


[('livingSpace', 12.597134101716383),
 ('districtPop', 0.0001702190223418785),
 ('districtPopTrend', 160.7714486261447)]

In [63]:
# Prediction of test set
y_pred_mlr= mlr.predict(x3_test)
# Predicted values
print('Prediction for test set: {}'.format(y_pred_mlr))

Prediction for test set: [ 402.28161172  553.5131752  1152.85412752 ...  433.1362373   329.76127667
  736.61465923]


In [64]:
mlr_diff = pd.DataFrame({'Actual value': y3_test, 'Predicted value': y_pred_mlr})
mlr_diff.head()

Unnamed: 0,Actual value,Predicted value
109587,392.81,402.281612
203583,710.0,553.513175
104682,698.0,1152.854128
201752,1130.0,1108.55232
249071,1030.0,1381.54978


In [65]:
from sklearn import metrics
meanAbErr = metrics.mean_absolute_error(y3_test, y_pred_mlr)
rootMeanSqErr = metrics.mean_squared_error(y3_test, y_pred_mlr, squared=False)
print('R squared: {:.2f}'.format(mlr.score(X3,y3)*100))
print('Mean Absolute Error:', meanAbErr)
print('Root Mean Square Error:', rootMeanSqErr)
# livingSpace, district population trend and district population

R squared: 63.90
Mean Absolute Error: 209.73663503785565
Root Mean Square Error: 334.4886909663185
