In [76]:
import numpy as np 
import pandas as pd
import seaborn as sns
import sklearn 
from sklearn import linear_model

In [58]:
def import_data(file):
    return pd.read_csv(file, header=0)

df = import_data("houses_to_rent_v2.csv")

In [59]:
# What's the shape of our data, how many houses, how many variables? 

print(df.shape)

print(df)

(10692, 13)
                 city  area  rooms  bathroom  parking spaces floor     animal  \
0           São Paulo    70      2         1               1     7      acept   
1           São Paulo   320      4         4               0    20      acept   
2        Porto Alegre    80      1         1               1     6      acept   
3        Porto Alegre    51      2         1               0     2      acept   
4           São Paulo    25      1         1               0     1  not acept   
...               ...   ...    ...       ...             ...   ...        ...   
10687    Porto Alegre    63      2         1               1     5  not acept   
10688       São Paulo   285      4         4               4    17      acept   
10689  Rio de Janeiro    70      3         3               0     8  not acept   
10690  Rio de Janeiro   120      2         2               2     8      acept   
10691       São Paulo    80      2         1               0     -      acept   

           furn

We have 10692 houses, with 13 dimensions. Here we don't have a useless ID or something so we get rid of some 

In [60]:
# It seems in the column floor that some hyphens are used instead of 0s

df['floor'].replace(("-"),(0),inplace=True)

print(df)

                 city  area  rooms  bathroom  parking spaces floor     animal  \
0           São Paulo    70      2         1               1     7      acept   
1           São Paulo   320      4         4               0    20      acept   
2        Porto Alegre    80      1         1               1     6      acept   
3        Porto Alegre    51      2         1               0     2      acept   
4           São Paulo    25      1         1               0     1  not acept   
...               ...   ...    ...       ...             ...   ...        ...   
10687    Porto Alegre    63      2         1               1     5  not acept   
10688       São Paulo   285      4         4               4    17      acept   
10689  Rio de Janeiro    70      3         3               0     8  not acept   
10690  Rio de Janeiro   120      2         2               2     8      acept   
10691       São Paulo    80      2         1               0     0      acept   

           furniture  hoa (

In [61]:
#Here we get a sense of the data swe're dealing with

print(df.head(5))


df.describe()

           city  area  rooms  bathroom  parking spaces floor     animal  \
0     São Paulo    70      2         1               1     7      acept   
1     São Paulo   320      4         4               0    20      acept   
2  Porto Alegre    80      1         1               1     6      acept   
3  Porto Alegre    51      2         1               0     2      acept   
4     São Paulo    25      1         1               0     1  not acept   

       furniture  hoa (R$)  rent amount (R$)  property tax (R$)  \
0      furnished      2065              3300                211   
1  not furnished      1200              4960               1750   
2  not furnished      1000              2800                  0   
3  not furnished       270              1112                 22   
4  not furnished         0               800                 25   

   fire insurance (R$)  total (R$)  
0                   42        5618  
1                   63        7973  
2                   41        3841 

Unnamed: 0,area,rooms,bathroom,parking spaces,hoa (R$),rent amount (R$),property tax (R$),fire insurance (R$),total (R$)
count,10692.0,10692.0,10692.0,10692.0,10692.0,10692.0,10692.0,10692.0,10692.0
mean,149.21792,2.506079,2.236813,1.609147,1174.022,3896.247194,366.704358,53.300879,5490.487
std,537.016942,1.171266,1.407198,1.589521,15592.31,3408.545518,3107.832321,47.768031,16484.73
min,11.0,1.0,1.0,0.0,0.0,450.0,0.0,3.0,499.0
25%,56.0,2.0,1.0,0.0,170.0,1530.0,38.0,21.0,2061.75
50%,90.0,2.0,2.0,1.0,560.0,2661.0,125.0,36.0,3581.5
75%,182.0,3.0,3.0,2.0,1237.5,5000.0,375.0,68.0,6768.0
max,46335.0,13.0,10.0,12.0,1117000.0,45000.0,313700.0,677.0,1120000.0


In [62]:
# some columsn are text, whilst they should be numeric i.e. animal, let's change that

df['animal'].replace(('acept', 'not acept'), (1, 0), inplace=True)

df['furniture'].replace(('furnished', 'not furnished'), (1,0), inplace=True)


In [63]:
# the city column is an issue, perhaps three different models will be needed
# let's first create a model without the city

dfwo = df.drop("city",axis=1, inplace=False)

dfwo.head(5)

In [67]:
# we turn the entire dataset into integers

dfwo = dfwo.astype(dtype=np.int64)

In [72]:
#we check that it did turn everything into an integer

dfwo.isin(['Incluso']).any()

area                   False
rooms                  False
bathroom               False
parking spaces         False
floor                  False
animal                 False
furniture              False
hoa (R$)               False
rent amount (R$)       False
property tax (R$)      False
fire insurance (R$)    False
total (R$)             False
dtype: bool

In [75]:
# Wwe shuffle the data to make sure that there wasn't a bias whilst inputing data

dfwo = dfwo.sample(frac=1).reset_index(drop=True)



In [89]:
dfwo.describe()

Unnamed: 0,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa (R$),rent amount (R$),property tax (R$),fire insurance (R$),total (R$)
count,10692.0,10692.0,10692.0,10692.0,10692.0,10692.0,10692.0,10692.0,10692.0,10692.0,10692.0,10692.0
mean,149.21792,2.506079,2.236813,1.609147,5.067995,0.777778,0.243734,1174.022,3896.247194,366.704358,53.300879,5490.487
std,537.016942,1.171266,1.407198,1.589521,6.06905,0.415759,0.429354,15592.31,3408.545518,3107.832321,47.768031,16484.73
min,11.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,450.0,0.0,3.0,499.0
25%,56.0,2.0,1.0,0.0,1.0,1.0,0.0,170.0,1530.0,38.0,21.0,2061.75
50%,90.0,2.0,2.0,1.0,3.0,1.0,0.0,560.0,2661.0,125.0,36.0,3581.5
75%,182.0,3.0,3.0,2.0,8.0,1.0,0.0,1237.5,5000.0,375.0,68.0,6768.0
max,46335.0,13.0,10.0,12.0,301.0,1.0,1.0,1117000.0,45000.0,313700.0,677.0,1120000.0


In [80]:
# Now to start our multiple regression, we want to 

X = dfwo[['area','rooms','bathroom','parking spaces','floor','animal','furniture','hoa (R$)','rent amount (R$)','property tax (R$)','fire insurance (R$)']]
y = dfwo ['total (R$)']

regr = linear_model.LinearRegression()
regr.fit(X, y)

LinearRegression()

In [91]:
# Finally we can predict a house price based on some factors

price = regr.predict([[149.217920,2.506079,2.236813,1.609147,5.067995,0.777778,0.243734,1.174022,3896.247194,366.704358,53.300879]])

print(price)



[4317.52520759]
