### The goal of this project is to calculate the rent value of a house based on its features and location.

In [1]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Configure pandas: raise the display limits to 200 rows / 100 columns
pd.set_option('display.max_rows',200)
pd.set_option('display.max_columns',100)
# Configure matplotlib: default figure size for the plots in this notebook
plt.rcParams['figure.figsize']=(15,6)
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# newer releases reject the old names, so use whichever name this installation
# provides. Falling back to the original name preserves the original error
# if neither style is available.
DARKGRID_STYLE = next(
    (style_name for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
     if style_name in plt.style.available),
    'seaborn-darkgrid',
)
plt.style.use(DARKGRID_STYLE)

### Exploring data

In [2]:
# Load the data set from the CSV file (path is relative to the working directory)
DATA_FILE = 'house_data.csv'
Base_Dados = pd.read_csv(DATA_FILE)

In [3]:
# Dimensions of the loaded frame as (rows, columns)
Base_Dados.shape

(10692, 13)

In [4]:
# Preview the first 3 rows to see the columns and the kind of values they hold
Base_Dados.head(3)

Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa (R$),rent amount (R$),property tax (R$),fire insurance (R$),total (R$)
0,São Paulo,70,2,1,1,7,acept,furnished,2065,3300,211,42,5618
1,São Paulo,320,4,4,0,20,acept,not furnished,1200,4960,1750,63,7973
2,Porto Alegre,80,1,1,1,6,acept,not furnished,1000,2800,0,41,3841
3,Porto Alegre,51,2,1,0,2,acept,not furnished,270,1112,22,17,1421
4,São Paulo,25,1,1,0,1,not acept,not furnished,0,800,25,11,836


In [5]:
# Remove the features that will not be used.
# Reassigning (instead of inplace=True) together with errors='ignore' makes
# this cell safe to re-run: a second execution is a no-op rather than a
# KeyError for columns that are already gone.
Base_Dados = Base_Dados.drop(columns=['fire insurance (R$)','total (R$)'], errors='ignore')

In [7]:
# Confirm that the 2 columns were dropped: the frame should now have 11 columns
Base_Dados.shape

(10692, 11)

In [8]:
# Count the missing values in each column and list the largest counts first
missing_per_column = Base_Dados.isna().sum()
missing_per_column.sort_values(ascending=False)

city                 0
area                 0
rooms                0
bathroom             0
parking spaces       0
floor                0
animal               0
furniture            0
hoa (R$)             0
rent amount (R$)     0
property tax (R$)    0
dtype: int64

In [9]:
# Number of distinct values in each column
Base_Dados.nunique()

city                    5
area                  517
rooms                  11
bathroom               10
parking spaces         11
floor                  35
animal                  2
furniture               2
hoa (R$)             1679
rent amount (R$)     1195
property tax (R$)    1243
dtype: int64

In [12]:
# Column summary: non-null count and dtype of every column
Base_Dados.info()
# Number of columns of each dtype
Base_Dados.dtypes.value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10692 entries, 0 to 10691
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   city               10692 non-null  object
 1   area               10692 non-null  int64 
 2   rooms              10692 non-null  int64 
 3   bathroom           10692 non-null  int64 
 4   parking spaces     10692 non-null  int64 
 5   floor              10692 non-null  object
 6   animal             10692 non-null  object
 7   furniture          10692 non-null  object
 8   hoa (R$)           10692 non-null  int64 
 9   rent amount (R$)   10692 non-null  int64 
 10  property tax (R$)  10692 non-null  int64 
dtypes: int64(7), object(4)
memory usage: 919.0+ KB


int64     7
object    4
dtype: int64

## Exploratory Data Analysis (EDA)

In [15]:
# Split the column names by dtype: object columns are treated as categorical,
# every other column as non-categorical.
is_object_column = Base_Dados.dtypes == object
Categorical_columns = Base_Dados.columns[is_object_column]
Columns_no_categorical = Base_Dados.columns[~is_object_column]

# Display both groups of column names
Categorical_columns,Columns_no_categorical

(Index(['city', 'floor', 'animal', 'furniture'], dtype='object'),
 Index(['area', 'rooms', 'bathroom', 'parking spaces', 'hoa (R$)',
        'rent amount (R$)', 'property tax (R$)'],
       dtype='object'))

In [17]:
# Analysing the fields: percentage of rows that belong to each city
city_share = Base_Dados['city'].value_counts(normalize=True)
city_share.mul(100)

São Paulo         55.059858
Rio de Janeiro    14.038533
Belo Horizonte    11.765806
Porto Alegre      11.157875
Campinas           7.977927
Name: city, dtype: float64

In [21]:
# Print, for every categorical column, the percentage of rows in each category
for column, column_values in Base_Dados[Categorical_columns].items():
    analysis = column_values.value_counts(normalize=True).mul(100)
    print(column, '\n', analysis, '\n')

city 
 São Paulo         55.059858
Rio de Janeiro    14.038533
Belo Horizonte    11.765806
Porto Alegre      11.157875
Campinas           7.977927
Name: city, dtype: float64 

floor 
 -      23.017209
1      10.110363
2       9.212495
3       8.707445
4       6.995885
5       5.611672
6       5.041152
7       4.648335
8       4.582866
9       3.451178
10      3.338945
11      2.833895
12      2.403666
13      1.870557
14      1.589974
15      1.374860
16      1.019454
17      0.897868
18      0.701459
19      0.495698
20      0.411523
21      0.392817
25      0.233820
23      0.233820
22      0.224467
26      0.187056
24      0.177703
27      0.074822
28      0.056117
29      0.046764
32      0.018706
35      0.009353
46      0.009353
301     0.009353
51      0.009353
Name: floor, dtype: float64 

animal 
 acept        77.777778
not acept    22.222222
Name: animal, dtype: float64 

furniture 
 not furnished    75.626637
furnished        24.373363
Name: furniture, dtype: float64 



In [22]:
# Data cleaning: show the rows whose floor is recorded as the string '301'
floor_is_301 = Base_Dados['floor'].eq('301')
Base_Dados.loc[floor_is_301]

Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa (R$),rent amount (R$),property tax (R$)
2562,Belo Horizonte,80,3,2,2,301,acept,not furnished,750,2600,164


In [23]:
# Read the suspicious floor value of row 2562 by column name rather than by
# column position, so the lookup still targets 'floor' if the column order
# ever changes.
Base_Dados.loc[2562,'floor']

'301'

In [25]:
# Correct the floor recorded as '301' to 30 (presumably a typo in the source
# data — TODO confirm). Selecting by value and by column name avoids the
# hard-coded row/column positions and makes the cell a no-op when re-run.
Base_Dados.loc[Base_Dados['floor']=='301','floor']=30

In [27]:
# Show the rows whose floor is now the integer 30 to confirm the correction
floor_is_30 = Base_Dados['floor'].eq(30)
Base_Dados.loc[floor_is_30]

Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa (R$),rent amount (R$),property tax (R$)
2562,Belo Horizonte,80,3,2,2,30,acept,not furnished,750,2600,164


In [28]:
# Preview only (the result is not stored): floor values with "-" replaced by 0
Base_Dados['floor'].map(lambda floor_value: 0 if floor_value=='-' else floor_value)

0         7
1        20
2         6
3         2
4         1
         ..
10687     5
10688    17
10689     8
10690     8
10691     0
Name: floor, Length: 10692, dtype: object

In [29]:
# Inspect the last rows of the frame; the "-" replacement has not been
# assigned back yet, so floor can still hold "-" at this point
Base_Dados.tail()

Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa (R$),rent amount (R$),property tax (R$)
10687,Porto Alegre,63,2,1,1,5,not acept,furnished,402,1478,24
10688,São Paulo,285,4,4,4,17,acept,not furnished,3100,15000,973
10689,Rio de Janeiro,70,3,3,0,8,not acept,furnished,980,6000,332
10690,Rio de Janeiro,120,2,2,2,8,acept,furnished,1585,12000,279
10691,São Paulo,80,2,1,0,-,acept,not furnished,0,1400,165


In [30]:
# Store the replacement: "-" in floor becomes 0, every other value is kept as is
Base_Dados['floor']= Base_Dados['floor'].map(lambda floor_value: 0 if floor_value=='-' else floor_value)

In [32]:
# dtype of every column (to check how floor is currently stored)
Base_Dados.dtypes

city                 object
area                  int64
rooms                 int64
bathroom              int64
parking spaces        int64
floor                object
animal               object
furniture            object
hoa (R$)              int64
rent amount (R$)      int64
property tax (R$)     int64
dtype: object

In [33]:
# Preview only (the result is not stored): floor converted to a numeric dtype
pd.to_numeric(Base_Dados['floor'])

0         7
1        20
2         6
3         2
4         1
         ..
10687     5
10688    17
10689     8
10690     8
10691     0
Name: floor, Length: 10692, dtype: int64

In [34]:
# Store the conversion: floor is replaced by its numeric version
Base_Dados['floor'] = pd.to_numeric(Base_Dados['floor'])

In [35]:
# Preview the first rows after the floor column was cleaned
Base_Dados.head()

Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa (R$),rent amount (R$),property tax (R$)
0,São Paulo,70,2,1,1,7,acept,furnished,2065,3300,211
1,São Paulo,320,4,4,0,20,acept,not furnished,1200,4960,1750
2,Porto Alegre,80,1,1,1,6,acept,not furnished,1000,2800,0
3,Porto Alegre,51,2,1,0,2,acept,not furnished,270,1112,22
4,São Paulo,25,1,1,0,1,not acept,not furnished,0,800,25


In [38]:
# Percentage of rows for each floor value after cleaning
floor_share = Base_Dados['floor'].value_counts(normalize=True)
floor_share.mul(100)

0     23.017209
1     10.110363
2      9.212495
3      8.707445
4      6.995885
5      5.611672
6      5.041152
7      4.648335
8      4.582866
9      3.451178
10     3.338945
11     2.833895
12     2.403666
13     1.870557
14     1.589974
15     1.374860
16     1.019454
17     0.897868
18     0.701459
19     0.495698
20     0.411523
21     0.392817
25     0.233820
23     0.233820
22     0.224467
26     0.187056
24     0.177703
27     0.074822
28     0.056117
29     0.046764
32     0.018706
35     0.009353
46     0.009353
30     0.009353
51     0.009353
Name: floor, dtype: float64