In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model

In [2]:
# Get full dataframes
df_houses_full = pd.read_csv(r'csv/casas-full.csv')
df_departments_full = pd.read_csv(r'csv/departamentos-full.csv')

In [3]:
# Get only milenio dataframes
dfHouses = df_houses_full[df_houses_full['web-scraper-start-url'].str.contains("milenio", na=False, case=False)]
dfDepartments = df_departments_full[df_departments_full['web-scraper-start-url'].str.contains("milenio", na=False, case=False)]

In [4]:
# See valid urls fpr houses
print(f'There are {len(dfHouses)} elements')
list(dfHouses['web-scraper-start-url'])

There are 740 elements


['https://www.inmuebles24.com/casas-en-venta-en-milenio-pagina-19.html',
 'https://www.inmuebles24.com/casas-en-venta-en-milenio-pagina-16.html',
 'https://www.inmuebles24.com/casas-en-venta-en-milenio-pagina-12.html',
 'https://www.inmuebles24.com/casas-en-venta-en-milenio-pagina-7.html',
 'https://www.inmuebles24.com/casas-en-venta-en-milenio-pagina-13.html',
 'https://www.inmuebles24.com/casas-en-venta-en-milenio-pagina-2.html',
 'https://www.inmuebles24.com/casas-en-venta-en-milenio-pagina-18.html',
 'https://www.inmuebles24.com/casas-en-venta-en-milenio-pagina-15.html',
 'https://www.inmuebles24.com/casas-en-venta-en-milenio-pagina-18.html',
 'https://www.inmuebles24.com/casas-en-venta-en-milenio-pagina-19.html',
 'https://www.inmuebles24.com/casas-en-venta-en-milenio-pagina-19.html',
 'https://www.inmuebles24.com/casas-en-venta-en-milenio-pagina-13.html',
 'https://www.inmuebles24.com/casas-en-venta-en-milenio-pagina-4.html',
 'https://www.inmuebles24.com/casas-en-venta-en-mileni

In [5]:
# See valid urls for departments
print(f'There are {len(dfDepartments)} elements')
list(dfDepartments['web-scraper-start-url'])

There are 202 elements


['https://www.lamudi.com.mx/queretaro-arteaga/queretaro/milenio-iii-10/departamento/for-rent/?page=2',
 'https://www.lamudi.com.mx/queretaro-arteaga/queretaro/milenio-iii-10/departamento/for-rent/?page=1',
 'https://www.lamudi.com.mx/queretaro-arteaga/queretaro/milenio-iii-10/departamento/for-rent/?page=1',
 'https://www.lamudi.com.mx/queretaro-arteaga/queretaro/milenio-iii-10/departamento/for-rent/?page=2',
 'https://www.lamudi.com.mx/queretaro-arteaga/queretaro/milenio-iii-10/departamento/for-rent/?page=3',
 'https://www.lamudi.com.mx/queretaro-arteaga/queretaro/milenio-iii-10/departamento/for-rent/?page=2',
 'https://www.lamudi.com.mx/queretaro-arteaga/queretaro/milenio-iii-10/departamento/for-rent/?page=2',
 'https://www.lamudi.com.mx/queretaro-arteaga/queretaro/milenio-iii-10/departamento/for-rent/?page=1',
 'https://www.lamudi.com.mx/queretaro-arteaga/queretaro/milenio-iii-10/departamento/for-rent/?page=1',
 'https://www.lamudi.com.mx/queretaro-arteaga/queretaro/milenio-iii-10/de

In [6]:
# Get dataframes for training
dfTrainHouses = dfHouses[0:int(len(dfHouses) * .7)]
dfTrainDepartments = dfDepartments[0:int(len(dfDepartments) * .7)]
print(f'70% for training are {len(dfTrainHouses)} elements for houses')
print(f'70% for training are {len(dfTrainDepartments)} elements for departments')

# Get dataframes for testing
dfTestHouses = dfHouses[len(dfTrainHouses):len(dfHouses)]
dfTestDepartments = dfDepartments[len(dfTrainDepartments):len(dfDepartments)]
print(f'30% for testing are {len(dfTestHouses)} elements for houses')
print(f'30% for testing are {len(dfTestDepartments)} elements for departments')

70% for training are 518 elements for houses
70% for training are 141 elements for departments
30% for testing are 222 elements for houses
30% for testing are 61 elements for departments


In [7]:
# Get elements for training

housesMeters = pd.DataFrame(dfTrainHouses['mets'])
housesMeters.reset_index(drop=True, inplace=True)
housesMeters.rename(columns = {'mets' : '0'}, inplace = True)
departmentsMeters = pd.DataFrame(dfTrainDepartments['mets'])
departmentsMeters.reset_index(drop=True, inplace=True)
departmentsMeters.rename(columns = {'mets' : '0'}, inplace = True)

housesPrices = pd.DataFrame(dfTrainHouses['precio'])
housesPrices.reset_index(drop=True, inplace=True)
housesPrices.rename(columns = {'precio' : '0'}, inplace = True)
departmentsPrices = pd.DataFrame(dfTrainDepartments['precio'])
departmentsPrices.reset_index(drop=True, inplace=True)
departmentsPrices.rename(columns = {'precio' : '0'}, inplace = True)

In [8]:
# Model and Fit
modelHouses = linear_model.LinearRegression()
modelHouses.fit(housesMeters, housesPrices)

modelDepartments = linear_model.LinearRegression()
modelDepartments.fit(departmentsMeters, departmentsPrices)

LinearRegression()

In [10]:
# Prepare Test Dataframes
housesMetersTest = pd.DataFrame(dfTestHouses['mets'])
housesMetersTest.reset_index(drop=True, inplace=True)
housesMetersTest.rename(columns = {'mets' : '0'}, inplace = True)
housesPricesTest = pd.DataFrame(dfTestHouses['precio'])
housesPricesTest.reset_index(drop=True, inplace=True)
housesPricesTest.rename(columns = {'precio' : '0'}, inplace = True)

departmentsMetersTest = pd.DataFrame(dfTestDepartments['mets'])
departmentsMetersTest.reset_index(drop=True, inplace=True)
departmentsMetersTest.rename(columns = {'mets' : '0'}, inplace = True)
departmentsPricesTest = pd.DataFrame(dfTrainDepartments['precio'])
departmentsPricesTest.reset_index(drop=True, inplace=True)
departmentsPricesTest.rename(columns = {'precio' : '0'}, inplace = True)

In [16]:
# Make predictions using Test elements
houseMetersToPredict = housesMetersTest['0'][0]
housePredictedPrice = modelHouses.predict(pd.DataFrame([houseMetersToPredict]))[0][0]
print(f'Price for house of {houseMetersToPredict}m is ${housePredictedPrice}')

departmentMetersToPredict = departmentsMetersTest['0'][0]
departmentPredictedPrice = modelHouses.predict(pd.DataFrame([departmentMetersToPredict]))[0][0]
print(f'Price for department of {departmentMetersToPredict}m is ${departmentPredictedPrice}')

Price for house of 480.0m is $7022609.063484356
Price for department of 134m is $2835564.6351362146
