In [1]:
import csv
import json
from urllib.request import urlopen

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim, GoogleV3, Bing
from matplotlib import ticker
from shapely.geometry import Point, shape
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

import googleCreds

In [13]:
def getAccommodationDF(minPrice = 500, maxPrice = 2000, maxPages = 1, dfPath = "", savePickle = True):

    """
    Scrape rental listings for Milan and return them as a dataframe.

    The returned dataframe has the following features:
    - Accommodation id (id): str
    - Accommodation type (type): str (first word of the listing title)
    - Accommodation address (address): str (remaining words of the listing title)
    - Accommodation price (price): int
    - Coordinates of an accommodation (coords): value returned by getLoc(address)
      (presumably [latitude, longitude] -- confirm against getLoc)

    Rows containing a missing value in any column are dropped before returning.

    The data is scraped from: https://www.immobiliare.it/

    Inputs:

    minPrice    (float): minimum price (rent),
                default: 500

    maxPrice    (float): maximum price (rent),
                default: 2000

    maxPages    (int): number of result pages to be processed,
                default: 1

    dfPath      (str):  prefix prepended to the pickle file name
                'dfAccommodationsExpanded.pkl' (include a trailing separator
                when passing a directory),
                default: "" (current working directory)

    savePickle  (bool): True, if the dataframe should be pickled to dfPath,
                default: True

    Outputs:

    pandas DataFrame

    Raises:

    AssertionError if the number of scraped ids, types, addresses and prices differ.
    """

    baseUrl = f"https://www.immobiliare.it/affitto-case/milano/?criterio=rilevanza&prezzoMinimo={minPrice}&prezzoMassimo={maxPrice}"
    # The un-paginated URL is used as page 1 (assumed equivalent to pag=1 -- confirm),
    # so only pages 2..maxPages need the explicit "pag" parameter. This keeps the
    # number of requests equal to maxPages instead of fetching the first page twice.
    urlList = [baseUrl] + [f"{baseUrl}&pag={page}" for page in range(2, int(maxPages) + 1)]

    objects = {"id":[],
               "type":[],
               "address":[],
               "price":[]}

    # One session for the whole scrape; the context manager closes it afterwards.
    with requests.Session() as session:
        for url in urlList:
            response = session.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')

            # NOTE(review): these selectors target the 'titolo text-primary' and
            # 'lif__item lif__pricing' markup; another cell in this notebook scrapes
            # the same site with different class names, so confirm they still match.
            for element in soup.find_all('p', class_='titolo text-primary'):
                try:
                    name = element.text.strip().split()
                    adId = element.find('a').get("id")
                    adType = name[0]
                except (AttributeError, IndexError):
                    # Title without a link or without any text: skip it entirely.
                    continue
                # Append only after every field was extracted, so the three lists
                # can never get out of step with each other.
                objects["id"].append(adId)
                objects["type"].append(adType)
                objects["address"].append(" ".join(name[1:]))

            for element in soup.find_all('li', class_='lif__item lif__pricing'):
                # The price text sits either directly in the <li> or in a nested <div>.
                priceHolder = element.find('div')
                rawText = (element if priceHolder is None else priceHolder).text
                try:
                    # Second whitespace-separated token is the amount; "." is the
                    # thousands separator and is removed before conversion.
                    objects["price"].append(int(rawText.strip().split()[1].replace(".","")))
                except (IndexError, ValueError):
                    # Entry without a parsable numeric price: skip it.
                    continue

    # Sanity check: titles and prices are collected independently, so a skipped
    # entry on either side would silently pair listings with the wrong price.
    assert len(objects["id"]) == len(objects["type"]) == len(objects["address"]) == len(objects["price"]), \
        "Scraped columns have different lengths"
    print(f"All successful. {len(objects['id'])} objects has been added")

    dfObjects = pd.DataFrame(objects)
    dfObjects["coords"] = dfObjects["address"].map(getLoc)
    dfObjects = dfObjects.dropna()
    if savePickle:
        dfObjects.to_pickle(dfPath+'dfAccommodationsExpanded.pkl')
    return dfObjects

In [31]:
# Exploratory cell: fetch the listed result page(s) and print, for each listing card,
# the ad id (taken from the link), the first word of the title, the rest of the
# title, and the price parsed from the main feature item.
minPrice = 100
maxPrice = 2000
maxPages = 3
urlList = [f"https://www.immobiliare.it/affitto-case/milano/?criterio=rilevanza&prezzoMinimo={minPrice}&prezzoMassimo={maxPrice}"]
#urlList+=[f"https://www.immobiliare.it/affitto-case/milano/?criterio=rilevanza&prezzoMinimo={minPrice}&prezzoMassimo={maxPrice}&pag={i+1}" for i in range(maxPages)]

objects = {key: [] for key in ("id", "type", "address", "price")}

for url in urlList:
    session = requests.session()
    response = session.get(url)

    soup = BeautifulSoup(response.text, 'html.parser')

    titleLinks = soup.find_all('a', class_='Card_in-card__title__234gH')
    priceItems = soup.find_all('li', class_="Features_nd-list__item__3hWVx Features_in-feat__item__2-hIE Features_in-feat__item--main__3EFFl RealEstateListCard_in-realEstateListCard__features--main__2uSci")

    # Titles and price items are paired positionally; zip stops at the shorter list.
    for titleLink, priceItem in zip(titleLinks, priceItems):
        # The ad id is the second-to-last path segment of the link.
        print(titleLink['href'].split("/")[-2])
        titleWords = titleLink['title'].split()
        print(titleWords[0])
        print(' '.join(titleWords[1:]))
        # Text before the first "/" holds the amount as its second token;
        # "." is stripped before converting to int.
        amountText = priceItem.text.split("/")[0].split()[1]
        print(int(amountText.replace('.','')))

80901971
Trilocale
via Col Moschin 7, Corso San Gottardo, Milano
1350
84095940
Bilocale
via PAOLO DA CANNOBIO 37, Missori, Milano
1290
89279357
Bilocale
via SOLFERINO 11, Lanza, Milano
1458
89512713
Bilocale
via WASHINGTON 71, Washington, Milano
950
88109391
Trilocale
via RAMAZZINI 11, Morgagni, Milano
1875
89494519
Quadrilocale
via Salvio Giuliano 4, Washington, Milano
1450
89162681
Bilocale
via SPARTACO 11, Montenero, Milano
1250
88083511
Trilocale
via SALVIO GIULIANO 4, Washington, Milano
1190
81732252
Bilocale
via Giovanni Battista Piranesi 43, Porta Vittoria, Milano
970
89294681
Bilocale
corso di Porta Nuova 52, Porta Nuova, Milano
1085
89490517
Trilocale
piazza Greco 10, Greco - Segnano, Milano
940
88752273
Bilocale
via Melchiorre Gioia 41, Melchiorre Gioia, Milano
800
85234687
Trilocale
via Francesco Londonio 20-A, Paolo Sarpi, Milano
2000
88505289
Trilocale
via Pinamonte da Vimercate, Moscova, Milano
1900
87694184
Bilocale
via Sofonisba Anguissola 26, Gambara, Milano
700
891554

In [14]:
def getZone(loc, js='data/zonedecentramento.geojson'):
    """Return the 'ZONADEC' property of the first GeoJSON feature containing loc.

    Input:
    loc (sequence of two floats): coordinate pair. The smaller value is used as
        the point's x and the larger as its y, so the order within loc does not
        matter. For the coordinates stored in this notebook (about [45.x, 9.x])
        that gives x = 9.x and y = 45.x.
    js (str): path to the GeoJSON file whose features are tested.

    Output:
    The matching feature's properties['ZONADEC'], or None when no feature
    contains the point.
    """
    target = Point(min(loc), max(loc))
    with open(js) as handle:
        zones = json.load(handle)
    # Lazily test each feature's geometry; stop at the first one containing the point.
    containing = (
        feature["properties"]['ZONADEC']
        for feature in zones['features']
        if shape(feature['geometry']).contains(target)
    )
    return next(containing, None)

In [21]:
# Load the previously saved accommodations table and display it.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files you trust.
df = pd.read_pickle('data/dfAccommodationsExpanded.pkl')
df

Unnamed: 0,id,type,address,price,coords,contamination,distanceToDangerZone,distanceToMedianJobLocation,zone,GreenConc,priceScaled,score
0,link_ad_85188979,Bilocale,"via Cozzi, Greco - Segnano, Milano",900,"[45.5092767, 9.2099263]",0.333668,0.643889,0.922359,9,1.0,0.225,15.62458
1,link_ad_63453592,Attico,"via Piero Maroncelli 5, Garibaldi - Corso Como...",2900,"[45.4828706, 9.183481]",0.29538,0.879735,0.299515,9,1.0,0.725,15.998146
2,link_ad_86792470,Bilocale,"via Louis Pasteur 16, Pasteur, Milano",780,"[45.4902223, 9.2193974]",0.092183,0.557633,0.694444,2,0.487172,0.195,10.132168
3,link_ad_86886580,Quadrilocale,"via Borgogna, Borgogna - Largo Augusto, Milano",4000,"[45.4636501, 9.1990135]",0.413717,0.664893,0.258368,1,0.658288,1.0,14.976329
4,link_ad_85281355,Trilocale,"via Elba 5, Washington, Milano",1900,"[45.4656953, 9.1562597]",0.140285,0.637378,0.367297,7,0.970177,0.475,12.950692
5,link_ad_82383218,Bilocale,"piazza Santa Maria Beltrade 1, Carrobbio, Milano",1700,"[45.4628843, 9.186818599999999]",0.355223,0.475489,0.12614,1,0.658288,0.425,10.2007
6,link_ad_75892020,Appartamento,"via Motta, Vercelli - Wagner, Milano",4000,"[45.468491, 9.1552276]",0.151161,0.718022,0.378781,7,0.970177,1.0,16.090708
7,link_ad_86186172,Appartamento,"via Giulio Uberti 9, Indipendenza, Milano",2916,"[45.4696682, 9.2158532]",0.354462,0.915819,0.482037,3,0.698995,0.729,15.901566
8,link_ad_86197160,Trilocale,"via Amedeo Modigliani 4, Solari, Milano",1750,"[45.4562768, 9.1586758]",0.160098,0.409005,0.407334,6,0.667785,0.4375,10.408608
9,link_ad_86638504,Attico,"corso San Gottardo, Ascanio Sforza, Milano",2000,"[45.44644, 9.177026]",0.52633,0.101734,0.443413,5,0.335678,0.5,9.535774


In [18]:
# Load the green table and look up GreenConc for each row by matching df['zone']
# against dfG's ZONADEC column.
# The joined frame is only displayed here; it is not assigned back to df.
dfG = pd.read_pickle('data/dfGreen.pkl')
df.join(dfG.set_index('ZONADEC')['GreenConc'], on='zone')

Unnamed: 0,id,type,address,price,coords,contamination,distanceToDangerZone,distanceToMedianJobLocation,zone,GreenConc
0,link_ad_85188979,Bilocale,"via Cozzi, Greco - Segnano, Milano",900,"[45.5092767, 9.2099263]",0.200892,2.351895,5.072288,9,0.174133
1,link_ad_63453592,Attico,"via Piero Maroncelli 5, Garibaldi - Corso Como...",2900,"[45.4828706, 9.183481]",0.17784,3.213352,1.647109,9,0.174133
2,link_ad_86792470,Bilocale,"via Louis Pasteur 16, Pasteur, Milano",780,"[45.4902223, 9.2193974]",0.055501,2.036832,3.818928,2,0.084833
3,link_ad_86886580,Quadrilocale,"via Borgogna, Borgogna - Largo Augusto, Milano",4000,"[45.4636501, 9.1990135]",0.249087,2.428614,1.420831,1,0.11463
4,link_ad_85281355,Trilocale,"via Elba 5, Washington, Milano",1900,"[45.4656953, 9.1562597]",0.084462,2.328112,2.019863,7,0.16894
5,link_ad_82383218,Bilocale,"piazza Santa Maria Beltrade 1, Carrobbio, Milano",1700,"[45.4628843, 9.186818599999999]",0.213869,1.73679,0.693675,1,0.11463
6,link_ad_75892020,Appartamento,"via Motta, Vercelli - Wagner, Milano",4000,"[45.468491, 9.1552276]",0.09101,2.622673,2.083016,7,0.16894
7,link_ad_86186172,Appartamento,"via Giulio Uberti 9, Indipendenza, Milano",2916,"[45.4696682, 9.2158532]",0.213412,3.345155,2.650846,3,0.121718
8,link_ad_86197160,Trilocale,"via Amedeo Modigliani 4, Solari, Milano",1750,"[45.4562768, 9.1586758]",0.09639,1.493946,2.240036,6,0.116283
9,link_ad_86638504,Attico,"corso San Gottardo, Ascanio Sforza, Milano",2000,"[45.44644, 9.177026]",0.316888,0.371596,2.438443,5,0.058453


In [41]:
def predictContamination(lt, lg, model):
    """Predict the contamination value at a single location.

    Input:
    lt (float): latitude
    lg (float): longitude
    model: fitted estimator with a predict() method that accepts degree-3
        polynomial features of (lt, lg) and returns a 2-D result, such as the
        model produced by generateContaminationModel.

    Output:
    The first element of the first row of the model's prediction.
    """
    location = [np.array([lt, lg])]
    # Expand the single (lt, lg) sample into degree-3 polynomial terms.
    expanded = PolynomialFeatures(degree=3).fit_transform(location)
    prediction = model.predict(expanded)
    return prediction[0][0]

def generateContaminationModel(data):
    """Fit a linear regression on degree-3 polynomial features of location.

    Input:
    data (pandas DataFrame): must provide the columns "lt", "lg" and "normv".

    Output:
    Fitted LinearRegression mapping the polynomial expansion of (lt, lg) to
    normv. The target is passed as a one-column frame, so the model is fitted
    with a 2-D target.
    """
    features = PolynomialFeatures(degree=3).fit_transform(data[["lt", "lg"]])
    target = data[["normv"]]
    return LinearRegression().fit(features, target)

In [42]:
# Fit a model on df and evaluate it at latitude 45.43, longitude 9.17.
predictContamination(45.43,9.17,generateContaminationModel(df))

0.5010502970253583

In [44]:
# Refit the model on df and print its fitted coefficient array.
print(generateContaminationModel(df).coef_)

[[-1.84811346e-03 -5.49690763e+00  1.20524618e+01 -2.52431980e+02
   2.48381631e+02  1.10711092e+02  4.04228511e+00 -4.65709075e+00
  -5.98977581e+00  5.11241110e+00]]


,


stazione_id,data,inquinante,valore



KeyboardInterrupt: 

In [None]:
# Draw one bar chart per station: normv per day, with one bar colour per "inquinante".
for station in dfAirRaw["stazione_id"].unique():
    fig, ax = plt.subplots(figsize=(20,10))
    # x is built from the full frame while data is restricted to one station.
    # NOTE(review): this relies on index alignment between the two -- confirm intended.
    # ax is passed explicitly so the bars are drawn on the figure created above
    # rather than on whatever axes happens to be current.
    sns.barplot(x = dfAirRaw["data"].dt.strftime('%Y-%m-%d'), y = "normv",
                data = dfAirRaw[dfAirRaw["stazione_id"] == station],
                hue = "inquinante", ax = ax)
    ax.set_title(f"Station #{station}")
    # Thin out the x tick labels; ticker is matplotlib.ticker, imported in the
    # notebook's import cell.
    ax.xaxis.set_major_locator(ticker.MaxNLocator())
