In [21]:
import requests
import pandas as pd
import numpy as np
from pymongo import MongoClient
import math
from bs4 import BeautifulSoup
#import src.functions as fn

In [2]:
client = MongoClient("mongodb://localhost/companies")
db = client.get_database()

Para obtener las primeras localizaciones, voy a generar una query que sólo me traiga de Companies aquellas empresas que como máximo tengan 10 años de antigüedad y que además no tengan $0 ganados.

In [3]:
q1={"$and": [{"founded_year":{"$gte": 2007}},{"total_money_raised": {"$ne": "$0"}}]}
companies=list(db["companies"].find(q1,{"name":1,"founded_year":1,"total_money_raised":1, "offices":1,"category_code":1}))

In [4]:
df = pd.DataFrame(companies)


Voy a tratar el df para desagregar la columna offices, dejando cada oficina en una línea distina y luego desdoblando las columnas que contienen la información de cada oficina.

In [5]:
#Desagrego las oficinas, para que por compañía cada oficina aparezca en una línea independiente:
df = df.explode('offices')
#Desagrego la columna office para obtener los datos en columnas de cada oficina:
df_offices = df[["offices"]].apply(lambda x: x.offices, result_type="expand", axis=1)
#Unifico los dos df anteriores para tener un único df completo con toda la información
clean_data = pd.concat([df,df_offices], axis=1)

display(df.head())
display(df_offices.head())
display(clean_data.head())

Unnamed: 0,_id,name,category_code,founded_year,total_money_raised,offices
0,52cdef7c4bab8bd675297d97,Scribd,news,2007,$25.8M,"{'description': 'HQ', 'address1': '539 Bryant ..."
1,52cdef7c4bab8bd675297d9c,MeetMoi,social,2007,$5.58M,"{'description': None, 'address1': '', 'address..."
2,52cdef7c4bab8bd675297da0,Babelgum,games_video,2007,$13.2M,"{'description': '', 'address1': '', 'address2'..."
3,52cdef7c4bab8bd675297da8,OpenX,advertising,2008,$75.5M,"{'description': 'Headquarters', 'address1': '8..."
3,52cdef7c4bab8bd675297da8,OpenX,advertising,2008,$75.5M,"{'description': 'New York', 'address1': '584 B..."


Unnamed: 0,description,address1,address2,zip_code,city,state_code,country_code,latitude,longitude
0,HQ,539 Bryant Street,,94107.0,San Francisco,CA,USA,37.789634,-122.404052
1,,,,,New York City,NY,USA,40.757929,-73.985506
2,,,,,London,,GBR,53.344104,-6.267494
3,Headquarters,888 East Walnut Street,,91101.0,Pasadena,CA,USA,34.149471,-118.132747
3,New York,584 Broadway,8th Floor,10012.0,New York,NY,USA,,


Unnamed: 0,_id,name,category_code,founded_year,total_money_raised,offices,description,address1,address2,zip_code,city,state_code,country_code,latitude,longitude
0,52cdef7c4bab8bd675297d97,Scribd,news,2007,$25.8M,"{'description': 'HQ', 'address1': '539 Bryant ...",HQ,539 Bryant Street,,94107.0,San Francisco,CA,USA,37.789634,-122.404052
1,52cdef7c4bab8bd675297d9c,MeetMoi,social,2007,$5.58M,"{'description': None, 'address1': '', 'address...",,,,,New York City,NY,USA,40.757929,-73.985506
2,52cdef7c4bab8bd675297da0,Babelgum,games_video,2007,$13.2M,"{'description': '', 'address1': '', 'address2'...",,,,,London,,GBR,53.344104,-6.267494
3,52cdef7c4bab8bd675297da8,OpenX,advertising,2008,$75.5M,"{'description': 'Headquarters', 'address1': '8...",Headquarters,888 East Walnut Street,,91101.0,Pasadena,CA,USA,34.149471,-118.132747
3,52cdef7c4bab8bd675297da8,OpenX,advertising,2008,$75.5M,"{'description': 'New York', 'address1': '584 B...",New York,584 Broadway,8th Floor,10012.0,New York,NY,USA,,


Elimino la columna _id, ya que se generará un nuevo id cuando cargue la nueva colección, y también elimino office que ya está toda la información desagregada y no nos sirve más.

In [6]:
clean_data = clean_data.drop(columns=["_id","offices"])
clean_data.head()

Unnamed: 0,name,category_code,founded_year,total_money_raised,description,address1,address2,zip_code,city,state_code,country_code,latitude,longitude
0,Scribd,news,2007,$25.8M,HQ,539 Bryant Street,,94107.0,San Francisco,CA,USA,37.789634,-122.404052
1,MeetMoi,social,2007,$5.58M,,,,,New York City,NY,USA,40.757929,-73.985506
2,Babelgum,games_video,2007,$13.2M,,,,,London,,GBR,53.344104,-6.267494
3,OpenX,advertising,2008,$75.5M,Headquarters,888 East Walnut Street,,91101.0,Pasadena,CA,USA,34.149471,-118.132747
3,OpenX,advertising,2008,$75.5M,New York,584 Broadway,8th Floor,10012.0,New York,NY,USA,,


Con la siguiente función, daré el formato GeoJson, necesario para poder generar los indexes en MongoDB Compass. En aquellos casos en los que latitud o longitud sean valores NaN, se indicará None, para evitar posibles conflicos en la carga de la colección.

In [7]:
def asGeoJSON(lat,lng):
    try:
        lat = float(lat)
        lng = float(lng)
        if not math.isnan(lat) and not math.isnan(lng):
            return {
                "type":"Point",
                "coordinates":[lng,lat]
            }
    except Exception:
        print("Invalid data")
        return None
        

clean_data["location"] = clean_data[["latitude","longitude"]].apply(lambda x:asGeoJSON(x.latitude,x.longitude), axis=1)
clean_data[["latitude","longitude","location"]]

Unnamed: 0,latitude,longitude,location
0,37.789634,-122.404052,"{'type': 'Point', 'coordinates': [-122.404052,..."
1,40.757929,-73.985506,"{'type': 'Point', 'coordinates': [-73.985506, ..."
2,53.344104,-6.267494,"{'type': 'Point', 'coordinates': [-6.267494, 5..."
3,34.149471,-118.132747,"{'type': 'Point', 'coordinates': [-118.1327468..."
3,,,
...,...,...,...
1364,,,
1364,,,
1365,50.839970,4.346472,"{'type': 'Point', 'coordinates': [4.3464721, 5..."
1366,,,


Exporto el df limpio a json:

In [8]:
clean_data.to_json("../output/cleaned_offices.json", orient="records")

In [9]:
#Quiero comprobar cuántas oficinas de tipo games_video hay en cada país:

q_games={"category_code": {"$eq":"games_video"}}
games=list(db["offices"].find(q_games,{"name":1,"country_code":1,"city":1,"total_money_raised":1,"category_code":1}))
df_games=pd.DataFrame (games)
df_games.groupby ('city').count().sort_values('name', ascending=False).head(20)

Unnamed: 0_level_0,_id,name,category_code,total_money_raised,country_code
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
San Francisco,13,13,13,13,13
New York,12,12,12,12,12
,7,7,7,7,7
Los Angeles,6,6,6,6,6
Palo Alto,6,6,6,6,6
London,5,5,5,5,5
Austin,4,4,4,4,4
Ramat Gan,3,3,3,3,3
Buenos Aires,3,3,3,3,3
Tel Aviv,3,3,3,3,3


Lo importo en Mongo DB Compass: 
$ mongoimport --db companies --collection offices --jsonArray --drop cleaned_offices.json

Una vez importado, creo un geospartial index:
Indexes > Create Index > (write any indexname) > Select fieldname:"location" and 2dsphere

### Más información:
He localizado en Github un json con todas las coordenadas de los aeropuertos del mundo, y lo he cargado en Compass.
Ahora voy a hacer scrapping de la web https://es.numbeo.com/calidad-de-vida/clasificaciones-por-pa%C3%ADs para obtener un dataframe con el índice de calidad de vida por país.

In [10]:
data=requests.get("https://es.numbeo.com/calidad-de-vida/clasificaciones-por-pa%C3%ADs").text
soup= BeautifulSoup(data, 'html.parser')

def procesaIndices(fila):
    m = fila.find_all("td")
    #print(m[2])
    return {
        "country":m[1].text.strip(),
        "calidad_vida":float((m[2].text).replace(',','.')),
        "poder_adquisitivo":float((m[3].text).replace(',','.')),
        "seguridad":float((m[4].text).replace(',','.')),
        "costo_vida":float((m[5].text).replace(',','.')),
        "relacion_precio_vs_ingresos":float((m[6].text).replace(',','.')),
        "tiempo_desplazamiento":float((m[7].text).replace(',','.')),
        "contaminación":float((m[8].text).replace(',','.')),
        "clima":float((m[9].text).replace(',','.'))
    }
    
indice = soup.find_all('table')[2]
indice_dict = [procesaIndices(fila) for fila in indice.find_all("tr")[1:]]


In [11]:
#Genero df:
df_indices=pd.DataFrame(indice_dict)
display(df_indices.head())

#Exporto en formato json y luego importo en MongoDB Compass:
df_indices.to_json("../output/indices_calidad.json", orient="records")

Unnamed: 0,country,calidad_vida,poder_adquisitivo,seguridad,costo_vida,relacion_precio_vs_ingresos,tiempo_desplazamiento,contaminación,clima
0,Dinamarca,192.67,100.88,74.9,80.0,83.0,7.45,28.85,21.33
1,Suiza,192.01,119.53,78.4,72.44,122.4,8.68,29.09,22.39
2,Finlandia,190.22,99.93,76.68,75.79,70.29,8.35,29.9,11.55
3,Australia,186.21,107.31,58.64,77.38,73.54,7.52,34.73,23.46
4,Países Bajos,183.67,90.73,72.38,74.65,73.75,7.51,29.43,27.41


## GooglePlaces

## Prueba GeoQuery

In [12]:
def geocode(address):
    data = requests.get(f"https://geocode.xyz/{address}?json=1").json()
    print(data)
    return {
        "type":"Point",
        "coordinates":[float(data["longt"]),float(data["latt"])]
    }

mad = geocode("España")


{'standard': {'addresst': {}, 'city': 'España', 'prov': 'ES', 'countryname': 'Spain', 'postal': {}, 'confidence': '0.90'}, 'longt': '-4.04423', 'alt': {'loc': {'longt': '-4.04423', 'prov': 'ES', 'city': 'España', 'countryname': 'Spain', 'postal': '28292', 'region': {}, 'latt': '40.61332'}}, 'elevation': {}, 'latt': '40.61332'}


In [13]:
def withGeoQuery(location,maxDistance=10000,minDistance=0,field="location"):
    return {
       field: {
         "$near": {
           "$geometry": location if type(location)==dict else geocode(location),
           "$maxDistance": maxDistance,
           "$minDistance": minDistance
         }
       }
    }

withGeoQuery(mad)

{'location': {'$near': {'$geometry': {'type': 'Point',
    'coordinates': [-4.04423, 40.61332]},
   '$maxDistance': 10000,
   '$minDistance': 0}}}

In [14]:
query = {'location': {'$near': {'$geometry': {'type': 'Point',
    'coordinates': [-4.04423, 40.61332]},
   '$maxDistance': 100000,
   '$minDistance': 0}}}
cumplen = list(db["offices"].find(query))
df_cumplen = pd.DataFrame(cumplen)
df_cumplen

Unnamed: 0,_id,name,category_code,founded_year,total_money_raised,description,address1,address2,zip_code,city,state_code,country_code,latitude,longitude,location
0,5e3f1ab3b0f7cd9d99acf66a,Comunitae,finance,2008,€3.5M,Headquarter,Calle Nuria 29,,28034,Madrid,,ESP,40.490511,-3.70334,"{'type': 'Point', 'coordinates': [-3.7033403, ..."
1,5e3f1ab3b0f7cd9d99acf82f,ticketea,ecommerce,2010,$5.73M,Office,Marques de Lema,7,28003,Madrid,,ESP,40.445515,-3.706176,"{'type': 'Point', 'coordinates': [-3.7061764, ..."
2,5e3f1ab3b0f7cd9d99acf404,Videoplaza,advertising,2007,€15.9M,Madrid,Forum Business Center c/ Capitán Haya,1 planta 15 Edificio Eurocentro,28020,Madrid,,ESP,40.459624,-3.692328,"{'type': 'Point', 'coordinates': [-3.6923285, ..."
3,5e3f1ab3b0f7cd9d99acf325,eRepublik,games_video,2007,€2.75M,Office,Almagro 22,,28010,Madrid,,ESP,40.43094,-3.69529,"{'type': 'Point', 'coordinates': [-3.6952896, ..."
4,5e3f1ab3b0f7cd9d99acf829,Tagmore Solutions,mobile,2007,€250k,Office,Serrano 16,First floor,28001,Madrid,,ESP,40.422692,-3.688307,"{'type': 'Point', 'coordinates': [-3.6883074, ..."
5,5e3f1ab3b0f7cd9d99acf5e5,Busuu,social,2008,$4.7M,,"Calle Columela 2, 1D",,28001,Madrid,,ESP,40.422549,-3.683032,"{'type': 'Point', 'coordinates': [-3.6830316, ..."


## Prueba representación las oficinas que se han filtrado en el anterior proceso.

In [15]:
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster


In [16]:
start_lat = 40.408561
start_lon = -3.6917665
heat_m = folium.Map(location=[start_lat, start_lon],tiles='cartodbpositron', zoom_start=12)
heat_m

In [17]:
# Add a marker to the heat map
uni_lon, uni_lat = -3.698127, 40.4055128
Marker([uni_lat, uni_lon], icon=folium.Icon(color='red')).add_to(heat_m)
heat_m

In [18]:
df_cumplen.apply(lambda row:folium.Marker(location=[row["latitude"], 
                                                  row["longitude"]]).add_to(heat_m),axis=1)
heat_m

In [19]:
#Create groups
#df_madrid = df_cumplen[df_cumplen.horario=='Madrid']

madrid_group = folium.FeatureGroup(name="Madrid")

HeatMap(data=df_cumplen[['latitude', 'longitude']],radius=15).add_to(df_cumplen)
df_cumplen.add_to(heat_m)

heat_m


AttributeError: 'DataFrame' object has no attribute 'add_child'