In [255]:
import pandas as pd
import numpy as np
import requests
import regex as re
import geopandas, shapely
import json
from functools import reduce
import operator
from pymongo import MongoClient,GEOSPHERE
import src.api as api
import src.df as df
import src.insertdb as ins

In [2]:
# Connect to the local MongoDB server and fetch the two collections used in this notebook
conn = MongoClient("localhost:27017")
db = conn.get_database("final_project")
p, h = (db.get_collection(name) for name in ("places", "homes"))

In [3]:
# Create a GEOSPHERE index on the `geometry` field of both collections;
# the `$near` query used later in this notebook runs against that field.
p.create_index([("geometry",GEOSPHERE)])
h.create_index([("geometry",GEOSPHERE)])

'geometry_2dsphere'

In [4]:
def insert_object(df, collection):
    '''
    Insert every row of a dataframe into a MongoDB collection, one document per row.

    Parameters
    ----------
    df : dataframe-like object exposing ``to_dict('records')``
        Each row becomes one document; each column becomes a field of that document.
    collection : object exposing ``insert_many`` (e.g. a pymongo collection)
        Destination collection.

    Returns
    -------
    str
        "Data succesfully added" once the rows are inserted, or "No data to add"
        when the frame has no rows.
    '''
    records = df.to_dict('records')

    # insert_many rejects an empty document list, so skip the call instead of
    # failing when there is nothing to insert.
    if not records:
        return "No data to add"

    collection.insert_many(records)

    return "Data succesfully added"

### Getting home data and cleaning it

In [6]:
# Load the housing listings from the local JSON file.
# The context manager closes the file handle as soon as the data is parsed.
with open("data_vivienda.json", 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [7]:
data_vivienda = pd.DataFrame(data)
data_vivienda.columns

Index(['propertyCode', 'thumbnail', 'externalReference', 'numPhotos', 'floor',
       'price', 'propertyType', 'operation', 'size', 'exterior', 'rooms',
       'bathrooms', 'address', 'province', 'municipality', 'district',
       'country', 'neighborhood', 'latitude', 'longitude', 'showAddress',
       'url', 'hasVideo', 'status', 'newDevelopment', 'hasLift', 'priceByArea',
       'detailedType', 'suggestedTexts', 'hasPlan', 'has3DTour', 'has360',
       'parkingSpace', 'newDevelopmentFinished'],
      dtype='object')

In [8]:
# Columns removed from the listings frame before cleaning
unused_columns = [
    "thumbnail", "propertyCode", "externalReference", "numPhotos", "hasVideo",
    "priceByArea", "detailedType", "country", "suggestedTexts", "hasPlan",
    "has3DTour", "has360", "newDevelopmentFinished",
]
data_vivienda = data_vivienda.drop(columns=unused_columns)

In [9]:
# I'm only interested in those houses inside Madrid, so I'm going to delete all observations that belong to another municipality
data_vivienda = data_vivienda[data_vivienda["municipality"]=="Madrid"]

In [10]:
data_vivienda.dtypes

floor              object
price             float64
propertyType       object
operation          object
size              float64
exterior             bool
rooms               int64
bathrooms           int64
address            object
province           object
municipality       object
district           object
neighborhood       object
latitude          float64
longitude         float64
showAddress          bool
url                object
status             object
newDevelopment       bool
hasLift            object
parkingSpace       object
dtype: object

In [60]:
data_vivienda.isnull().sum()
# parkingSpace has too many null values, and as we cannot fill them with a proper value the best option is to drop the whole column (explain in the API section the reason for the lack of information)
# Let's check the floor, status and hasLift variables

floor              108
price                0
propertyType         0
operation            0
size                 0
exterior             0
rooms                0
bathrooms            0
address              0
province             0
municipality         0
district             0
neighborhood         0
latitude             0
longitude            0
showAddress          0
url                  0
status             556
newDevelopment       0
hasLift            154
parkingSpace      6018
dtype: int64

In [11]:
data_vivienda = data_vivienda.drop("parkingSpace", axis = 1)

In [74]:
data_vivienda.status.value_counts()
# In this case we are going to fill the null values with 'unspecified'

good              8000
renew               24
newdevelopment       8
Name: status, dtype: int64

In [12]:
# There are a couple of values that are not numbers, so I'm going to change them to a value that can be valid. In this case '0', as after looking at Idealista we can assume they are street-level homes
data_vivienda.floor = data_vivienda.floor.replace(["bj","en","st","ss"], "0", regex = True)

In [13]:
data_vivienda.status = data_vivienda.status.fillna('unspecified')

In [14]:
data_vivienda.hasLift = data_vivienda.hasLift.fillna("unspecified")

In [15]:
data_vivienda = data_vivienda.dropna(how = 'any')

In [16]:
def geo_frame(df):
    '''
    Add a GeoJSON-style ``geometry`` column built from the coordinate columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``longitude`` and ``latitude`` columns.

    Returns
    -------
    geopandas.GeoDataFrame
        The input data plus a ``geometry`` column holding one mapping per row,
        as produced by ``shapely.geometry.mapping`` for a point.
    '''
    # points_from_xy takes x (longitude) first and y (latitude) second. GeoJSON
    # positions, and therefore MongoDB 2dsphere indexes, use that same
    # [longitude, latitude] order, so the arguments must not be swapped.
    points = geopandas.points_from_xy(df.longitude, df.latitude)
    df = geopandas.GeoDataFrame(df, geometry=points)
    # Convert each shapely point into a plain mapping that can be stored as a document field
    df['geometry'] = df['geometry'].apply(shapely.geometry.mapping)
    return df

In [17]:
data_vivienda = geo_frame(data_vivienda)



In [180]:
insert_object(data_vivienda, h)

'Data succesfully added'

### Google API

In [18]:
# Read the Google API key from the first line of the key file.
# strip() removes the trailing newline so it does not end up inside request URLs,
# and the forward-slash path works on Windows as well as on POSIX systems.
with open("keys/google_key.txt") as key_file:
    my_key = key_file.readlines()[0].strip()

In [7]:
# NOTE(review): `url` is first assigned further down in this notebook (in the cell
# that builds the Google Places query), so this cell depends on that cell having
# been run beforehand — confirm whether it is still needed.
response = requests.get(url)

#### Find places with google API

In [19]:
base_url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json?"

In [56]:
# Parameters of the Google Places "nearby search" request
location = '40.446067,-3.691247' # Nuevos Ministerios is used as the center of all queries
radius = '10000'
tipo = "restaurant"
# NOTE(review): page_token is set to the API key and is not used in the URL below — confirm it is needed
page_token = my_key
url = f"{base_url}location={location}&radius={radius}&type={tipo}&key={my_key}"

In [57]:
response = requests.get(url)

In [59]:
prueba = response.json()

In [63]:
next_page_token = prueba["next_page_token"]
url_next = f"{base_url}pagetoken={next_page_token}&key={my_key}"

In [66]:
response = requests.get(url_next)

In [67]:
prueba = response.json()

### FOURSQUARE API

In [74]:
# Foursquare credentials: client id on the first line of the key file, client secret on the second.
# The file is read once and closed; strip() drops the line breaks so they do not
# leak into request URLs. The forward-slash path also works outside Windows.
with open("keys/foursquare_key.txt") as key_file:
    key_lines = key_file.readlines()
client_id = key_lines[0].strip()
client_secret = key_lines[1].strip()

In [75]:

def extract(url, timeout=30):
    '''
    Request `url` and return the `items` list of the first entry in `response.groups`.

    Parameters
    ----------
    url : str
        Fully built request URL.
    timeout : int or float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list
        The `items` of the first group, or an empty list when the payload has
        no `response`, no `groups` or no `items`.

    Raises
    ------
    requests.HTTPError
        When the server answers with an error status code.
    '''
    results = requests.get(url, timeout=timeout)

    # Fail with the real HTTP error instead of an unrelated error further down
    results.raise_for_status()

    code = results.json()

    decoding = code.get("response") or {}

    groups = decoding.get("groups") or []

    # No groups means no venues: return an empty list rather than indexing into nothing
    if not groups:
        return []

    return groups[0].get("items") or []

### Prueba para encontrar punto central de acuerdo a condiciones 

In [129]:
# Category IDs, passed as `categoryId` in the Foursquare venue queries of this notebook
nightlife = "4d4b7105d754a06376d81259"
gym = "4bf58dd8d48988d175941735"
park = "4bf58dd8d48988d163941735"
hospital = "4bf58dd8d48988d196941735"
elementary_school = "4f4533804b9074f6e4fb0105"
high_school = "4bf58dd8d48988d13d941735"
middle_school = "4f4533814b9074f6e4fb0106"
clothing_store = "4bf58dd8d48988d103951735"
daycare = "4f4532974b9074f6e4fb0104"
grocery_store = "4bf58dd8d48988d118951735"
supermarket = "52f2ab2ebcbc57f1066b8b46"
pharmacy = "4bf58dd8d48988d10f951735"
general_entertainment = "4d4b7104d754a06370d81259"
bar = "4bf58dd8d48988d116941735"

In [77]:
# Parameters for the Foursquare venue queries
version = '20180323'
limit = 300
location = '40.446067,-3.691247' # Nuevos Ministerios is used as the center of all queries
# Single assignment: the earlier integer value was overwritten by this string anyway
radius = '9000'

In [None]:
# Hospitals: `cat_id` only exists as a parameter of `locations`, so the category id
# has to come from the `hospital` constant defined with the other category ids.
url = f'https://api.foursquare.com/v2/venues/explore?&client_id={client_id}&client_secret={client_secret}&v={version}&ll=40.446067,-3.691247&radius={radius}&limit={limit}&categoryId={hospital}'
hospitals = extract(url)
hospitals_df = places_df(hospitals, "medical_centre")
hospitals_df.head()

In [None]:
def getFromDict(diccionario,mapa):
    '''
    Follow the keys in `mapa` through `diccionario`, one nesting level per key,
    and return the value reached. An empty `mapa` returns `diccionario` itself.
    '''
    nodo = diccionario
    for clave in mapa:
        nodo = nodo[clave]
    return nodo

def places_df(frame,category):
    '''
    Build a dataframe with the name and coordinates of each venue item.

    Parameters
    ----------
    frame : list of dict
        Items shaped like
        ``{"venue": {"name": ..., "location": {"lat": ..., "lng": ...}}}``.
    category : str
        Label written to the ``category`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``longitude``, ``latitude`` and ``category``, one row
        per item; an empty frame with those columns when `frame` is empty.
    '''
    nombre = ["venue","name"]
    latitud = ["venue", "location","lat"]
    longitud = ["venue","location","lng"]

    rows = [
        {
            "name": getFromDict(diccionario, nombre),
            "longitude": getFromDict(diccionario, longitud),
            "latitude": getFromDict(diccionario, latitud),
        }
        for diccionario in frame
    ]

    # Build the frame once, after collecting all rows; naming the columns keeps
    # the result well-formed even when there are no rows at all.
    df = pd.DataFrame(rows, columns=["name", "longitude", "latitude"])

    df["category"] = category

    return df

In [97]:
def locations(radius, limit, cat_id, category, ll="40.446067,-3.691247"):
    '''
    Query the Foursquare explore endpoint for one category and return the venues as a dataframe.

    Uses the module-level `client_id`, `client_secret` and `version` values.

    Parameters
    ----------
    radius : int or str
        Value sent as the `radius` query parameter.
    limit : int
        Value sent as the `limit` query parameter.
    cat_id : str
        Value sent as the `categoryId` query parameter.
    category : str
        Label passed to `places_df` for the `category` column.
    ll : str, optional
        "latitude,longitude" sent as the `ll` query parameter; defaults to the
        point previously hard-coded in the URL.

    Returns
    -------
    pandas.DataFrame
        Result of `places_df` for the items returned by `extract`.
    '''
    url = f'https://api.foursquare.com/v2/venues/explore?&client_id={client_id}&client_secret={client_secret}&v={version}&ll={ll}&radius={radius}&limit={limit}&categoryId={cat_id}'
    x = extract(url)
    return places_df(x, category)

In [157]:
# Gyms
gym_df = locations(9000, 300, gym, "gym")

In [153]:
# General entertainment venues
ent_df = locations(9000, 300, general_entertainment, "general_entertainment")

In [146]:
pharmacy_df = locations(9000, 300, pharmacy, "medical_centre")

In [159]:
supermarkets_df = locations(9000, 300, supermarket, "supermarket")

In [166]:
clothes_stores = locations(9000, 300, clothing_store, "clothing_store")

In [170]:
# Stack every category frame into a single places frame.
# The gym frame is created as `gym_df`, so that is the name referenced here.
conjunto = pd.concat([ent_df, gym_df, hospitals_df, pharmacy_df, supermarkets_df, clothes_stores])

In [None]:
conjunto = geo_frame(conjunto)

In [179]:
insert_object(conjunto, p)

'Data succesfully added'

In [256]:
# NOTE(review): the recorded output shows this call failing with a TypeError:
# src.insertdb.insert_object also requires a 'genres' argument, so it does not
# take the same arguments as the insert_object defined in this notebook.
ins.insert_object(conjunto, p)

TypeError: insert_object() missing 2 required positional arguments: 'genres' and 'collection'

In [182]:
conjunto.shape

(543, 5)

### Trying geo queries with different collections

In [237]:
def lugar(coords, max_distance=1000):
    '''
    Return the documents of the places collection (`p`) found near a point.

    Parameters
    ----------
    coords : list
        Coordinates of the GeoJSON point used as the centre of the search.
        NOTE(review): MongoDB reads GeoJSON positions as [longitude, latitude];
        make sure callers use the same order as the stored `geometry` field.
    max_distance : int or float, optional
        Value sent as `$maxDistance` (metres for GeoJSON points); defaults to
        1000, the value previously hard-coded in the query.

    Returns
    -------
    list
        Every document yielded by the `$near` query on the `geometry` field.
    '''
    query = {"geometry": {"$near": {
        "$geometry": {"type": "Point", "coordinates": coords},
        "$maxDistance": max_distance,
    }}}
    return list(p.find(query))
    

In [238]:
# NOTE(review): MongoDB reads GeoJSON positions as [longitude, latitude]; this literal
# looks like [latitude, longitude] — make sure the order matches the one used when the
# `geometry` field of the stored documents was built.
coords = [40.455228,-3.784494]
prueba = lugar(coords)

In [221]:
coordinates = [list(i["coordinates"]) for i in data_vivienda.geometry] 

In [228]:
for i in coordinates:
    print(type(i))
    break

<class 'list'>


In [240]:
prueba_2 = list(map(lugar, coordinates))

In [244]:
len(prueba_2)

8480