# Proyecto Tableau - Semana 4 - Mongodb y Folium

### Definción del cliente:

#### Negocios
1. Estar en la zona en donde haya menos empresas estancadas (Deadpooled)
2. Estar en la zona con mayores empresas del tipo "web", "social", "hardware", "software"
3. Reducir la búsqueda a USA
4. Ponderar a las empresas anteriores por la cantidad de empleados
5. Encontrar empresa ancla con empresas satelite en un radio de 10km


In [3]:
import pandas as pd
import numpy as np
import pymongo as pm
import folium
import json

Filtrar las caracteristicas de Negocio de la base de datos companies para tener una base de análisis. De aqui elegir al menos 3 ciudades con potencial. Mostrar los mapas de calor de todo el pais 

In [4]:
#Estableciendo instancia de cliente al servidor local
client=pm.MongoClient('mongodb://localhost:27017')
#Creando referencia a la base de datos "companies" a través de la variable db
#Nota: companies en esta instancia se refiere a la base da datos
#no a la colección (tabla)
db=client.companies


In [5]:
#Ingresando la primera búsqueda de empresas que no estén estancadas 
#y poniendolas en un data frame

df_ndp=pd.DataFrame(db.companies.find(
                                      {'$and':
                                       [
                                        {'deadpooled_year':None},
                                        {'offices':{'$exists':True}},
                                        {'offices.0.latitude':{'$gt':-180}},
                                        {'offices.0.country_code':'USA'},
                                        {'$or':[
                                                {'category_code':'web'},
                                                {'category_code':'social'},
                                                {'category_code':'hardware'},
                                                {'category_code':'software'}
                                               ]
                                        }
                                       ]
                                      },
                                      {'name':1,'_id':0,'number_of_employees':1,
                                       'offices':1}
                                     ).sort('number_of_employees',-1)
                   )

In [6]:
df_ndp=df_ndp.dropna()
df_ndp=df_ndp.reset_index()
df_ndp=df_ndp.drop(columns='index')
len(df_ndp)

1523

In [7]:
from pandas.io.json import json_normalize
def getnprods(prods):#Recibe campo en json de los productos e.g. df_ndp.products[i]
    return len(json_normalize(prods))

In [8]:
def getcity(city):#Recibe campo en json offices e.g. df_ndp.offices[i]
     return json_normalize(city).city[0]

In [9]:
def getgps(regloc):#Recibe campo en json offices e.g. df_ndp.offices[i]
    lat=json_normalize(regloc).latitude[0]
    lon=json_normalize(regloc).longitude[0]
    return [lat,lon]

In [10]:
def getsizene(nemp):#Recibe campo en json de n de empleados e.g. df_ndp.number_of_employees[]
    if nemp > 100000:
        return 14
    elif nemp >50000:
        return 12
    elif nemp >10000:
        return 10
    elif nemp >1000:
        return 8
    elif nemp >500:
        return 6
    elif nemp >100:
        return 4
    else:
        return 1
    

In [104]:
from math import sin, cos, sqrt, atan2, radians
def pond_distKm(p1,p2,r): #Recibe dos puntos en dos listas y el rango de validez en km
    lat1 = p1[0]
    lon1 = p1[1]
    lat2 = p2[0]
    lon2 = p2[1]
    
    #R = 6373.0
    #dlon = lon2 - lon1
    #dlat = lat2 - lat1

    #a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    #c = 2 * atan2(sqrt(a), sqrt(1 - a))

    #d = c*R
    
    d=sqrt((lat2-lat1)**2+(lon2-lon1)**2)*111.1
    
    #return d
    if d>r:
        return 0
    else:
        return 1

In [113]:
from tqdm import tqdm_notebook as tqdm
import multiprocessing as mp

In [138]:
def pond(df):
    pond_lugares=[]
    for i in tqdm(range(len(df)),'Main Loop'):
        pond_lugares.append([])
        for j in range(len(df)):
            if getcity(df.offices[i]) == getcity(df.offices[j]) and i!=j:
                pond_lugares[i].append(pond_distKm(getgps(df.offices[i]),
                                       getgps(df.offices[j]),10)\
                                      *getsizene(df.number_of_employees[i])\
                                      *getsizene(df.number_of_employees[j]))
            else:
                pond_lugares[i].append(0)
    return pond_lugares           

In [191]:
%time
p=[]
p=pond(df_ndp)
#pool=mp.Pool(4)
#pond_lugares=pool.map(pond, df_ndp[:100])

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 23.1 µs


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, description='Main Loop', max=1523.0, style=ProgressStyle(description_w…




In [192]:
ranklugar=[]
for i in range(len(p)):
    ranklugar.append(sum(p[i]))
ranklugar=pd.DataFrame(ranklugar)

In [193]:
len(ranklugar)

1523

In [194]:
dftemp=df_ndp[:]

In [195]:
dftemp['rank']=ranklugar

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [196]:
dftemp=dftemp.sort_values(by=['rank'], ascending=False)
dftemp.reset_index
dftemp.head(10)

Unnamed: 0,name,number_of_employees,offices,rank
43,ThoughtWorks Studios,1100.0,"[{'description': 'TW Studios SF', 'address1': ...",1392
38,Twitter,1300.0,"[{'description': '', 'address1': '1355 Market ...",1384
45,Riverbed Technology,1000.0,"[{'description': 'Headquarters', 'address1': '...",1056
66,OpenTable,550.0,"[{'description': '', 'address1': 'One Montgome...",1050
16,AOL,8000.0,"[{'description': 'HQ', 'address1': '770 Broadw...",824
173,PowerReviews,120.0,"[{'description': 'US Office', 'address1': '22 ...",712
116,Eventbrite,200.0,"[{'description': 'HQ', 'address1': '651 Branna...",712
68,Splunk,500.0,"[{'description': 'Headquarters', 'address1': '...",712
132,Lyft,180.0,"[{'description': 'Lyft HQ', 'address1': '', 'a...",708
128,SquareTrade,190.0,"[{'description': None, 'address1': '575 Market...",708


In [197]:
mapa=folium.Map(tiles='openstreetmap',zoom_start=7,zoom_max = 10)
for i in range(len(dftemp[:10])):
    folium.CircleMarker(getgps(dftemp.offices[i]), 
                        radius=getsizene(df_ndp.number_of_employees[i]),
                        icon=folium.Icon()).add_to(mapa)    

In [198]:
mapa

Complementar la busqueda con los criterios de Empleados complementadndo con las bases de datos de estaciones del metro de las ciudades con un potencial adecuado. Mostrar los mapas de calor de las 3 ciudades o zonas