In [1]:
from pymongo import MongoClient
import json
from geojson import Polygon
import pandas as pd
import numpy as np

In [2]:
# Open a MongoDB client with pymongo's default connection settings and select
# the `companies` database.
client = MongoClient()
db = client["companies"]

# Fetch every document of the `geo` collection and materialise the cursor into
# a DataFrame (the cursor itself is consumed by this step).
geo = db["geo"].find({})
df = pd.DataFrame(geo)

A continuación se realiza una petición basada en la cercanía de los registros. Para cada empresa se obtiene el número de empresas que tiene cerca (en un radio de 10 km), la dispersión en el número de empleados entre las empresas cercanas (calculada como desviación estándar, aunque en adelante se la denomina «varianza») y las categorías de cada una de las empresas cercanas. La varianza en el número de empleados nos asegura que exista una mayor variedad de tamaños de empresas (startups vs. grandes empresas). Las categorías se guardarán como una lista con la cual podremos comprobar si entre las empresas cercanas existen empresas de games_video.

In [3]:
def nearLoc(lon, lat, max_distance=10000):
    """Summarise the companies stored within ``max_distance`` metres of a point.

    Runs a ``$near`` query on ``db.geo`` and aggregates the matching documents.

    NOTE(review): the parameter names are misleading. The query point is built
    as ``[lat, lon]`` and GeoJSON points are ``[longitude, latitude]``, so the
    *longitude* has to be passed as ``lat`` and the *latitude* as ``lon``.
    The notebook calls this function as ``nearLoc(latitude, longitude)``, which
    produces the correct point; the behaviour is kept as-is so that call keeps
    working.

    Parameters
    ----------
    lon : float
        Second coordinate of the query point (in practice, the latitude).
    lat : float
        First coordinate of the query point (in practice, the longitude).
    max_distance : float, optional
        Search radius in metres passed as ``$maxDistance``. Defaults to 10000.

    Returns
    -------
    list
        ``[number_of_companies, size_variance, categories]`` where
        ``number_of_companies`` is the number of matching documents,
        ``size_variance`` is the result of ``Series.std()`` on their
        ``number_of_employees`` (a standard deviation, despite the name) and
        ``categories`` is the list of their ``category_code`` values.
        When nothing matches, ``[0, nan, []]`` is returned.
    """
    query = {
        "geo": {
            "$near": {
                "$geometry": {
                    "type": "Point",
                    # See the docstring: `lat` carries the first coordinate.
                    "coordinates": [lat, lon],
                },
                "$maxDistance": max_distance,  # In meters
            }
        }
    }
    x = pd.DataFrame(list(db.geo.find(query)))

    # An empty result yields a frame without columns; indexing it by name
    # would raise KeyError, so report "nothing nearby" explicitly.
    if x.empty:
        return [0, float("nan"), []]

    number_of_companies = x.shape[0]
    size_variance = x['number_of_employees'].std()
    categories = list(x['category_code'])

    return [number_of_companies, size_variance, categories]

# One summary per company. The latitude is passed first and the longitude
# second, which is the positional order nearLoc needs to build its query point.
res = [
    nearLoc(latitude, longitude)
    for latitude, longitude in zip(df['latitude'], df['longitude'])
]

Se crean las nuevas columnas con la información sobre el número de empresas cercanas, la varianza en el número de empleados y las listas de categorías de las empresas cercanas.

In [4]:
# Each entry of `res` is [number_of_companies, size_variance, categories];
# spread the three positions into their own columns.
df['near_companies'] = [summary[0] for summary in res]
df['size_variance'] = [summary[1] for summary in res]
df['categories'] = [summary[2] for summary in res]
df.head()

Unnamed: 0,_id,category_code,city,founded_year,geo,latitude,longitude,name,number_of_employees,near_companies,size_variance,categories
0,5cd52f5440b61af8a9809714,web,Seattle,2005,"{'type': 'Point', 'coordinates': [-122.333253,...",47.603122,-122.333253,Wetpaint,47.0,113,45.232297,"[web, software, software, network_hosting, web..."
1,5cd52f5440b61af8a9809715,web,New York,2005,"{'type': 'Point', 'coordinates': [-73.9964312,...",40.723731,-73.996431,Wetpaint,47.0,356,4591.503346,"[web, other, enterprise, web, public_relations..."
2,5cd52f5440b61af8a9809716,web,Menlo Park,2003,"{'type': 'Point', 'coordinates': [-122.169472,...",37.48413,-122.169472,Gizmoz,,175,526.495217,"[web, games_video, enterprise, biotech, mobile..."
3,5cd52f5440b61af8a9809717,mobile,Los Angeles,2005,"{'type': 'Point', 'coordinates': [-118.446596,...",34.057498,-118.446596,Helio,,94,44.114945,"[mobile, web, web, ecommerce, advertising, web..."
4,5cd52f5440b61af8a9809718,social,New York City,2007,"{'type': 'Point', 'coordinates': [-73.985506, ...",40.757929,-73.985506,MeetMoi,15.0,352,4610.878102,"[social, network_hosting, games_video, web, ne..."


Se ordenan los registros según el número de empresas cercanas (y la varianza en el número de empleados como valor secundario) y se llama a esto model1. 

In [5]:
# Rank by number of nearby companies first, breaking ties by size_variance;
# both keys in descending order.
model1 = df.sort_values(
    ['near_companies', 'size_variance'],
    ascending=[False, False],
)
model1.head()

Unnamed: 0,_id,category_code,city,founded_year,geo,latitude,longitude,name,number_of_employees,near_companies,size_variance,categories
1755,5cd52f5440b61af8a9809def,games_video,San Francisco,2008,"{'type': 'Point', 'coordinates': [-122.371203,...",37.804619,-122.371203,Brightstorm,,393,111.391583,"[games_video, network_hosting, hardware, web, ..."
1490,5cd52f5440b61af8a9809ce6,web,San Francisco,2002,"{'type': 'Point', 'coordinates': [-122.3693356...",37.825523,-122.369336,Blogcritics,,391,110.793936,"[web, web, games_video, web, cleantech, games_..."
1643,5cd52f5440b61af8a9809d7f,web,San Francisco,2002,"{'type': 'Point', 'coordinates': [-122.3693356...",37.825523,-122.369336,Blogcritics,,391,110.793936,"[web, web, games_video, web, cleantech, games_..."
15,5cd52f5440b61af8a9809723,games_video,San Francisco,2006,"{'type': 'Point', 'coordinates': [-122.409173,...",37.788482,-122.409173,Kyte,40.0,387,112.380609,"[games_video, web, enterprise, web, enterprise..."
18,5cd52f5440b61af8a9809726,web,San Francisco,2005,"{'type': 'Point', 'coordinates': [-122.402567,...",37.793148,-122.402567,Wesabe,,387,112.380609,"[web, search, enterprise, advertising, web, co..."


Se ordenan los registros según la varianza en el número de empleados (y el número de empresas cercanas como valor secundario) y se llama a esto model2. 

In [6]:
# Rank by size_variance first, breaking ties by number of nearby companies;
# both keys in descending order.
model2 = df.sort_values(
    ['size_variance', 'near_companies'],
    ascending=[False, False],
)
model2.head()

Unnamed: 0,_id,category_code,city,founded_year,geo,latitude,longitude,name,number_of_employees,near_companies,size_variance,categories
2496,5cd52f5440b61af8a980a0d4,ecommerce,Boston,2009,"{'type': 'Point', 'coordinates': [-71.0213788,...",42.398366,-71.021379,Action Sports Branding,3.0,88,14280.528585,"[ecommerce, network_hosting, web, ecommerce, w..."
1745,5cd52f5440b61af8a9809de5,search,Boston,2005,"{'type': 'Point', 'coordinates': [-71.127226, ...",42.415663,-71.127226,Brick Marketing,10.0,91,14137.059159,"[search, web, enterprise, network_hosting, sea..."
681,5cd52f5440b61af8a98099bd,web,Boston,2008,"{'type': 'Point', 'coordinates': [-71.13129, 4...",42.322072,-71.13129,InformedMD,2.0,92,13997.794843,"[web, web, mobile, ecommerce, mobile, biotech,..."
704,5cd52f5440b61af8a98099d4,web,Boston,2007,"{'type': 'Point', 'coordinates': [-71.13129, 4...",42.322072,-71.13129,TOKiBiz,1.0,92,13997.794843,"[web, web, mobile, ecommerce, mobile, biotech,..."
91,5cd52f5440b61af8a980976f,advertising,Boston,2005,"{'type': 'Point', 'coordinates': [-71.051931, ...",42.340889,-71.051931,ScanScout,40.0,93,13862.640515,"[advertising, advertising, software, ecommerce..."


A continuación se cogerán los primeros 400 registros del modelo 1 y los primeros 400 registros del modelo 2 y se observará la intersección, es decir, aquellos valores que aparezcan como altos en cuanto al número de empresas cercanas y también en cuanto a la varianza en el número de empleados (que tengan variedad de empresas grandes y pequeñas). Las empresas que aparezcan en la base llamada "best_choices" marcarán la localización (o localizaciones) ideal(es) para colocar nuestra empresa. Además, se observa si en estos puntos hay empresas de games_video cercanas, encontrándose que sí las hay para todas.

In [7]:
# Number of top-ranked rows taken from each model before intersecting them.
TOP_N = 400

# Build the model2 id set once: the original rebuilt the 400-element list on
# every iteration and searched it linearly.
top_model2_ids = set(model2['_id'].iloc[:TOP_N])

# Ids present in the top TOP_N of both rankings, kept in model1 order.
lst = [
    company_id
    for company_id in model1['_id'].iloc[:TOP_N]
    if company_id in top_model2_ids
]
best_choices = df.loc[df['_id'].isin(lst)]

# Report how many rows were selected, then the position of every selected row
# whose nearby categories include 'games_video'.
print(best_choices.shape[0])
for position, nearby_categories in enumerate(best_choices['categories']):
    if 'games_video' in nearby_categories:
        print(position, 'True')


14
0 True
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
9 True
10 True
11 True
12 True
13 True


In [8]:
# Display the rows selected as best_choices.
best_choices

Unnamed: 0,_id,category_code,city,founded_year,geo,latitude,longitude,name,number_of_employees,near_companies,size_variance,categories
22,5cd52f5440b61af8a980972a,games_video,New York,2004,"{'type': 'Point', 'coordinates': [-73.99873, 4...",40.722655,-73.99873,Pando Networks,23.0,357,4581.913489,"[games_video, public_relations, other, web, mo..."
448,5cd52f5440b61af8a98098d4,web,New York,2006,"{'type': 'Point', 'coordinates': [-74.005897, ...",40.715145,-74.005897,Convos,,357,4581.913489,"[web, other, software, ecommerce, web, games_v..."
568,5cd52f5440b61af8a980994c,games_video,New York,2001,"{'type': 'Point', 'coordinates': [-74.002662, ...",40.717248,-74.002662,Boonty,150.0,357,4581.913489,"[games_video, ecommerce, web, software, games_..."
602,5cd52f5440b61af8a980996e,web,New York,2007,"{'type': 'Point', 'coordinates': [-74.00118, 4...",40.718871,-74.00118,Thrive,15.0,357,4581.913489,"[web, software, advertising, games_video, ente..."
942,5cd52f5440b61af8a9809ac2,web,New York,2005,"{'type': 'Point', 'coordinates': [-74.0013669,...",40.723839,-74.001367,GoMobo,,357,4581.913489,"[web, mobile, games_video, web, web, web, web,..."
1554,5cd52f5440b61af8a9809d26,web,New York,2008,"{'type': 'Point', 'coordinates': [-74.004441, ...",40.721583,-74.004441,photothread,,357,4581.913489,"[web, web, advertising, web, ecommerce, softwa..."
1706,5cd52f5440b61af8a9809dbe,web,New York,2008,"{'type': 'Point', 'coordinates': [-74.004441, ...",40.721583,-74.004441,photothread,,357,4581.913489,"[web, web, advertising, web, ecommerce, softwa..."
1749,5cd52f5440b61af8a9809de9,public_relations,New York,2007,"{'type': 'Point', 'coordinates': [-73.9973, 40...",40.722118,-73.9973,GroundRez,20.0,357,4581.913489,"[public_relations, games_video, other, web, en..."
1868,5cd52f5440b61af8a9809e60,software,New York,2006,"{'type': 'Point', 'coordinates': [-74.001169, ...",40.718888,-74.001169,ReverbNation,85.0,357,4581.913489,"[software, web, advertising, games_video, ente..."
2012,5cd52f5440b61af8a9809ef0,games_video,New York,2007,"{'type': 'Point', 'coordinates': [-74.005398, ...",40.716682,-74.005398,Pilotlite,5.0,357,4581.913489,"[games_video, web, other, ecommerce, games_vid..."


Se guarda best_choices para representar en tableau nuestro área de interés.

In [9]:
# Write best_choices to best_choices.csv in the current working directory.
best_choices.to_csv('./best_choices.csv')

Para encontrar el punto central entre las 14 empresas obtenidas, habiendo visto en tableau que los puntos están muy próximos, se puede tratar la tierra como si fuese plana en este punto, y hallar el punto central de la siguiente manera: 

In [20]:
# Coordinates of the selected companies, as plain lists and as (lat, long) pairs.
lat = list(best_choices['latitude'])
long = list(best_choices['longitude'])
ll = list(zip(lat, long))

# Flat-earth approximation: the centre is the arithmetic mean of each coordinate.
final_lat = sum(lat) / len(lat)
final_long = sum(long) / len(long)
print('The best location to set my company is {}, {}'.format(final_lat, final_long))


The best location to set my company is 40.71942090714286, -74.00237357857142


La localización de las empresas pertenecientes a best_choices, las cuales determinan el punto central en el que se decidirá poner la empresa, puede verse en tableau. 
URL: https://public.tableau.com/profile/patricia.rick#!/vizhome/Best_choices/Dashboard1?publish=yes