In [1]:
import pymongo
from pymongo import MongoClient
from pandas.io.json import json_normalize
import pandas as pd
import os


In [2]:
# Connect to the MongoDB server on localhost:27017 and select the
# 'companies' database that the queries below read from.
client = MongoClient(host='localhost', port=27017)
db = client['companies']

In [3]:
# List the distinct category codes of the companies in the collection
categorias = db['companies'].distinct('category_code')
print(list(categorias))

['enterprise', 'web', 'software', 'news', 'network_hosting', 'games_video', 'mobile', 'music', 'social', 'search', 'messaging', 'advertising', 'photo_video', 'security', 'finance', 'ecommerce', 'travel', 'hardware', 'public_relations', 'other', 'real_estate', 'semiconductor', 'analytics', 'health', 'legal', 'sports', 'biotech', 'cleantech', 'education', 'consulting', 'transportation', None, 'hospitality', 'fashion', 'nonprofit', 'nanotech', 'automotive', 'design', 'manufacturing', 'government', 'local', 'medical']


In [4]:
# Category codes used to filter the companies
categorias = ['web', 'software', 'games_video', 'social', 'design', 'search']

# Projection: every listed field is flagged with 1 (include in the result)
campos = dict.fromkeys(
    (
        'name',
        'number_of_employees',
        'category_code',
        'acquisition',
        'founded_year',
        'funding_rounds',
        'offices.latitude',
        'offices.longitude',
        'offices.city',
        'offices.country_code',
    ),
    1,
)

def buscar_oficinas(categorias, campos, anyo, mm, num_emp):
    '''
    Query db.companies for the companies that satisfy every condition below.

    categorias: category codes to filter by (matched with $in)
    campos: projection with the fields to return
    anyo: minimum founding year (founded_year >= anyo)
    mm: comparison operator applied to the number of employees ($gt, $lt ...)
    num_emp: number of employees to compare against

    Only documents whose 'offices' field exists and is not an empty array,
    and whose 'deadpooled_year' matches None, are selected.
    Returns whatever db.companies.find() returns for that filter.
    '''
    condiciones = [
        {'category_code': {'$in': categorias}},
        {'number_of_employees': {mm: num_emp}},
        {'offices': {'$exists': True, '$not': {'$size': 0}}},
        {'deadpooled_year': None},
        {'founded_year': {'$gte': anyo}},
    ]
    return db.companies.find({'$and': condiciones}, campos)



In [5]:
# One row per office of the companies founded in 1990 or later with more
# than 250 employees; company-level fields are repeated on every office row.
big_office_data = json_normalize(
    buscar_oficinas(categorias=categorias, campos=campos, anyo=1990, mm='$gt', num_emp=250),
    record_path='offices',
    meta=['name', 'category_code', 'founded_year', 'number_of_employees'],
)
big_office_data.head()

Unnamed: 0,city,country_code,latitude,longitude,name,category_code,founded_year,number_of_employees
0,Menlo Park,USA,37.41605,-122.151801,Facebook,social,2004,5299
1,Dublin,IRL,53.344104,-6.267494,Facebook,social,2004,5299
2,New York,USA,40.755716,-73.979247,Facebook,social,2004,5299
3,San Francisco,USA,37.776805,-122.416924,Twitter,social,2006,1300
4,San Jose,USA,37.295005,-121.930035,eBay,web,1995,15000


In [6]:
big_office_data.shape

(354, 8)

In [7]:
# One row per office of the companies founded in 2010 or later with fewer
# than 100 employees; company-level fields are repeated on every office row.
startup_office_data = json_normalize(
    buscar_oficinas(categorias=categorias, campos=campos, anyo=2010, mm='$lt', num_emp=100),
    record_path='offices',
    meta=['name', 'category_code', 'founded_year', 'number_of_employees'],
)
startup_office_data.head()

Unnamed: 0,city,country_code,latitude,longitude,name,category_code,founded_year,number_of_employees
0,New York,USA,40.757929,-73.985506,PeekYou,search,2012,20
1,Berlin,DEU,52.501345,13.410907,headr,web,2012,8
2,Hannover,DEU,,,headr,web,2012,8
3,San Mateo,USA,37.566879,-122.323895,Fixya,web,2013,30
4,Palo Alto,USA,,,Simplicant,software,2012,10


In [8]:
# Stack startup and big-company offices into one frame.
# reset_index() keeps the previous row labels in an extra 'index' column;
# dropna() then discards every row with a missing value in any column.
data = (
    pd.concat([startup_office_data, big_office_data])
    .reset_index()
    .dropna()
)
data.shape

(183, 9)

In [9]:
# [longitude, latitude] pair per office, the order later used by the
# geospatial index and the $near query
data['coordenadas'] = list(map(list, zip(data['longitude'], data['latitude'])))
data.head()

Unnamed: 0,index,city,country_code,latitude,longitude,name,category_code,founded_year,number_of_employees,coordenadas
0,0,New York,USA,40.757929,-73.985506,PeekYou,search,2012,20,"[-73.985506, 40.757929]"
1,1,Berlin,DEU,52.501345,13.410907,headr,web,2012,8,"[13.4109071, 52.5013449]"
3,3,San Mateo,USA,37.566879,-122.323895,Fixya,web,2013,30,"[-122.323895, 37.566879]"
5,5,Santa Clara,USA,37.760524,-122.387799,Fuzz,games_video,2011,6,"[-122.387799, 37.760524]"
6,6,Van Nuys,USA,40.650291,-74.294395,CollegeConvo,web,2010,2,"[-74.294395, 40.650291]"


In [10]:
# Write the offices as one JSON record per line; create_db() later imports
# this file with mongoimport.
data.to_json(
    'oficinas.json',
    orient="records",
    lines=True,
)

In [11]:
db_ofi = client['oficinas']


def create_db(archivo = 'oficinas'):
    '''
    Import <archivo>.json into the 'oficinas' collection of the 'oficinas'
    database (dropping the collection first) and create a 2dsphere index
    on the 'coordenadas' field.

    archivo: path of the JSON file to import, without the '.json' extension

    Raises ValueError when the file does not exist and RuntimeError when
    mongoimport cannot be started or exits with a non-zero status.
    '''
    # Local import keeps this block self-contained.
    import subprocess

    ruta = archivo + '.json'
    if not os.path.isfile(ruta):
        raise ValueError('Error archivo no encontrado')

    # Argument list instead of a concatenated shell string: the file name is
    # passed verbatim, so spaces or shell metacharacters cannot alter the command.
    orden = ['mongoimport', '--db', 'oficinas', '--collection', 'oficinas',
             '--drop', '--file', ruta]
    try:
        subprocess.run(orden, check=True)
    except (OSError, subprocess.CalledProcessError) as error:
        # os.system() discarded the exit status, so success was reported even
        # when nothing had been imported; fail loudly instead.
        raise RuntimeError('Error al ejecutar mongoimport') from error

    db_ofi.oficinas.create_index([('coordenadas', pymongo.GEOSPHERE)])
    print('Base de datos y collección creadas')


create_db()

Base de datos y collección creadas


In [12]:
def buscar_cercanas(lng, lat, min_dist_m = 0, max_dist_m = 3000):
    '''
    Query db_ofi.oficinas for the offices near a point.

    lng, lat: longitude and latitude of the reference point
    min_dist_m: value passed as $minDistance (default 0)
    max_dist_m: value passed as $maxDistance (default 3000)

    Returns whatever find() returns, projecting latitude, longitude, city,
    name, category_code and number_of_employees and excluding _id.
    '''
    punto = {"type": "Point", "coordinates": [lng, lat]}
    filtro = {
        "coordenadas": {
            "$near": {
                "$geometry": punto,
                '$minDistance': min_dist_m,
                "$maxDistance": max_dist_m,
            }
        }
    }
    proyeccion = {
        'latitude': 1,
        'longitude': 1,
        'city': 1,
        'name': 1,
        'category_code': 1,
        'number_of_employees': 1,
        '_id': 0,
    }
    return db_ofi.oficinas.find(filtro, proyeccion)



#query = buscar_cercanas(-73.990869, 40.748368, 0, 10000)


# One buscar_cercanas() result per office in `data` (default distance bounds)
cercanos = [
    buscar_cercanas(lng, lat)
    for lng, lat in zip(data['longitude'], data['latitude'])
]

In [29]:
# For every office, materialise the list of offices returned by buscar_cercanas()
data['cercanas'] = [
    list(buscar_cercanas(lng, lat))
    for lng, lat in zip(data['longitude'], data['latitude'])
]

def ratios_sb(oficinas, umbral_big = 250, sin_ratio = 100):
    '''
    Ratio of big-company offices to startup offices in a group of offices.

    oficinas: iterable of dicts, each with a 'number_of_employees' key
    umbral_big: an office counts as big when its number of employees is
        strictly greater than this value; otherwise it counts as a startup
    sin_ratio: value returned when the ratio is not defined, i.e. when there
        are no big offices or no startup offices (including empty input)

    Returns big / startup, or sin_ratio when either group is empty.
    '''
    # Single pass, so one-shot iterables are accepted as well.
    es_big = [office['number_of_employees'] > umbral_big for office in oficinas]
    big = sum(es_big)
    startup = len(es_big) - big

    if big == 0 or startup == 0:
        return sin_ratio
    return big / startup

# Number of nearby offices and big/startup ratio for every office
data['num_offices'] = data['cercanas'].apply(len)
data['ratio'] = data['cercanas'].apply(ratios_sb)

# Candidates: offices with more than 5 offices nearby. No row cap is applied:
# a fixed head(N) would silently drop candidates whenever the data grows.
datos_finales = data[data['num_offices'] > 5]
display(data.head(200))

# Row label of the candidate with the lowest ratio, computed once and reused
mejor = datos_finales['ratio'].idxmin()
print(mejor)

# idxmin() returns an index label, so the row is looked up by label with .loc
final = data.loc[mejor, 'cercanas']
df_final = pd.DataFrame(final)
df_final.head(200)

Unnamed: 0,index,city,country_code,latitude,longitude,name,category_code,founded_year,number_of_employees,coordenadas,cercanas,num_offices,ratio
0,0,New York,USA,40.757929,-73.985506,PeekYou,search,2012,20,"[-73.985506, 40.757929]","[{'city': 'New York', 'latitude': 40.757929, '...",6,1.000000
1,1,Berlin,DEU,52.501345,13.410907,headr,web,2012,8,"[13.4109071, 52.5013449]","[{'city': 'Berlin', 'latitude': 52.5013449, 'l...",2,1.000000
3,3,San Mateo,USA,37.566879,-122.323895,Fixya,web,2013,30,"[-122.323895, 37.566879]","[{'city': 'San Mateo', 'latitude': 37.566879, ...",2,1.000000
5,5,Santa Clara,USA,37.760524,-122.387799,Fuzz,games_video,2011,6,"[-122.387799, 37.760524]","[{'city': 'Santa Clara', 'latitude': 37.760524...",5,4.000000
6,6,Van Nuys,USA,40.650291,-74.294395,CollegeConvo,web,2010,2,"[-74.294395, 40.650291]","[{'city': 'Van Nuys', 'latitude': 40.650291, '...",2,100.000000
7,7,Bethesda,USA,38.989124,-77.026676,Carfeine,software,2012,5,"[-77.026676, 38.989124]","[{'city': 'Bethesda', 'latitude': 38.989124, '...",1,100.000000
8,8,Palo Alto,USA,37.444098,-122.161287,Ziippi,web,2011,6,"[-122.1612868, 37.4440981]","[{'city': 'Palo Alto', 'latitude': 37.4440981,...",1,100.000000
9,9,Santa Monica,USA,53.544711,-113.515769,Titan Gaming,games_video,2010,18,"[-113.515769, 53.544711]","[{'city': 'Santa Monica', 'latitude': 53.54471...",1,100.000000
10,10,Vancouver,CAN,49.263588,-123.138565,Pixelmatic,games_video,2011,10,"[-123.138565, 49.263588]","[{'city': 'Vancouver', 'latitude': 49.263588, ...",2,1.000000
12,12,Chicago,USA,41.857204,-87.623923,JumpForward,software,2010,30,"[-87.623923, 41.857204]","[{'city': 'Chicago', 'latitude': 41.857204, 'l...",2,1.000000


0


Unnamed: 0,category_code,city,latitude,longitude,name,number_of_employees
0,search,New York,40.757929,-73.985506,PeekYou,20
1,social,New York,40.755716,-73.979247,Facebook,5299
2,software,New York,40.764577,-73.979901,Unison Technologies,30
3,web,New York,40.744618,-73.987764,Yipit,23
4,search,New York,40.74222,-74.004489,Google,28000
5,web,New York,40.741888,-74.004747,MLB Advanced Media,600


In [None]:
import folium

# World-level map on which every office in `data` is drawn
mapa = folium.Map(location=[15, 0], tiles="openstreetmap", zoom_start=3)

# One marker per office: blue for big companies, green for startups.
# "Big" means more than 250 employees, the same strict cut-off used by
# ratios_sb() and by the '$gt' 250 query that selected the big companies.
for lat, lng, nombre, empleados in zip(data['latitude'],
                                       data['longitude'],
                                       data['name'],
                                       data['number_of_employees']):
    color = 'blue' if empleados > 250 else 'green'
    folium.Marker([lat, lng],
                  popup=nombre,
                  icon=folium.Icon(color=color)).add_to(mapa)
        
        
        
# HTML/CSS for a fixed-position legend in the bottom-right corner:
# a green marker icon labelled "Startup" and a blue one labelled "Big Company".
leyenda = '''
    <style type="text/css">
        #leyenda{
            position: fixed;
            z-index: 9999;
            font-size: 1em;
            background-color: #ffffff;
            color: #333333;
            bottom: 10px;
            right: 10px;
            padding: 10px;
            border: 1px solid #333333;
        }
        
        #leyenda .verde{
            color: #71af26;
        }
        
        #leyenda .azul{
            color: #36a5d6;
        }
    </style>
    
     <div id="leyenda">
        <i class="fa fa-map-marker fa-2x verde"></i> Startup <br>
        <i class="fa fa-map-marker fa-2x azul"></i> Big Company
      </div>
     '''
# Add the legend markup to the map's root HTML, then write the map to disk.
mapa.get_root().html.add_child(folium.Element(leyenda))
mapa.save('map-oficinas.html')


In [None]:
#from IPython.display import HTML
#HTML(filename='map-oficinas.html')