In [1]:
from pymongo import MongoClient
import pandas as pd
import json
from pandas.io.json import json_normalize
import re
from pymongo import MongoClient, GEO2D

In [2]:
# Connect to the MongoDB server on localhost:27017 and select the 'companies' database.
client = MongoClient(host='localhost', port=27017)
db = client['companies']

In [3]:
# Sort the companies by founding year in descending order; the most recent year is 2013.
# db.companies.find_one(sort=[('founded_year', -1)])

In [4]:
# STEP ONE: DATA FILTER
# Companies with 250+ employees, founded in 2009 or later, whose office
# sub-documents carry latitude and longitude fields. Only those two fields
# (plus _id) are projected.
# Note: `$exists` also matches fields that are present but null, which is why
# offices with None coordinates show up in the preview.
big_company_conditions = [
    {'number_of_employees': {'$gte': 250}},
    {'founded_year': {'$gte': 2009}},
    {'offices.latitude': {'$exists': True}},
    {'offices.longitude': {'$exists': True}},
]
coordinate_fields = ['offices.latitude', 'offices.longitude']

filtered = list(db.companies.find({'$and': big_company_conditions}, coordinate_fields))
filtered[:5]

[{'_id': ObjectId('52cdef7c4bab8bd675298608'),
  'offices': [{'latitude': 34.052187, 'longitude': -118.243425}]},
 {'_id': ObjectId('52cdef7e4bab8bd67529b2f3'),
  'offices': [{'latitude': None, 'longitude': None}]},
 {'_id': ObjectId('52cdef7e4bab8bd67529ba36'),
  'offices': [{'latitude': None, 'longitude': None},
   {'latitude': None, 'longitude': None},
   {'latitude': None, 'longitude': None}]}]

In [5]:
# NUMBER OF RECORDS: companies with at least one employee, founded in 2009 or later.
# NOTE(review): these thresholds differ from the 250-employee filter in the
# previous cell and from the strict `$gt` query used to build the DataFrame in
# the next cell; confirm which criterion this count is meant to describe.
count_conditions = [
    {'number_of_employees': {'$gte': 1}},
    {'founded_year': {'$gte': 2009}},
]
db.companies.count_documents({'$and': count_conditions})

630

In [6]:
# Companies with more than one employee, founded after 2009 (both bounds are
# strict), whose office sub-documents carry latitude and longitude fields.
office_conditions = [
    {'number_of_employees': {'$gt': 1}},
    {'founded_year': {'$gt': 2009}},
    {'offices.latitude': {'$exists': True}},
    {'offices.longitude': {'$exists': True}},
]
# A pymongo cursor can only be iterated once. Materialise it into a list so the
# cell that normalises `query_data` can be re-run without silently seeing an
# exhausted cursor.
query_data = list(db.companies.find({'$and': office_conditions}))

In [7]:
# Flatten the query result to one row per office, carrying the listed
# company-level fields alongside each office record.
company_fields = ['name', 'category_code', 'number_of_employees', 'founded_year', 'total_money_raised']
df = json_normalize(query_data, record_path='offices', meta=company_fields)

In [8]:
# Preview the first rows of the normalised table (one row per office).
df.head()

Unnamed: 0,address1,address2,city,country_code,description,latitude,longitude,state_code,zip_code,name,founded_year,category_code,total_money_raised,number_of_employees
0,,,San Francisco,USA,,,,CA,,CircleUp,2011,finance,$9M,11
1,,,New York,USA,,40.757929,-73.985506,NY,,PeekYou,2012,search,$1.83M,20
2,4966 El Camino Real,Suite 200,Los Altos,USA,,33.8171,-111.9035,CA,94022.0,GENWI,2010,mobile,$7.1M,25
3,Ritterstr. 12-14,,Berlin,DEU,,52.501345,13.410907,,10969.0,headr,2012,web,$0,8
4,Fischerstr. 13,,Hannover,DEU,,,,,30167.0,headr,2012,web,$0,8


In [9]:
# Drop every row that has a null in latitude, longitude, city or country_code.
required_columns = ['latitude', 'longitude', 'city', 'country_code']
df = df.dropna(subset=required_columns)
df.head()

Unnamed: 0,address1,address2,city,country_code,description,latitude,longitude,state_code,zip_code,name,founded_year,category_code,total_money_raised,number_of_employees
1,,,New York,USA,,40.757929,-73.985506,NY,,PeekYou,2012,search,$1.83M,20
2,4966 El Camino Real,Suite 200,Los Altos,USA,,33.8171,-111.9035,CA,94022,GENWI,2010,mobile,$7.1M,25
3,Ritterstr. 12-14,,Berlin,DEU,,52.501345,13.410907,,10969,headr,2012,web,$0,8
5,One Franklin Parkway,"Building 910, Suite 110",San Mateo,USA,,37.566879,-122.323895,CA,94401,Fixya,2013,web,$8M,30
6,460 Ste-Catherine W. #402,,Montreal,CAN,Needium HQ,45.504931,-73.568163,,H3B1A7,Needium,2010,enterprise,$1M,15


In [10]:
# Check that latitude and longitude are of type float:
df.dtypes

address1                object
address2                object
city                    object
country_code            object
description             object
latitude               float64
longitude              float64
state_code              object
zip_code                object
name                    object
founded_year             int64
category_code           object
total_money_raised      object
number_of_employees      int64
dtype: object

In [11]:
# Number of rows left after filtering the data and dropping records with a
# null value in the required location columns.
# Each row is one office (the frame was built with record_path='offices'), so a
# company with several offices is counted once per office.
len(df)

55

In [12]:
# Criterion used to label a company as 'big': 250 employees or more.
is_big = df['number_of_employees'] >= 250
big1 = df[is_big]

# Report how many rows meet the criterion and their share of all rows in df.
n_big = len(big1)
n_total = len(df)
print('Big companies: {}'.format(n_big))
print('Total companies: {}'.format(n_total))
print('Ratio Big companies / Total companies: {}'.format(n_big / n_total))
 

Big companies: 1
Total companies: 55
Ratio Big companies / Total companies: 0.01818181818181818


In [13]:
# Column holding longitude and latitude in GeoJSON form,
# e.g. { type: "Point", coordinates: [ 40, 5 ] }.
# GeoJSON lists the longitude first and then the latitude.

def get_geo(a, b):
    """Build a GeoJSON ``Point`` dictionary from two coordinates.

    Parameters
    ----------
    a : first coordinate (callers in this notebook pass the longitude).
    b : second coordinate (callers in this notebook pass the latitude).

    Returns
    -------
    dict
        ``{'type': 'Point', 'coordinates': [a, b]}``.
    """
    point = {'type': 'Point'}
    point['coordinates'] = [a, b]
    return point

def _row_to_geopoint(row):
    """Turn one row's longitude/latitude pair into a GeoJSON point."""
    return get_geo(row['longitude'], row['latitude'])

# Attach a GeoJSON point to every office row (longitude first, then latitude).
df['geopoint'] = df[['longitude', 'latitude']].apply(_row_to_geopoint, axis=1)

In [14]:
# Preview the first three rows, now including the 'geopoint' column.
df.head(3)

Unnamed: 0,address1,address2,city,country_code,description,latitude,longitude,state_code,zip_code,name,founded_year,category_code,total_money_raised,number_of_employees,geopoint
1,,,New York,USA,,40.757929,-73.985506,NY,,PeekYou,2012,search,$1.83M,20,"{'coordinates': [-73.985506, 40.757929], 'type..."
2,4966 El Camino Real,Suite 200,Los Altos,USA,,33.8171,-111.9035,CA,94022.0,GENWI,2010,mobile,$7.1M,25,"{'coordinates': [-111.9035, 33.8171], 'type': ..."
3,Ritterstr. 12-14,,Berlin,DEU,,52.501345,13.410907,,10969.0,headr,2012,web,$0,8,"{'coordinates': [13.4109071, 52.5013449], 'typ..."


In [15]:
# Export the DataFrame as JSON, writing one record per line.
json_export_path = './companies.json'
df.to_json(json_export_path, orient='records', lines=True)

In [16]:
# Query against the 'geo' collection (the collection lives in the 'companies' database)

In [17]:
def geoquery(lat, long, rad):
    """Return the documents of ``companies.geo`` that lie near a point.

    Runs a ``$near`` query on the ``geopoint`` field of the ``geo`` collection
    in the ``companies`` database of the default local MongoDB server.

    Parameters
    ----------
    lat : latitude of the centre point.
    long : longitude of the centre point.
    rad : maximum distance from the centre point, in meters.

    Returns
    -------
    pandas.DataFrame
        One row per matching document; an empty frame (no columns) when
        nothing matches.

    Notes
    -----
    ``$near`` needs a geospatial index on ``geopoint``; this function does not
    create one, so the collection must already have it.
    """
    # GeoJSON order: longitude first, then latitude.
    centre = {"type": "Point", "coordinates": [long, lat]}
    near_query = {
        "geopoint": {
            "$near": {
                "$geometry": centre,
                "$maxDistance": rad,  # In meters
            }
        }
    }

    # This function is called once per row elsewhere in the notebook. Open the
    # connection in a context manager so each call closes its client instead
    # of leaking one.
    with MongoClient() as client:
        # Read every document before the client is closed.
        documents = list(client['companies'].geo.find(near_query))

    return pd.DataFrame(documents)

In [18]:
# Per (country, city): number of office rows (len) and summed employee counts (sum).
cities = df.pivot_table(
    index=['country_code', 'city'],
    values=['number_of_employees'],
    aggfunc=[len, sum],
)
cities

Unnamed: 0_level_0,Unnamed: 1_level_0,len,sum
Unnamed: 0_level_1,Unnamed: 1_level_1,number_of_employees,number_of_employees
country_code,city,Unnamed: 2_level_2,Unnamed: 3_level_2
AUS,Queensland,1,10
CAN,Kanata,1,3
CAN,Montreal,1,15
CAN,Toronto,1,60
CAN,Vancouver,1,10
DEU,Berlin,1,8
ESP,Bilbao,1,4
GBR,Manchester,1,11
IND,Bangalore,1,25
NLD,Maarssen,1,80


In [19]:
# Function that computes the total number of employees in the selected area
def get_employees_near(lat, lon, radio):
    """Sum ``number_of_employees`` over the documents returned by ``geoquery``.

    Parameters
    ----------
    lat : latitude of the centre point.
    lon : longitude of the centre point.
    radio : search radius, forwarded unchanged to ``geoquery``.

    Returns
    -------
    The summed employee count, or 0 when the query matches nothing.
    """
    nearby = geoquery(lat, lon, radio)
    # An empty result has no columns at all, so indexing it would raise
    # KeyError; treat "nothing nearby" as zero employees instead.
    if 'number_of_employees' not in nearby.columns:
        return 0
    return nearby['number_of_employees'].sum()

In [20]:
# For every office, total the employees found within `radio` of it, then keep
# only the video-game companies ('games_video'), sorted by that total from
# highest to lowest.

radio = 500

# Read the coordinates by column name instead of by position, so the result
# does not depend on the column order of the frame.
empleados500 = [
    get_employees_near(lat, lon, radio)
    for lat, lon in zip(df['latitude'], df['longitude'])
]

df['empleados500'] = empleados500

# NOTE: this rebinds `df` to the filtered frame, which later cells rely on.
# Re-running this cell therefore operates on the already-filtered data.
df = df[df['category_code'] == 'games_video'].sort_values(by=['empleados500'], ascending=False)
df.head(10)

Unnamed: 0,address1,address2,city,country_code,description,latitude,longitude,state_code,zip_code,name,founded_year,category_code,total_money_raised,number_of_employees,geopoint,empleados500
72,,,San Francisco,USA,,37.775196,-122.419204,CA,,Fliggo,2012,games_video,$0,2,"{'coordinates': [-122.419204, 37.775196], 'typ...",1080
12,9570 W. Pico Blvd 3rd Floor,,Los Angeles,USA,HQ Los Angeles,37.446823,-122.161523,CA,90035.0,Social Gaming Network,2011,games_video,$17.1M,100,"{'coordinates': [-122.161523, 37.446823], 'typ...",251
13,9606 S. Santa Monica Blvd,2nd Floor,Beverly Hills,USA,,34.069849,-118.405418,CA,90210.0,Social Gaming Network,2011,games_video,$17.1M,100,"{'coordinates': [-118.4054182, 34.0698494], 't...",102
71,,,Bangalore,IND,India Office,12.971606,77.594376,,,BASH Gaming,2010,games_video,$1M,25,"{'coordinates': [77.594376, 12.971606], 'type'...",67
24,,,Vancouver,CAN,Vancouver,49.263588,-123.138565,,,Pixelmatic,2011,games_video,$0,10,"{'coordinates': [-123.138565, 49.263588], 'typ...",27
70,,,Fremont,USA,US Office,37.556171,-122.078063,CA,94555.0,BASH Gaming,2010,games_video,$1M,25,"{'coordinates': [-122.0780632, 37.5561712], 't...",25
23,"1351 4th Street, 4th floor",,Santa Monica,USA,Santa Monica Office,53.544711,-113.515769,CA,90401.0,Titan Gaming,2010,games_video,$1M,18,"{'coordinates': [-113.515769, 53.544711], 'typ...",18
66,1750 Montgomery St,Suite 150,San Francisco,USA,Headquarters,37.805324,-122.405276,CA,94133.0,Indee,2010,games_video,$0,3,"{'coordinates': [-122.4052761, 37.8053241], 't...",17
86,Fortitude Valley,Fortitude Valley,Queensland,AUS,,-27.457486,153.033686,,4006.0,ekidnaworld,2010,games_video,$0,10,"{'coordinates': [153.033686, -27.4574861], 'ty...",10
19,"3979 Freedom Circle, Suite 610",,Santa Clara,USA,,37.760524,-122.387799,CA,95054.0,Fuzz,2011,games_video,$0,6,"{'coordinates': [-122.387799, 37.760524], 'typ...",6


In [21]:
# Preview the first rows of the filtered, sorted frame.
df.head()

Unnamed: 0,address1,address2,city,country_code,description,latitude,longitude,state_code,zip_code,name,founded_year,category_code,total_money_raised,number_of_employees,geopoint,empleados500
72,,,San Francisco,USA,,37.775196,-122.419204,CA,,Fliggo,2012,games_video,$0,2,"{'coordinates': [-122.419204, 37.775196], 'typ...",1080
12,9570 W. Pico Blvd 3rd Floor,,Los Angeles,USA,HQ Los Angeles,37.446823,-122.161523,CA,90035.0,Social Gaming Network,2011,games_video,$17.1M,100,"{'coordinates': [-122.161523, 37.446823], 'typ...",251
13,9606 S. Santa Monica Blvd,2nd Floor,Beverly Hills,USA,,34.069849,-118.405418,CA,90210.0,Social Gaming Network,2011,games_video,$17.1M,100,"{'coordinates': [-118.4054182, 34.0698494], 't...",102
71,,,Bangalore,IND,India Office,12.971606,77.594376,,,BASH Gaming,2010,games_video,$1M,25,"{'coordinates': [77.594376, 12.971606], 'type'...",67
24,,,Vancouver,CAN,Vancouver,49.263588,-123.138565,,,Pixelmatic,2011,games_video,$0,10,"{'coordinates': [-123.138565, 49.263588], 'typ...",27


In [23]:
# Write df to CSV so it can be plotted.
# NOTE(review): despite the file name, every row of df is written, not only
# the first ten; confirm whether the export should be limited to ten rows.
csv_export_path = 'top10companies.csv'
df.to_csv(csv_export_path)