### Establecer la localización más óptima para las oficinas de una empresa de video juegos con los siguientes condiciones:

##### -Es una compañía de software que crea videojuegos.
##### -Tiene 50 empleados. Los roles dentro de la nueva compañía son: 20 desarrolladores, 20 diseñadores/creativos/UX-UI y 10 ejecutivos/managers.
##### -Debe haber ingenieros de software trabajando alrededor, así como diseñadores, desarrolladores y ejecutivos para cubrir los intereses del equipo.
##### -Debe haber un buen ratio de empresas grandes vs. startups.
##### -Se prefieren empresas de reciente creación.




In [1]:
# Importar librerias.
import pandas as pd
import numpy as np
import pymongo
import geopandas

In [2]:
# Cargar la base de de MongoDB.
MongoClient = pymongo.MongoClient # Llamar a Mongo (conectar con el cliente).
client = MongoClient() # Guardar Mongo en variable cliente.
db = client.companies # Dentro de mongo conecto con base de dato company.

In [3]:
# Imprimo todas las categorias para ver que tengo.
cursor = db.companies.find({'category_code':{'$exists': 'true'}}) # Creo mi cursor para iterar secuencialmente los resultados obtenidos de la query.
categories = list(map(lambda x:x['category_code'], list(cursor))) # Creo función lambda para que me consiga todos mis category_code que debo convertir en lista ya que es un objeto.

category_codes = [] # Asigno y creo lista vacia a category_codes.
for element in categories: # Para cada elemento en categories 
    if element not in category_codes: # si el elemento no esta en category_codes
        category_codes.append(element) # Entonces mi lista vacia de category_codes le voy a .append(agregar) el elemento
    else:                              # si no.
        pass                           # pasa.
print(category_codes)   # Display todas las catergory_codes de mi lista.  

['enterprise', 'software', 'web', 'news', 'network_hosting', 'games_video', 'music', 'mobile', 'social', 'search', 'messaging', 'advertising', 'security', 'photo_video', 'finance', 'ecommerce', 'travel', 'hardware', 'public_relations', 'other', 'real_estate', 'semiconductor', 'analytics', 'health', 'legal', 'sports', 'biotech', 'cleantech', 'education', 'consulting', 'transportation', None, 'hospitality', 'fashion', 'nonprofit', 'nanotech', 'automotive', 'design', 'manufacturing', 'government', 'local', 'medical']


In [4]:
companies_interest_cursor = db.companies.find({"$and": 
                               [{"offices.latitude": {"$exists": True}}, {"offices.latitude": {"$ne": None}},
                                {"offices.longitude": {"$exists": True}}, {"offices.longitude": {"$ne": None}},
                                {"category_code": {"$exists": True}}, {"category_code": {"$ne": None}},
                                {"name": {"$exists": True}}, {"name": {"$ne": None}},
                                {"deadpooled_year": {"$eq": None}},
                                {"founded_year": {"$gte": 2005}}, 
                                {"category_code": {"$ne": ["enterprise", "news", "security", "messaging",
                                                           "finance", "ecommerce", "travel", "public_relations",
                                                           "other", "real_estate","semiconductor", "analytics",
                                                           "health", "legal", "education", "consulting",
                                                           "transportation", "None", "hospitality", "fashion"
                                                          "nonprofit", "automotive", "design", "manufacturing"
                                                          "government", "local", "medical"]}}]}, 
                              
                                {"_id":1, 
                                 "category_code":1,
                                 "founded_year":1,
                                 "ipo.valuation_amount":1, 
                                 "name":1,
                                 "number_of_employees":1,
                                 "offices.latitude":1,
                                 "offices.longitude":1,
                                 "total_money_raised":1})

In [5]:
df_companies = pd.DataFrame(companies_interest_cursor) # Creando dataframe con companies_interest_cursor

display(df_companies.shape) # Mostrando el tamaño de mi dataframe.

(4204, 8)

In [6]:
df_companies.head(10) # Mostrar las 10 primeras filas de mi dataframe.

Unnamed: 0,_id,category_code,founded_year,ipo,name,number_of_employees,offices,total_money_raised
0,52cdef7c4bab8bd675297d91,web,2006,,Geni,18.0,"[{'latitude': 34.090368, 'longitude': -118.393...",$16.5M
1,52cdef7c4bab8bd675297d98,music,2006,,Slacker,,"[{'latitude': 33.022176, 'longitude': -117.081...",$73.1M
2,52cdef7c4bab8bd675297d9a,mobile,2005,,Helio,,"[{'latitude': 34.057498, 'longitude': -118.446...",$0
3,52cdef7c4bab8bd675297d97,news,2007,,Scribd,50.0,"[{'latitude': 37.789634, 'longitude': -122.404...",$25.8M
4,52cdef7c4bab8bd675297d9c,social,2007,,MeetMoi,15.0,"[{'latitude': 40.757929, 'longitude': -73.9855...",$5.58M
5,52cdef7c4bab8bd675297d9d,games_video,2006,,Joost,0.0,"[{'latitude': 40.7464969, 'longitude': -74.009...",$45M
6,52cdef7c4bab8bd675297d94,social,2006,{'valuation_amount': 18100000000},Twitter,1300.0,"[{'latitude': 37.7768052, 'longitude': -122.41...",$1.16B
7,52cdef7c4bab8bd675297da4,search,2006,,Powerset,60.0,"[{'latitude': 37.778613, 'longitude': -122.395...",$22.5M
8,52cdef7c4bab8bd675297da9,web,2007,,Mahalo,40.0,"[{'latitude': 34.017606, 'longitude': -118.487...",$21M
9,52cdef7c4bab8bd675297daf,mobile,2005,,Jingle Networks,35.0,"[{'latitude': 37.480999, 'longitude': -122.173...",$88.7M


In [7]:
# cambio el nombre de la columna category_code a category y offices a coordenadas(cords) para visualizarlo mejor.
df_companies = df_companies.rename(index=str, columns={"category_code": "category", "offices": "coords"})
# cambio el orden de las columnas para visualizar mejor el dataframe.
df_companies = df_companies.reindex(['category', 'name', 'number_of_employees', 'founded_year', 'total_money_raised', 'coords'],axis=1)
df_companies.head() # Mostrar las 5 primeras filas de mi dataframe.


Unnamed: 0,category,name,number_of_employees,founded_year,total_money_raised,coords
0,web,Geni,18.0,2006,$16.5M,"[{'latitude': 34.090368, 'longitude': -118.393..."
1,music,Slacker,,2006,$73.1M,"[{'latitude': 33.022176, 'longitude': -117.081..."
2,mobile,Helio,,2005,$0,"[{'latitude': 34.057498, 'longitude': -118.446..."
3,news,Scribd,50.0,2007,$25.8M,"[{'latitude': 37.789634, 'longitude': -122.404..."
4,social,MeetMoi,15.0,2007,$5.58M,"[{'latitude': 40.757929, 'longitude': -73.9855..."


In [8]:
# Get latitude 
def get_lat(coord): 
        return coord[0]['latitude']
    
# Get longitude    
def get_long(coord): 
        return coord[0]['longitude']

In [9]:
df_companies['latitude']= df_companies['coords'].apply(get_lat)

In [10]:
df_companies['longitude']= df_companies['coords'].apply(get_long)

In [11]:
df_companies.head()

Unnamed: 0,category,name,number_of_employees,founded_year,total_money_raised,coords,latitude,longitude
0,web,Geni,18.0,2006,$16.5M,"[{'latitude': 34.090368, 'longitude': -118.393...",34.090368,-118.393064
1,music,Slacker,,2006,$73.1M,"[{'latitude': 33.022176, 'longitude': -117.081...",33.022176,-117.081406
2,mobile,Helio,,2005,$0,"[{'latitude': 34.057498, 'longitude': -118.446...",34.057498,-118.446596
3,news,Scribd,50.0,2007,$25.8M,"[{'latitude': 37.789634, 'longitude': -122.404...",37.789634,-122.404052
4,social,MeetMoi,15.0,2007,$5.58M,"[{'latitude': 40.757929, 'longitude': -73.9855...",40.757929,-73.985506


In [12]:
df_companies.dtypes # Veo de que tipo son mis columnas.

category                object
name                    object
number_of_employees    float64
founded_year             int64
total_money_raised      object
coords                  object
latitude               float64
longitude              float64
dtype: object

In [13]:
#Creo funcion para transformar datos.

def float_to_int(flt):    #Transforma un dato tipo float a uno tipo int.
    integer = int(flt)
    return integer

In [14]:
df_companies['number_of_employees'].fillna(0, inplace=True) # Relleno todos los NAN en columna number_of_employees.
df_companies['founded_year'].fillna(0, inplace=True) # Relleno todos los NAN en columna founded_year.
df_companies.dropna(axis=0, subset=['coords'])

Unnamed: 0,category,name,number_of_employees,founded_year,total_money_raised,coords,latitude,longitude
0,web,Geni,18.0,2006,$16.5M,"[{'latitude': 34.090368, 'longitude': -118.393...",34.090368,-118.393064
1,music,Slacker,0.0,2006,$73.1M,"[{'latitude': 33.022176, 'longitude': -117.081...",33.022176,-117.081406
2,mobile,Helio,0.0,2005,$0,"[{'latitude': 34.057498, 'longitude': -118.446...",34.057498,-118.446596
3,news,Scribd,50.0,2007,$25.8M,"[{'latitude': 37.789634, 'longitude': -122.404...",37.789634,-122.404052
4,social,MeetMoi,15.0,2007,$5.58M,"[{'latitude': 40.757929, 'longitude': -73.9855...",40.757929,-73.985506
5,games_video,Joost,0.0,2006,$45M,"[{'latitude': 40.7464969, 'longitude': -74.009...",40.746497,-74.009447
6,social,Twitter,1300.0,2006,$1.16B,"[{'latitude': 37.7768052, 'longitude': -122.41...",37.776805,-122.416924
7,search,Powerset,60.0,2006,$22.5M,"[{'latitude': 37.778613, 'longitude': -122.395...",37.778613,-122.395289
8,web,Mahalo,40.0,2007,$21M,"[{'latitude': 34.017606, 'longitude': -118.487...",34.017606,-118.487267
9,mobile,Jingle Networks,35.0,2005,$88.7M,"[{'latitude': 37.480999, 'longitude': -122.173...",37.480999,-122.173887


In [15]:
# Aplico la función para transformar mis datos a enteros.
df_companies['number_of_employees'] = df_companies['number_of_employees'].apply(float_to_int)
df_companies['founded_year'] = df_companies['founded_year'].apply(float_to_int)

In [16]:
df_companies.dtypes # Veo de que tipo son mis columnas para ver si el cambio se aplico.

category                object
name                    object
number_of_employees      int64
founded_year             int64
total_money_raised      object
coords                  object
latitude               float64
longitude              float64
dtype: object

In [17]:
# Crear unos bins para ver el tamaño de la empresa segun los'number_of_employees': startup, small, medium y big

bins = [0,10,100,1000,100000] # Creo categorias de el bin
labels = ["startup","small","medium","big"] # Creo nombre para las categorias del bin
df_companies['tamaño_empresa'] = pd.cut(df_companies['number_of_employees'], bins=bins, labels=labels)
df_companies.dropna(subset=['tamaño_empresa'], inplace=True)
df_companies.groupby(df_companies.tamaño_empresa)['name'].nunique() 


tamaño_empresa
startup    1833
small       882
medium      100
big           5
Name: name, dtype: int64

In [18]:
df_companies.head()

Unnamed: 0,category,name,number_of_employees,founded_year,total_money_raised,coords,latitude,longitude,tamaño_empresa
0,web,Geni,18,2006,$16.5M,"[{'latitude': 34.090368, 'longitude': -118.393...",34.090368,-118.393064,small
3,news,Scribd,50,2007,$25.8M,"[{'latitude': 37.789634, 'longitude': -122.404...",37.789634,-122.404052,small
4,social,MeetMoi,15,2007,$5.58M,"[{'latitude': 40.757929, 'longitude': -73.9855...",40.757929,-73.985506,small
6,social,Twitter,1300,2006,$1.16B,"[{'latitude': 37.7768052, 'longitude': -122.41...",37.776805,-122.416924,big
7,search,Powerset,60,2006,$22.5M,"[{'latitude': 37.778613, 'longitude': -122.395...",37.778613,-122.395289,small


In [19]:
df_companies.iloc[0, 5]

[{'latitude': 34.090368, 'longitude': -118.393064}]

In [20]:
# Creación de toGeoJSON:
    # Input: lista con dos valores: la longitud y la latitud de la ubicación de la compañía.
    # Output: diccionario con ambas coordenadas en una sola lista y el establecimiento de "type".
    
def to_Geo_JSON(array):
    try:
        array = array[0]
        return {
                "type": "Point",
                "coordinates":[array['longitude'], array['latitude']]
        }
    except: 
        return

df_companies['coords'] = df_companies.apply(lambda s: to_Geo_JSON(s['coords']), axis=1)

In [25]:
df_companies.head()

Unnamed: 0,category,name,number_of_employees,founded_year,total_money_raised,coords,latitude,longitude,tamaño_empresa
0,web,Geni,18,2006,$16.5M,"{'type': 'Point', 'coordinates': [-118.393064,...",34.090368,-118.393064,small
3,news,Scribd,50,2007,$25.8M,"{'type': 'Point', 'coordinates': [-122.404052,...",37.789634,-122.404052,small
4,social,MeetMoi,15,2007,$5.58M,"{'type': 'Point', 'coordinates': [-73.985506, ...",40.757929,-73.985506,small
6,social,Twitter,1300,2006,$1.16B,"{'type': 'Point', 'coordinates': [-122.4169244...",37.776805,-122.416924,big
7,search,Powerset,60,2006,$22.5M,"{'type': 'Point', 'coordinates': [-122.395289,...",37.778613,-122.395289,small


In [26]:
df_companies.to_json('prueba1.json', orient="records", lines=True)

In [23]:
df_companies.iloc[0, 5]

{'type': 'Point', 'coordinates': [-118.393064, 34.090368]}