In [1]:
from pymongo import MongoClient
import pandas as pd
# Connect to MongoDB with the driver's default settings and keep a handle
# to the `companies` database (subscript access is equivalent to attribute access).
client = MongoClient()
db = client["companies"]

# Disabled sanity check: print the name of every document whose name is "Facebook".
#for rest in db.companies.find({"name":"Facebook"}):
    #print(rest['name'])

- VISUALIZATION PROJECT Geospatial Business Intelligence (BI)
  * Make a geospatial analysis of the `companies` dataset
  * Things you know:
    - You have a software company with 50 employees
    - The company creates video games
    - Roles in your company: 20 developers, 20 Designers/Creatives/UX/UI and 10 executives/managers
  * Do an analysis about placing the new company offices in the best environment based on the following criteria:
    - There should be software engineers working around
    - The surroundings must have a good ratio of big companies vs startups
    - Ensure you have in your surroundings companies that cover the interests of your team
    - Avoid old companies, prefer recently created ones

In [2]:
# FIRST QUERIES: FILTER BY FOUNDING DATE AND COMPANY TYPE.
# All top-level conditions are ANDed together implicitly.
empresas = db.companies.find(
    {
        # Keep only companies that have office coordinates to map.
        "offices.latitude": {"$exists": True, "$ne": None},
        "offices.longitude": {"$exists": True, "$ne": None},
        # Prefer recently created companies.
        'founded_year': {'$gt': 2005},
        'number_of_employees': {'$gt': 1},
        # Match any of the sectors of interest with a single $in on the field.
        'category_code': {'$in': ['web', 'software', 'games_video',
                                  'mobile', 'music', 'nanotech']},
    },
    # Projection: return only the fields used in the rest of the analysis.
    {'name': 1, 'founded_year': 1, '_id': 0, 'number_of_employees': 1,
     'offices.latitude': 1, 'offices.longitude': 1, 'category_code': 1},
)
                        


In [3]:
# Read every document from the cursor and load the records into a DataFrame.
dataset = pd.DataFrame(list(empresas))
# Column dtypes and non-null counts of the query result.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1498 entries, 0 to 1497
Data columns (total 5 columns):
category_code          1498 non-null object
founded_year           1498 non-null int64
name                   1498 non-null object
number_of_employees    1498 non-null int64
offices                1498 non-null object
dtypes: int64(2), object(3)
memory usage: 58.6+ KB


In [4]:
# Column dtypes and non-null counts of the query result (same call as the previous cell).
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1498 entries, 0 to 1497
Data columns (total 5 columns):
category_code          1498 non-null object
founded_year           1498 non-null int64
name                   1498 non-null object
number_of_employees    1498 non-null int64
offices                1498 non-null object
dtypes: int64(2), object(3)
memory usage: 58.6+ KB


In [5]:
# List the distinct categories that actually came back from the query.
# In the recorded output 'nanotech' does not appear although the query asks for it.
dataset['category_code'].unique() 

array(['web', 'games_video', 'mobile', 'software', 'music'], dtype=object)

In [6]:
# Instead of Marc's proposal for extracting latitudes and longitudes, this uses Martín's approach.
def get_lat(coord):
    """Return the latitude of the first office in ``coord``.

    Parameters
    ----------
    coord : value of the ``offices`` field; expected to be a sequence of
        dicts that carry a 'latitude' key.

    Returns
    -------
    The 'latitude' value of the first element, or None when it cannot be
    read (empty sequence, missing key, or a non-indexable value such as None).
    """
    try:
        return coord[0]['latitude']
    except (IndexError, KeyError, TypeError):
        # Only missing or malformed data maps to None; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return None

# Get longitude
def get_long(coord):
    """Return the longitude of the first office in ``coord``.

    Parameters
    ----------
    coord : value of the ``offices`` field; expected to be a sequence of
        dicts that carry a 'longitude' key.

    Returns
    -------
    The 'longitude' value of the first element, or None when it cannot be
    read (empty sequence, missing key, or a non-indexable value such as None).
    """
    try:
        return coord[0]['longitude']
    except (IndexError, KeyError, TypeError):
        # Only missing or malformed data maps to None; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return None

# Add latitude and longitude columns derived from the `offices` column
def insert_latlong(df):
    """Add 'latitude' and 'longitude' columns to ``df`` and return it.

    Each new column is computed by applying the matching extractor
    (``get_lat`` / ``get_long``) to the 'offices' column. The frame is
    modified in place; the returned object is the same ``df``.
    """
    extractors = (('latitude', get_lat), ('longitude', get_long))
    for column, extractor in extractors:
        df[column] = df['offices'].apply(extractor)
    return df


In [7]:
# Add the latitude/longitude columns. insert_latlong assigns the columns on the
# frame it receives and returns that same frame, so `data_limpio` and `dataset`
# refer to one object.
data_limpio = insert_latlong(dataset)
data_limpio.head()

Unnamed: 0,category_code,founded_year,name,number_of_employees,offices,latitude,longitude
0,web,2006,Geni,18,"[{'latitude': 34.090368, 'longitude': -118.393...",34.090368,-118.393064
1,web,2007,Mahalo,40,"[{'latitude': 34.017606, 'longitude': -118.487...",34.017606,-118.487267
2,games_video,2006,Kyte,40,"[{'latitude': 37.788482, 'longitude': -122.409...",37.788482,-122.409173
3,web,2007,CriticalMetrics,4,"[{'latitude': 37.269175, 'longitude': -119.306...",37.269175,-119.306607
4,games_video,2006,Stickam,35,"[{'latitude': 34.051409, 'longitude': -118.254...",34.051409,-118.254558


In [8]:
# Display the frame without the raw `offices` column.
# NOTE(review): the result is not assigned, so `data_limpio` itself still contains `offices`.
data_limpio.drop(['offices'], axis=1)

Unnamed: 0,category_code,founded_year,name,number_of_employees,latitude,longitude
0,web,2006,Geni,18,34.090368,-118.393064
1,web,2007,Mahalo,40,34.017606,-118.487267
2,games_video,2006,Kyte,40,37.788482,-122.409173
3,web,2007,CriticalMetrics,4,37.269175,-119.306607
4,games_video,2006,Stickam,35,34.051409,-118.254558
5,games_video,2007,Livestream,120,40.726155,-73.995625
6,games_video,2007,AdaptiveBlue,15,40.801358,-74.337200
7,mobile,2006,GrandCentral,3,37.465645,-121.932202
8,web,2007,Pownce,6,37.762541,-122.397224
9,web,2007,SodaHead,25,37.269175,-119.306607


In [9]:
# Export the frame to 'data_limpio_ok.json' as row records.
data_limpio.to_json('data_limpio_ok.json', orient='records')

In [10]:
# Check column dtypes and non-null counts, including the new latitude/longitude columns.
data_limpio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1498 entries, 0 to 1497
Data columns (total 7 columns):
category_code          1498 non-null object
founded_year           1498 non-null int64
name                   1498 non-null object
number_of_employees    1498 non-null int64
offices                1498 non-null object
latitude               1498 non-null float64
longitude              1498 non-null float64
dtypes: float64(2), int64(2), object(3)
memory usage: 82.0+ KB


In [11]:
# FUNCTION THAT BUILDS THE GEOJSON COLUMN NEEDED TO RUN THE GEO QUERIES
def geoj(a,b):
    """Build a GeoJSON Point dict from a coordinate pair.

    Parameters
    ----------
    a : first coordinate (GeoJSON expects longitude first).
    b : second coordinate (latitude).

    Returns
    -------
    dict of the form ``{'type': 'Point', 'coordinates': [a, b]}``.
    """
    # GeoJSON type names are case-sensitive: the valid value is "Point", not "point".
    return {'type': 'Point', 'coordinates': [a, b]}
# Build one geoj dict per row; longitude is passed first, so the
# coordinates list is ordered [longitude, latitude].
data_limpio['geojson'] = data_limpio.apply(lambda x: geoj(x['longitude'], x['latitude']),axis=1)
data_limpio.head()

Unnamed: 0,category_code,founded_year,name,number_of_employees,offices,latitude,longitude,geojson
0,web,2006,Geni,18,"[{'latitude': 34.090368, 'longitude': -118.393...",34.090368,-118.393064,"{'coordinates': [-118.393064, 34.090368], 'typ..."
1,web,2007,Mahalo,40,"[{'latitude': 34.017606, 'longitude': -118.487...",34.017606,-118.487267,"{'coordinates': [-118.487267, 34.017606], 'typ..."
2,games_video,2006,Kyte,40,"[{'latitude': 37.788482, 'longitude': -122.409...",37.788482,-122.409173,"{'coordinates': [-122.409173, 37.788482], 'typ..."
3,web,2007,CriticalMetrics,4,"[{'latitude': 37.269175, 'longitude': -119.306...",37.269175,-119.306607,"{'coordinates': [-119.306607, 37.269175], 'typ..."
4,games_video,2006,Stickam,35,"[{'latitude': 34.051409, 'longitude': -118.254...",34.051409,-118.254558,"{'coordinates': [-118.254558, 34.051409], 'typ..."


In [12]:
# Export the frame to 'datacoords.json' as line-delimited JSON (one record per line).
data_limpio.to_json('datacoords.json', orient = 'records', lines=True)

In [None]:
# I take datacoords into Tableau, where I categorize the companies by size and observe that San Francisco is a good place to locate the company, since it has a good proportion of large, medium and small companies.
# The geo queries will let me confirm this first impression.
 

In [14]:
# Check column dtypes and non-null counts, including the geojson column.
data_limpio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1498 entries, 0 to 1497
Data columns (total 8 columns):
category_code          1498 non-null object
founded_year           1498 non-null int64
name                   1498 non-null object
number_of_employees    1498 non-null int64
offices                1498 non-null object
latitude               1498 non-null float64
longitude              1498 non-null float64
geojson                1498 non-null object
dtypes: float64(2), int64(2), object(4)
memory usage: 93.7+ KB


In [None]:
# Re-check column dtypes and non-null counts of the working frame.
data_limpio.info()