In [1]:
from pymongo import MongoClient
import pandas as pd
from pandas.io.json import json_normalize
import re


# Create the MongoDB client; no arguments are passed, so the driver's own
# default connection settings apply.
client = MongoClient()

# Handle to the "companies" database (dictionary-style access is the
# equivalent of the attribute form `client.companies`).
db = client['companies']

In [2]:
def full(df):
    """Print every row of ``df`` without pandas' row truncation.

    ``display.max_rows`` is raised to ``len(df)`` only while printing. The
    value the option had before the call is restored afterwards -- also when
    printing raises -- rather than being reset to the pandas default.

    Parameters
    ----------
    df : pandas object
        Anything that supports ``len()`` and ``print()``, e.g. a DataFrame.

    Returns
    -------
    str
        A fixed marker string: a newline followed by ``Done``.
    """
    # option_context scopes the setting to this block and puts the previous
    # value back on exit, so user-configured display options survive the call.
    with pd.option_context('display.max_rows', len(df)):
        print(df)
    return '\nDone'

In [3]:
def data():
    """Query the ``companies`` collection and flatten it to one row per office.

    Reads ``db.companies`` through the module-level ``db`` handle. The filter
    requires all of the following:

    * ``category_code`` equals one of games_video, software, web, mobile,
      photo_video, ecommerce, search or network_hosting;
    * ``founded_year`` is greater than or equal to 1999;
    * ``offices`` is not ``[]``, and ``offices.latitude`` /
      ``offices.longitude`` are neither ``[]`` nor ``None``;
    * ``deadpooled_year`` matches ``None``;
    * ``number_of_employees`` and ``total_money_raised`` are not ``None``.

    Returns
    -------
    pandas.DataFrame
        One row per office, holding the office's ``latitude`` and
        ``longitude``, a ``geo_loc`` column inserted at position 2 containing
        the list ``[longitude, latitude]``, and the company-level columns
        ``category_code``, ``name``, ``number_of_employees``,
        ``founded_year`` and ``total_money_raised``. If the query matches no
        document, an empty frame with those column names is returned.
    """
    categories = ['games_video', 'software', 'web', 'mobile',
                  'photo_video', 'ecommerce', 'search', 'network_hosting']
    company_fields = ['category_code', 'name', 'number_of_employees',
                      'founded_year', 'total_money_raised']

    # Same conditions as before, expressed as one entry per line; the $or
    # clause is generated from the category list above.
    criteria = {'$and': [
        {'$or': [{'category_code': code} for code in categories]},
        {'founded_year': {'$gte': 1999}},
        {'offices': {'$ne': []}},
        {'offices.latitude': {'$ne': []}},
        {'offices.longitude': {'$ne': []}},
        {'offices.latitude': {'$ne': None}},
        {'offices.longitude': {'$ne': None}},
        {'deadpooled_year': None},
        {'number_of_employees': {'$ne': None}},
        {'total_money_raised': {'$ne': None}},
    ]}
    projection = {'name': 1, '_id': 0, 'offices.latitude': 1, 'offices.longitude': 1,
                  'founded_year': 1, 'category_code': 1, 'number_of_employees': 1,
                  'total_money_raised': 1}

    # Materialise the cursor so an empty result can be detected up front.
    records = list(db.companies.find(criteria, projection))
    if not records:
        # Nothing to flatten: hand back an empty frame with the usual columns
        # so callers can keep indexing by column name.
        return pd.DataFrame(columns=['latitude', 'longitude', 'geo_loc'] + company_fields)

    # One output row per element of each document's `offices` array, with the
    # company-level fields repeated on every row.
    df = json_normalize(data=records, record_path='offices', meta=company_fields)

    # Build the [longitude, latitude] pairs in a single pass and insert them
    # directly as the third column (no placeholder column, no row-wise apply).
    coords = [[lon, lat] for lon, lat in zip(df['longitude'], df['latitude'])]
    df.insert(2, 'geo_loc', pd.Series(coords, index=df.index))

    return df

In [4]:
def exchange(df):
    """Turn money strings such as ``'$16.5M'`` into integer dollar amounts.

    Every entry is parsed on its own: the first run of digits/dots is the base
    value, a trailing ``k``/``M``/``B`` multiplies it by 1e3/1e6/1e9, and a
    leading ``£`` or ``€`` multiplies it by a fixed conversion rate to
    dollars. Any other last/first character contributes a factor of 1.

    Parameters
    ----------
    df : iterable of str
        The amount strings (despite the name, this is iterated as a flat
        sequence of strings, e.g. a single DataFrame column).

    Returns
    -------
    list of int
        One value per entry, in input order, truncated with ``int()``.

    Raises
    ------
    IndexError
        If an entry contains no digit or dot characters.
    ValueError
        If the first run of digits/dots is not a valid float (e.g. ``'.'``).
    """
    unit_factor = {'k': 1e3, 'M': 1e6, 'B': 1e9}
    # Rates noted by the original author as of 11-05-2019:
    # pounds -> dollars and euros -> dollars.
    rate_to_usd = {'£': 1.3, '€': 1.12}

    res = []
    for e in df:
        # Look both factors up afresh for each entry. Keeping them from the
        # previous entry would mis-scale any amount that has no unit suffix
        # or no recognised currency symbol of its own.
        number = unit_factor.get(e[-1:], 1)
        symbol = rate_to_usd.get(e[:1], 1)

        n_df = re.findall('[0-9.]+', e)

        res.append(int(float(n_df[0]) * number * symbol))
    return res

In [5]:
def classifying(date, employees, year_cutoff=2008, employee_limit=100):
    """Label each company as ``'StartUp'`` or ``'Big'``.

    A company is a ``'StartUp'`` when its founding year is ``year_cutoff`` or
    later and it has strictly fewer than ``employee_limit`` employees; every
    other company is ``'Big'``.

    The two inputs are paired by position, so pandas Series with any index
    (for example after filtering rows) are handled the same way as lists.

    Parameters
    ----------
    date : sequence of numbers
        Founding year per company.
    employees : sequence of numbers
        Employee count per company, in the same order as ``date``.
    year_cutoff : int, optional
        Earliest founding year that still counts as a start-up (default 2008).
    employee_limit : int, optional
        Exclusive upper bound on employees for a start-up (default 100).

    Returns
    -------
    list of str
        One label per entry of ``date``.

    Raises
    ------
    IndexError
        If ``employees`` has fewer entries than ``date``.
    """
    # Keep the guarantee of one label per entry of `date`; zip alone would
    # silently stop at the shorter input.
    if len(employees) < len(date):
        raise IndexError('employees has fewer entries than date')

    return ['StartUp' if year >= year_cutoff and staff < employee_limit else 'Big'
            for year, staff in zip(date, employees)]

In [6]:
if __name__=="__main__":
    # Build the per-office frame via data() (runs the MongoDB query).
    df=data()
    # Report the row count; the message is Spanish for
    # "The number of offices considered is N."
    print ('El numero de oficinas consideradas es {}.'.format(len(df)))     
    # Add the numeric amount parsed from the 'total_money_raised' strings.
    df['n_total_money']=exchange(df['total_money_raised'])
    # Add the StartUp/Big label computed from founding year and head count.
    df['company_class']=classifying(df['founded_year'], df['number_of_employees'])
    # Write the frame to geoloc.json as one JSON record per line.
    df.to_json('geoloc.json', orient='records', lines=True)
    # NOTE(review): `display` is not imported anywhere in this file --
    # presumably the IPython/Jupyter built-in; confirm before running this
    # as a plain Python script, where the name would be undefined.
    display (df.head())

El numero de oficinas consideradas es 2881.


Unnamed: 0,latitude,longitude,geo_loc,category_code,name,number_of_employees,founded_year,total_money_raised,n_total_money,company_class
0,34.090368,-118.393064,"[-118.393064, 34.090368]",web,Geni,18,2006,$16.5M,16500000,Big
1,37.387845,-122.055197,"[-122.055197, 37.387845]",web,Plaxo,50,2002,$28.3M,28300000,Big
2,40.746497,-74.009447,"[-74.0094471, 40.7464969]",games_video,Joost,0,2006,$45M,45000000,Big
3,37.778613,-122.395289,"[-122.395289, 37.778613]",search,Powerset,60,2006,$22.5M,22500000,Big
4,34.017606,-118.487267,"[-118.487267, 34.017606]",web,Mahalo,40,2007,$21M,21000000,Big
