In [3]:
import pandas as pd
from pymongo import MongoClient
from pandas.io.json import json_normalize
pd.set_option('display.max_columns', 500)
client = MongoClient ('localhost', 27017)
data = client['companies'].companies

In [4]:
meta = ['name', 'category_code', 'number_of_employees', 'founded_year', 'total_money_raised']

def get_locations(data):
    return json_normalize(data, record_path = 'offices', meta = meta)

def get_companies_df(data):
    #df = pd.DataFrame(data)
    locations = get_locations(data)
    return(locations)
    
    '''
    gelocs = locations.apply(lambda e: [e["coord"][0],e["coord"][1]], result_type="expand", axis=1)
    clean_df = pd.concat([bks["restaurant_id"],locations[["street","zipcode"]],gelocs], axis=1)
    clean_df.rename({1:"lat",0:"long"}, axis=1, inplace=True)
    return clean_df
    '''

In [5]:
#Primero, analizamos las diferentes categorías disponibles en la totalidad del data set:

categories_df = pd.DataFrame(data.find({}, {"name": 1, "category_code": 1, "_id": 0}))
print(categories_df['category_code'].unique())

['web' 'enterprise' 'software' 'news' 'social' 'network_hosting'
 'games_video' 'music' 'mobile' 'search' 'advertising' 'messaging'
 'security' 'photo_video' 'finance' 'hardware' 'ecommerce' 'travel'
 'public_relations' 'other' 'real_estate' 'semiconductor' 'analytics'
 'health' 'legal' 'sports' 'biotech' 'cleantech' 'education' 'consulting'
 'transportation' None 'hospitality' 'fashion' 'nonprofit' 'nanotech'
 'automotive' 'design' 'manufacturing' 'government' 'local' 'medical']


In [6]:
#Escogemos las categorías que más relación pueden tener con nuestros sector, los videjuegos, y añadimos alguna 
#que pertenezca a sectores de apoyo como la consultoría.

categories = ['web', 'software', 'social', 'network_hosting'
 'games_video', 'photo_video', 'mobile', 'search', 'ecommerce', 'consulting', 'nanotech']

Buscamos todas aquellas que:
- Su fecha de fundación es posterior a 1990.
- Pertenecen a las categorías listadas en el punto anterior.
- Tienen al menos una oficina con coordenadas válidas.
- Cumplen alguna de las siguientes características:
    - Tienen IPO (es decir, han sacado cotización en bolsa).
    - Han sido adquiridos y disponen de un valor de precio de adquisición.
    - Han conseguido levantar inversiones (inversiones no igual a cero).
    - Han realizado inversiones en otras empresas.
    - Su cifra de empleados es inferior a 100.
    - Su fundación es muy reciente: 2010 en adelante.
 

In [7]:
filtered_data = data.find({
    'founded_year': {'$gte': 1990},
    'offices': {'$exists': True, '$ne': []},
    'offices.latitude': {'$ne': None}, 
    'offices.longitude': {'$ne': None},
    'category_code': {'$in': categories},
    '$or': [
        #{'ipo': {'$ne': None}},
        {'ipo': {'$exists': True, '$ne': None}},
        {'acquisition.price_amount': {'$ne': None}},
        {'investments': {'$ne': []}},
        {'total_money_raised': {'$ne': "$0"}},
        {'number_of_employees': {'$lt': 500}},
        {'founded_year': {'$gte': 2010}}
    ], 
})

display(filtered_data.count())



3434

In [8]:
target_companies = get_companies_df(filtered_data)                   

In [9]:
display(target_companies.head())

Unnamed: 0,address1,address2,city,country_code,description,latitude,longitude,state_code,zip_code,name,category_code,number_of_employees,founded_year,total_money_raised
0,710 - 2nd Avenue,Suite 1100,Seattle,USA,,47.603122,-122.333253,WA,98104.0,Wetpaint,web,47,2005,$39.8M
1,270 Lafayette Street,Suite 505,New York,USA,,40.723731,-73.996431,NY,10012.0,Wetpaint,web,47,2005,$39.8M
2,1601 Willow Road,,Menlo Park,USA,Headquarters,37.41605,-122.151801,CA,94025.0,Facebook,social,5299,2004,$2.43B
3,,,Dublin,IRL,Europe HQ,53.344104,-6.267494,,,Facebook,social,5299,2004,$2.43B
4,340 Madison Ave,,New York,USA,New York,40.755716,-73.979247,NY,10017.0,Facebook,social,5299,2004,$2.43B


In [10]:
target_companies.shape

(3699, 14)

In [11]:
target_companies.describe()

Unnamed: 0,latitude,longitude,founded_year
count,3699.0,3699.0,3699.0
mean,38.178384,-65.965888,2004.794269
std,14.802998,63.805113,3.723997
min,-41.296454,-159.480262,1990.0
25%,36.676994,-119.306607,2003.0
50%,39.568519,-80.83722,2006.0
75%,44.918213,-3.70325,2007.0
max,65.056601,175.2604,2013.0


In [17]:
target_companies['loc'] = target_companies.apply(lambda row: [row['longitude'], row['latitude']], axis = 1)

display(target_companies.head())

Unnamed: 0,address1,address2,city,country_code,description,latitude,longitude,state_code,zip_code,name,category_code,number_of_employees,founded_year,total_money_raised,loc
0,710 - 2nd Avenue,Suite 1100,Seattle,USA,,47.603122,-122.333253,WA,98104.0,Wetpaint,web,47,2005,$39.8M,"[-122.333253, 47.603122]"
1,270 Lafayette Street,Suite 505,New York,USA,,40.723731,-73.996431,NY,10012.0,Wetpaint,web,47,2005,$39.8M,"[-73.9964312, 40.7237306]"
2,1601 Willow Road,,Menlo Park,USA,Headquarters,37.41605,-122.151801,CA,94025.0,Facebook,social,5299,2004,$2.43B,"[-122.151801, 37.41605]"
3,,,Dublin,IRL,Europe HQ,53.344104,-6.267494,,,Facebook,social,5299,2004,$2.43B,"[-6.267494, 53.344104]"
4,340 Madison Ave,,New York,USA,New York,40.755716,-73.979247,NY,10017.0,Facebook,social,5299,2004,$2.43B,"[-73.9792469, 40.7557162]"


In [20]:
target_companies['loc2'] = list(zip(target_companies['longitude'], target_companies['latitude']))
display(target_companies.head())

Unnamed: 0,address1,address2,city,country_code,description,latitude,longitude,state_code,zip_code,name,category_code,number_of_employees,founded_year,total_money_raised,loc,loc2
0,710 - 2nd Avenue,Suite 1100,Seattle,USA,,47.603122,-122.333253,WA,98104.0,Wetpaint,web,47,2005,$39.8M,"[-122.333253, 47.603122]","(-122.333253, 47.603122)"
1,270 Lafayette Street,Suite 505,New York,USA,,40.723731,-73.996431,NY,10012.0,Wetpaint,web,47,2005,$39.8M,"[-73.9964312, 40.7237306]","(-73.9964312, 40.7237306)"
2,1601 Willow Road,,Menlo Park,USA,Headquarters,37.41605,-122.151801,CA,94025.0,Facebook,social,5299,2004,$2.43B,"[-122.151801, 37.41605]","(-122.151801, 37.41605)"
3,,,Dublin,IRL,Europe HQ,53.344104,-6.267494,,,Facebook,social,5299,2004,$2.43B,"[-6.267494, 53.344104]","(-6.267494, 53.344104)"
4,340 Madison Ave,,New York,USA,New York,40.755716,-73.979247,NY,10017.0,Facebook,social,5299,2004,$2.43B,"[-73.9792469, 40.7557162]","(-73.9792469, 40.7557162)"


In [22]:
target_companies.to_json("target_companies.json", orient="records", lines=True)
target_companies.to_json("target_companies.csv")