In [59]:
import pandas as pd
import re
import gmaps
import gmaps.datasets
import requests
from pymongo import MongoClient
from pandas.io.json import json_normalize
from sklearn import preprocessing


with open('.env') as file:
        key = file.read()
#key = pd.read_csv('.env')
gmaps.configure(api_key= key)

pd.set_option('display.max_columns', 500)
client = MongoClient ('localhost', 27017)
data = client['companies'].companies

In [60]:
meta = ['name', 'category_code', 'number_of_employees', 'founded_year', 'total_money_raised', 'ipo', 'acquisition', 'investments']

def get_locations(data):
    return json_normalize(data, record_path = 'offices', meta = meta, errors='ignore')

def get_companies_df(data):
    #df = pd.DataFrame(data)
    locations = get_locations(data)
    return(locations)

In [61]:
#Primero, analizamos las diferentes categorías disponibles en la totalidad del data set:

categories_df = pd.DataFrame(data.find({}, {"name": 1, "category_code": 1, "_id": 0}))
print(categories_df['category_code'].unique())

['web' 'enterprise' 'software' 'news' 'social' 'network_hosting'
 'games_video' 'music' 'mobile' 'search' 'advertising' 'messaging'
 'security' 'photo_video' 'finance' 'hardware' 'ecommerce' 'travel'
 'public_relations' 'other' 'real_estate' 'semiconductor' 'analytics'
 'health' 'legal' 'sports' 'biotech' 'cleantech' 'education' 'consulting'
 'transportation' None 'hospitality' 'fashion' 'nonprofit' 'nanotech'
 'automotive' 'design' 'manufacturing' 'government' 'local' 'medical']


In [62]:
#Escogemos cualitativamente las categorías que más relación pueden tener con nuestros sector, los videjuegos, y añadimos alguna 
#que pertenezca a sectores de apoyo como la consultoría.

categories = ['web', 'software', 'social', 'network_hosting'
 'games_video', 'photo_video', 'mobile', 'search', 'ecommerce', 'consulting', 'nanotech']

Buscamos todas aquellas que:
- Su fecha de fundación es posterior a 1990.
- Pertenecen a las categorías listadas en el punto anterior.
- Tienen al menos una oficina con coordenadas válidas.
- Cumplen alguna de las siguientes características:
    - Tienen IPO (es decir, han sacado cotización en bolsa).
    - Han sido adquiridos y disponen de un valor de precio de adquisición.
    - Han conseguido levantar inversiones (inversiones no igual a cero).
    - Han realizado inversiones en otras empresas.
    - Su cifra de empleados es inferior a 100.
    - Su fundación es muy reciente: 2010 en adelante.
 

In [63]:
filtered_data = data.find({
    'founded_year': {'$gte': 1990},
    'offices': {'$exists': True, '$ne': []},
    'offices.latitude': {'$ne': None}, 
    'offices.longitude': {'$ne': None},
    'category_code': {'$in': categories},
    '$or': [
        #{'ipo': {'$ne': None}},
        {'ipo': {'$exists': True, '$ne': None}},
        {'acquisition.price_amount': {'$ne': None}},
        {'investments': {'$ne': []}},
        {'total_money_raised': {'$ne': "$0"}},
        {'number_of_employees': {'$lt': 500}},
        {'founded_year': {'$gte': 2010}}
    ], 
})

In [64]:
target_companies = get_companies_df(filtered_data)                   

In [65]:
display(target_companies.head())

Unnamed: 0,address1,address2,city,country_code,description,latitude,longitude,state_code,zip_code,name,category_code,number_of_employees,founded_year,total_money_raised,ipo,acquisition,investments
0,710 - 2nd Avenue,Suite 1100,Seattle,USA,,47.603122,-122.333253,WA,98104.0,Wetpaint,web,47,2005,$39.8M,,"{'price_amount': 30000000, 'price_currency_cod...",[]
1,270 Lafayette Street,Suite 505,New York,USA,,40.723731,-73.996431,NY,10012.0,Wetpaint,web,47,2005,$39.8M,,"{'price_amount': 30000000, 'price_currency_cod...",[]
2,1601 Willow Road,,Menlo Park,USA,Headquarters,37.41605,-122.151801,CA,94025.0,Facebook,social,5299,2004,$2.43B,"{'valuation_amount': 104000000000, 'valuation_...",,"[{'funding_round': {'round_code': 'seed', 'sou..."
3,,,Dublin,IRL,Europe HQ,53.344104,-6.267494,,,Facebook,social,5299,2004,$2.43B,"{'valuation_amount': 104000000000, 'valuation_...",,"[{'funding_round': {'round_code': 'seed', 'sou..."
4,340 Madison Ave,,New York,USA,New York,40.755716,-73.979247,NY,10017.0,Facebook,social,5299,2004,$2.43B,"{'valuation_amount': 104000000000, 'valuation_...",,"[{'funding_round': {'round_code': 'seed', 'sou..."


In [66]:
target_companies.shape

(3699, 17)

In [67]:
target_companies.describe()

Unnamed: 0,latitude,longitude,founded_year
count,3699.0,3699.0,3699.0
mean,38.178384,-65.965888,2004.794269
std,14.802998,63.805113,3.723997
min,-41.296454,-159.480262,1990.0
25%,36.676994,-119.306607,2003.0
50%,39.568519,-80.83722,2006.0
75%,44.918213,-3.70325,2007.0
max,65.056601,175.2604,2013.0


In [68]:
target_companies['loc'] = list(zip(target_companies['longitude'], target_companies['latitude']))
#Método alternativo
'''
target_companies['loc'] = target_companies.apply(lambda row: (row['longitude'], row['latitude']), axis = 1)
'''
display(target_companies.head())

Unnamed: 0,address1,address2,city,country_code,description,latitude,longitude,state_code,zip_code,name,category_code,number_of_employees,founded_year,total_money_raised,ipo,acquisition,investments,loc
0,710 - 2nd Avenue,Suite 1100,Seattle,USA,,47.603122,-122.333253,WA,98104.0,Wetpaint,web,47,2005,$39.8M,,"{'price_amount': 30000000, 'price_currency_cod...",[],"(-122.333253, 47.603122)"
1,270 Lafayette Street,Suite 505,New York,USA,,40.723731,-73.996431,NY,10012.0,Wetpaint,web,47,2005,$39.8M,,"{'price_amount': 30000000, 'price_currency_cod...",[],"(-73.9964312, 40.7237306)"
2,1601 Willow Road,,Menlo Park,USA,Headquarters,37.41605,-122.151801,CA,94025.0,Facebook,social,5299,2004,$2.43B,"{'valuation_amount': 104000000000, 'valuation_...",,"[{'funding_round': {'round_code': 'seed', 'sou...","(-122.151801, 37.41605)"
3,,,Dublin,IRL,Europe HQ,53.344104,-6.267494,,,Facebook,social,5299,2004,$2.43B,"{'valuation_amount': 104000000000, 'valuation_...",,"[{'funding_round': {'round_code': 'seed', 'sou...","(-6.267494, 53.344104)"
4,340 Madison Ave,,New York,USA,New York,40.755716,-73.979247,NY,10017.0,Facebook,social,5299,2004,$2.43B,"{'valuation_amount': 104000000000, 'valuation_...",,"[{'funding_round': {'round_code': 'seed', 'sou...","(-73.9792469, 40.7557162)"


In [46]:
fig = gmaps.figure(map_type='SATELLITE')

offices_loc = target_companies[['latitude', 'longitude']]

heatmap_layer = gmaps.heatmap_layer(offices_locs)
fig.add_layer(heatmap_layer)
fig

Figure(layout=FigureLayout(height='420px'))

In [69]:
target_companies.to_json("target_companies.json", orient="records", lines=True)
target_companies.to_csv("target_companies.csv")

In [70]:
def get_near_offices(row):
    nearLocation = {
        "lng": row["longitude"],
        "lat": row["latitude"]
    }

    prospects = pd.DataFrame(target.find({
        "loc": {
         "$near": {
           "$geometry": {
              "type": "Point" ,
               "coordinates": [ nearLocation["lng"] , nearLocation["lat"] ]
              #"coordinates": [ nearLocation["lat"] , nearLocation["lng"] ]
           },
           "$maxDistance": 2000, # In meters
         }
       }
    }))
    
    return prospects

- Para cada cluster:
        - Aplicamos solo si el número de empresas cercanas es superior a 30.
        - Necesario: compañía software cerca: ['web', 'software', 'social', 'games_video','network_hosting', 'search', 'ecommerce']
        - Otorgamos puntos según el sector de la empresa:
{'web': 5, 'software': 5, 'social': 5, 'games_video': 10,'network_hosting': 2, 'photo_video': 3, 'mobile': 2, 'search': 5, 'ecommerce': 5, 'consulting': 2, 'nanotech': 2}
        - Otorgamos 5 puntos por cada empresa dentro del cluster.
        - Ratio entre startups y empresas grandes: debe ser entre 0,4 y 0,6:
            -Definimos como startup aquellas posteriores o iguales a 2010 y con un máximo de      300 empleados.
            - Definimos como empresa grande aquellas fundadas entre 1990 y 2009, o con más de 300 empleados.
        - Si la empresa tiene IPO, investments, money raised, acquisition.price amount, le sumamos 10.
        Multiplicamos todos esos puntos por 0,5
    - Cogemos el money raised total y lo dividimos entre el número de empresas. Lo multiplicamos por 0,5 y lo sumamos con el otro valor.

In [71]:
def money_raised(s):
    m = re.findall(r"[-+]?\d*\.\d+|\d+", s)[0]
    if '$' in s:
        factor = 1
    elif '€' in s:
        factor = 1.2
    else:
        s = s[1:]
        factor = 1
    m = float(m)
    if 'k' in s:
        return m * 1000 * factor
    if 'm' in s:
        return m * 1000 * factor
    if 'b' in s:
        return m * 1000 * factor
    else:
        return float(m) * factor
    
def compute_raised_money(df):
    money = 0
    for index, row in df.iterrows():
        money += money_raised(row['total_money_raised'])
    return money/len(df)

In [72]:
def check_mandatory_sectors(df):
    mandatory_cats = ['web', 'software', 'social', 'games_video', 'network_hosting', 'search', 'ecommerce']
    for index, row in df.iterrows():
        if row['category_code'] in mandatory_cats:
            return True
    return False

In [73]:
def get_investments_points(row):
    ipo = row['ipo'] != None
    raised = row['total_money_raised'] != '$0'
    acquisition = row['acquisition'] != None
    investments = len(row['investments']) != 0
    return ipo or raised or acquisition or investments

def get_sector_points(df):
    ac = 0
    category_points = {'web': 5, 'software': 5, 'social': 5, 'games_video': 10,
                    'network_hosting': 2, 'photo_video': 3, 'mobile': 2, 'search': 5, 
                    'ecommerce': 5, 'consulting': 2, 'nanotech': 2}
    for index, row in df.iterrows():
        ac += (category_points[row['category_code']] + get_investments_points(row) * 10)
    return ac

In [74]:
def company_size_ratio(df):
    start_ups = df[(df['founded_year'] > 2005) & (df['number_of_employees'] < 300)]
    consolidated = df[(df['founded_year'] <= 2009) | (df['number_of_employees'] >= 300)]
    
    if len(consolidated != 0):
        ratio = len(start_ups)/len(consolidated)
        return  ratio > 0.35 and ratio < 0.65
    else:
        return False

In [75]:
def filter_and_get_points(df):
    if not check_mandatory_sectors(df) or not company_size_ratio(df) or len(df) < 30:
        return[0, 0]
    points = get_sector_points(df)
    money = compute_raised_money(df)
    return [points, money]

In [76]:
data = []
for index, row in target_companies.iterrows():
    prospects = get_near_offices(row)
    results = (filter_and_get_points(prospects))
    longitude = row['longitude']
    latitude = row['latitude']
    location = row['loc']
    data.append([results[0], results[1], longitude, latitude, location])

In [77]:
results_df = pd.DataFrame(data, columns = ['points', 'money', 'longitude', 'latitude', 'loc'])
results_df = results_df[results_df['points'] > 0]
results_df = results_df.reset_index(drop = True)

In [78]:
#Normalizamos las columnas de puntos y dinero recaudado, creando un nuevo dataset

x = results_df[['points', 'money']]
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df = pd.DataFrame(x_scaled, columns = ['points', 'money'])
df.reset_index

locations_df = results_df[['longitude', 'latitude', 'loc']]
locations_df = locations_df.reset_index(drop = True)

normalized_result = pd.concat([df, locations_df], axis = 1)

display(normalized_result.head(10))

  return self.partial_fit(X, y)


Unnamed: 0,points,money,longitude,latitude,loc
0,0.110204,0.407616,-122.333253,47.603122,"(-122.333253, 47.603122)"
1,0.206122,0.225153,-73.996431,40.723731,"(-73.9964312, 40.7237306)"
2,0.490306,0.28946,-73.979247,40.755716,"(-73.9792469, 40.7557162)"
3,0.559184,0.603305,-122.416924,37.776805,"(-122.4169244, 37.7768052)"
4,0.467347,0.271891,-73.985506,40.757929,"(-73.985506, 40.757929)"
5,0.739796,0.819724,-122.395289,37.778613,"(-122.395289, 37.778613)"
6,0.25051,0.256524,-73.995722,40.72604,"(-73.995722, 40.72604)"
7,0.689286,0.698718,-122.402567,37.793148,"(-122.402567, 37.793148)"
8,0.109694,0.24983,-122.151198,37.44296,"(-122.151198, 37.44296)"
9,0.204592,0.866523,-122.397224,37.762541,"(-122.397224, 37.762541)"


In [79]:
normalized_result['total'] = normalized_result['points'] + normalized_result['money']
normalized_result = normalized_result.sort_values('total', ascending = False)
display(normalized_result.head(20))

Unnamed: 0,points,money,longitude,latitude,loc,total
286,1.0,0.750258,-122.407709,37.781754,"(-122.407709, 37.781754)",1.750258
321,1.0,0.750258,-122.407709,37.781754,"(-122.407709, 37.781754)",1.750258
135,0.97449,0.766746,-122.406912,37.781002,"(-122.406912, 37.781002)",1.741235
185,0.995918,0.738346,-122.408646,37.784137,"(-122.408646, 37.784137)",1.734264
11,0.964796,0.76254,-122.402195,37.786183,"(-122.402195, 37.786183)",1.727336
561,0.964796,0.76254,-122.404392,37.786905,"(-122.4043924, 37.7869047)",1.727336
347,0.964796,0.76254,-122.404403,37.78691,"(-122.404403, 37.78691)",1.727336
312,0.964796,0.76254,-122.404403,37.78691,"(-122.404403, 37.78691)",1.727336
365,0.972959,0.75369,-122.400591,37.782163,"(-122.400591, 37.782163)",1.726649
15,0.972959,0.75369,-122.401116,37.782103,"(-122.401116, 37.782103)",1.726649


In [80]:
lat = normalized_result['latitude'][0]
lng = normalized_result['longitude'][0]
url = 'https://maps.googleapis.com/maps/api/geocode/json?latlng={},{}&key={}'.format(lat, lng, key)

response = requests.get(url)
print(response.text)

{
   "error_message" : "You have exceeded your daily request quota for this API. If you did not set a custom daily request quota, verify your project has an active billing account: http://g.co/dev/maps-no-account",
   "results" : [],
   "status" : "OVER_QUERY_LIMIT"
}



In [28]:
normalized_result.to_csv('companies_results_normalized.csv')

Tras aplicar el algoritmo, vemos que muchas de las zonas preeliminares han desaparecido, quedando ya solo las que se muestran en este mapa_

In [47]:
offices_locs = normalized_result[['latitude', 'longitude']]

offices_layer = gmaps.symbol_layer(
    offices_locs, fill_color="green", stroke_color="blue", scale=2
)
fig = gmaps.figure()
fig.add_layer(offices_layer)
display(fig)

Figure(layout=FigureLayout(height='420px'))

Observando los mismos resultados en un mapa de calor, sin tener en cuenta las puntuaciones, vemos que hay varias regiones preseleccionadas, aunque por volumen de empresas sale resaltada San Francisco.

In [48]:
fig = gmaps.figure(map_type='SATELLITE')

heatmap_layer = gmaps.heatmap_layer(offices_locs)
fig.add_layer(heatmap_layer)
fig

Figure(layout=FigureLayout(height='420px'))

Teniendo en cuenta el valor de la puntuación total como pesos, vemos como Londres se desestima en el mapa, siendo la zona más adecuada el cluster de San Francisco.

In [49]:
fig = gmaps.figure()

weights = normalized_result['total']

heatmap_layer = gmaps.heatmap_layer(offices_locs)
fig.add_layer(gmaps.heatmap_layer(offices_locs, weights = weights))
fig

Figure(layout=FigureLayout(height='420px'))

In [50]:
fig = gmaps.figure(map_type='SATELLITE')
heatmap_layer = gmaps.heatmap_layer(offices_locs)
fig.add_layer(gmaps.heatmap_layer(offices_locs, weights = weights))
fig

Figure(layout=FigureLayout(height='420px'))