In [51]:
import pandas as pd
from pymongo import MongoClient
from pandas.io.json import json_normalize
pd.set_option('display.max_columns', 500)
client = MongoClient ('localhost', 27017)
data = client['companies'].companies

In [52]:
meta = ['name', 'category_code', 'number_of_employees', 'founded_year', 'total_money_raised', 'ipo', 'acquisition']

def get_locations(data):
    return json_normalize(data, record_path = 'offices', meta = meta, errors='ignore')

def get_companies_df(data):
    #df = pd.DataFrame(data)
    locations = get_locations(data)
    return(locations)
    
    '''
    gelocs = locations.apply(lambda e: [e["coord"][0],e["coord"][1]], result_type="expand", axis=1)
    clean_df = pd.concat([bks["restaurant_id"],locations[["street","zipcode"]],gelocs], axis=1)
    clean_df.rename({1:"lat",0:"long"}, axis=1, inplace=True)
    return clean_df
    '''

In [53]:
#Primero, analizamos las diferentes categorías disponibles en la totalidad del data set:

categories_df = pd.DataFrame(data.find({}, {"name": 1, "category_code": 1, "_id": 0}))
print(categories_df['category_code'].unique())

['web' 'enterprise' 'software' 'news' 'social' 'network_hosting'
 'games_video' 'music' 'mobile' 'search' 'advertising' 'messaging'
 'security' 'photo_video' 'finance' 'hardware' 'ecommerce' 'travel'
 'public_relations' 'other' 'real_estate' 'semiconductor' 'analytics'
 'health' 'legal' 'sports' 'biotech' 'cleantech' 'education' 'consulting'
 'transportation' None 'hospitality' 'fashion' 'nonprofit' 'nanotech'
 'automotive' 'design' 'manufacturing' 'government' 'local' 'medical']


In [54]:
#Escogemos las categorías que más relación pueden tener con nuestros sector, los videjuegos, y añadimos alguna 
#que pertenezca a sectores de apoyo como la consultoría.

categories = ['web', 'software', 'social', 'network_hosting'
 'games_video', 'photo_video', 'mobile', 'search', 'ecommerce', 'consulting', 'nanotech']

Buscamos todas aquellas que:
- Su fecha de fundación es posterior a 1990.
- Pertenecen a las categorías listadas en el punto anterior.
- Tienen al menos una oficina con coordenadas válidas.
- Cumplen alguna de las siguientes características:
    - Tienen IPO (es decir, han sacado cotización en bolsa).
    - Han sido adquiridos y disponen de un valor de precio de adquisición.
    - Han conseguido levantar inversiones (inversiones no igual a cero).
    - Han realizado inversiones en otras empresas.
    - Su cifra de empleados es inferior a 100.
    - Su fundación es muy reciente: 2010 en adelante.
 

In [55]:
filtered_data = data.find({
    'founded_year': {'$gte': 1990},
    'offices': {'$exists': True, '$ne': []},
    'offices.latitude': {'$ne': None}, 
    'offices.longitude': {'$ne': None},
    'category_code': {'$in': categories},
    '$or': [
        #{'ipo': {'$ne': None}},
        {'ipo': {'$exists': True, '$ne': None}},
        {'acquisition.price_amount': {'$ne': None}},
        {'investments': {'$ne': []}},
        {'total_money_raised': {'$ne': "$0"}},
        {'number_of_employees': {'$lt': 500}},
        {'founded_year': {'$gte': 2010}}
    ], 
})

display(filtered_data.count())



3434

In [56]:
target_companies = get_companies_df(filtered_data)                   

In [57]:
display(target_companies.head())

Unnamed: 0,address1,address2,city,country_code,description,latitude,longitude,state_code,zip_code,name,category_code,number_of_employees,founded_year,total_money_raised,ipo,acquisition
0,710 - 2nd Avenue,Suite 1100,Seattle,USA,,47.603122,-122.333253,WA,98104.0,Wetpaint,web,47,2005,$39.8M,,"{'price_amount': 30000000, 'price_currency_cod..."
1,270 Lafayette Street,Suite 505,New York,USA,,40.723731,-73.996431,NY,10012.0,Wetpaint,web,47,2005,$39.8M,,"{'price_amount': 30000000, 'price_currency_cod..."
2,1601 Willow Road,,Menlo Park,USA,Headquarters,37.41605,-122.151801,CA,94025.0,Facebook,social,5299,2004,$2.43B,"{'valuation_amount': 104000000000, 'valuation_...",
3,,,Dublin,IRL,Europe HQ,53.344104,-6.267494,,,Facebook,social,5299,2004,$2.43B,"{'valuation_amount': 104000000000, 'valuation_...",
4,340 Madison Ave,,New York,USA,New York,40.755716,-73.979247,NY,10017.0,Facebook,social,5299,2004,$2.43B,"{'valuation_amount': 104000000000, 'valuation_...",


In [58]:
target_companies.shape

(3699, 16)

In [59]:
target_companies.describe()

Unnamed: 0,latitude,longitude,founded_year
count,3699.0,3699.0,3699.0
mean,38.178384,-65.965888,2004.794269
std,14.802998,63.805113,3.723997
min,-41.296454,-159.480262,1990.0
25%,36.676994,-119.306607,2003.0
50%,39.568519,-80.83722,2006.0
75%,44.918213,-3.70325,2007.0
max,65.056601,175.2604,2013.0


In [60]:
target_companies['loc'] = list(zip(target_companies['longitude'], target_companies['latitude']))
#Método alternativo
'''
target_companies['loc'] = target_companies.apply(lambda row: (row['longitude'], row['latitude']), axis = 1)
'''
display(target_companies.head())

Unnamed: 0,address1,address2,city,country_code,description,latitude,longitude,state_code,zip_code,name,category_code,number_of_employees,founded_year,total_money_raised,ipo,acquisition,loc
0,710 - 2nd Avenue,Suite 1100,Seattle,USA,,47.603122,-122.333253,WA,98104.0,Wetpaint,web,47,2005,$39.8M,,"{'price_amount': 30000000, 'price_currency_cod...","(-122.333253, 47.603122)"
1,270 Lafayette Street,Suite 505,New York,USA,,40.723731,-73.996431,NY,10012.0,Wetpaint,web,47,2005,$39.8M,,"{'price_amount': 30000000, 'price_currency_cod...","(-73.9964312, 40.7237306)"
2,1601 Willow Road,,Menlo Park,USA,Headquarters,37.41605,-122.151801,CA,94025.0,Facebook,social,5299,2004,$2.43B,"{'valuation_amount': 104000000000, 'valuation_...",,"(-122.151801, 37.41605)"
3,,,Dublin,IRL,Europe HQ,53.344104,-6.267494,,,Facebook,social,5299,2004,$2.43B,"{'valuation_amount': 104000000000, 'valuation_...",,"(-6.267494, 53.344104)"
4,340 Madison Ave,,New York,USA,New York,40.755716,-73.979247,NY,10017.0,Facebook,social,5299,2004,$2.43B,"{'valuation_amount': 104000000000, 'valuation_...",,"(-73.9792469, 40.7557162)"


In [61]:
target_companies.to_json("target_companies.json", orient="records", lines=True)
target_companies.to_csv("target_companies.csv")

In [64]:
target = client['companies'].target_companies
nearLocation = {
    "lng": 51.51692,
    "lat": -0.089732
}

prospects = pd.DataFrame(target.find({
    "loc": {
     "$near": {
       "$geometry": {
          "type": "Point" ,
          "coordinates": [ nearLocation["lat"] , nearLocation["lng"] ]
       },
       "$maxDistance": 10000, # In meters
     }
   }
}))
display(prospects)



Unnamed: 0,_id,acquisition,address1,address2,category_code,city,country_code,description,founded_year,ipo,latitude,loc,longitude,name,number_of_employees,state_code,total_money_raised,zip_code
0,5cda96da3370c10c1579b721,,1 Angel Court,,web,London,GBR,,2007,,51.514735,"[-0.0874239, 51.5147349]",-0.087424,Greenvoice,3.0,,$0,EC2R 7HJ
1,5cda96da3370c10c1579ac12,,107 Cheapside,,web,London,GBR,London Office,2007,,51.514157,"[-0.0934254, 51.5141566]",-0.093425,Zemanta,30.0,,$7.35M,EC2V 6DY
2,5cda96da3370c10c1579b3b4,,16 St Martin's Le Grand,,software,London,GBR,London Office,2002,,51.515768,"[-0.0970749, 51.5157682]",-0.097075,BrightTALK,,,$20.5M,EC1A 4NA
3,5cda96da3370c10c1579b6f5,,33-37 Charterhouse Sq.,,web,London,GBR,London Office,2008,,51.520327,"[-0.0994596, 51.5203268]",-0.099460,Zookel,10.0,,$0,EC1M 6EA
4,5cda96da3370c10c1579b447,"{'price_amount': None, 'price_currency_code': ...","11 Curtain Road,","2nd Floor, THe Courtyard Building",ecommerce,London,GBR,LaunchPad,2009,,51.519613,"[-0.1020265, 51.5196135]",-0.102027,Shutl,10.0,,£7.66M,EC2A3LT
5,5cda96da3370c10c1579ab66,,17 Blossom Street,,web,London,GBR,HQ,2007,,51.521116,"[-0.0778345, 51.5211159]",-0.077835,Tipped,2.0,,$0,E1 6PL
6,5cda96da3370c10c1579ac08,,57-63 Scrutton Street,,web,London,GBR,London Office,2007,,51.523537,"[-0.0808503, 51.5235368]",-0.080850,HelloTxt,2.0,,$0,EC2A 4PF
7,5cda96da3370c10c1579b795,,6 Snow Hill,,software,London,GBR,TestPlant,2008,,51.517356,"[-0.1037742, 51.517356]",-0.103774,Testplant,50.0,,$2.56M,EC1A 2AY
8,5cda96da3370c10c1579b1e8,,77 Leonard Street,,web,London,GBR,London,2002,,51.524662,"[-0.0830772, 51.5246619]",-0.083077,Solid State Group,18.0,,$0,EC2A 4QS
9,5cda96da3370c10c1579b718,,61 Charlotte Road,,search,London,GBR,Headquarters,2006,,51.524662,"[-0.0830772, 51.5246619]",-0.083077,Tug,25.0,,$0,EC2A 3QT


In [20]:
def get_near_offices(row):
    nearLocation = {
        "lng": row["longitude"],
        "lat": row["latitude"]
    }

    prospects = pd.DataFrame(target.find({
        "loc": {
         "$near": {
           "$geometry": {
              "type": "Point" ,
               "coordinates": [ nearLocation["lng"] , nearLocation["lat"] ]
              #"coordinates": [ nearLocation["lat"] , nearLocation["lng"] ]
           },
           "$maxDistance": 2000, # In meters
         }
       }
    }))
    
    return prospects

In [21]:
for index, row in target_companies.iterrows():
    prospects = get_near_offices(row)
    print(len(prospects))
    

40
57
11
8
112
6
6
114
8
9
10
107
25
6
142
16
20
5
63
137
2
39
20
1
1
39
28
52
7
14
26
7
13
149
10
3
9
183
13
8
16
18
12
1
86
21
18
27
70
135
182
6
33
56
1
35
36
18
4
53
10
121
134
28
15
6
41
20
39
10
3
86
1
1
24
48
10
6
16
33
61
1
2
4
4
139
101
5
4
2
5
4
11
31
2
37
1
92
19
1
20
86
127
2
30
36
22
38
107
1
9
146
1
23
12
39
34
11
24
10
6
37
9
33
33
37
1
4
14
1
33
5
129
133
9
8
41
18
1
14
12
33
55
20
39
33
11
7
1
126
101
1
3
33
33
182
1
4
4
6
16
107
22
12
54
30
33
86
38
1
1
19
29
3
93
1
35
1
127
107
2
3
1
2
1
4
8
2
76
10
14
3
1
1
36
2
12
12
13
33
33
50
13
1
24
1
3
1
7
150
26
1
10
18
1
19
1
1
12
4
135
3
4
1
98
2
4
2
5
36
7
121
14
99
40
5
37
86
2
1
11
1
2
6
1
46
34
1
36
137
12
124
46
8
4
3
57
1
12
10
1
144
1
134
6
143
144
1
49
1
90
15
40
4
1
8
10
18
18
18
5
149
11
2
34
1
19
2
33
23
4
40
24
13
33
7
33
36
33
3
40
10
10
95
13
36
1
4
2
12
5
133
107
36
9
30
1
7
1
2
1
59
51
2
1
13
1
1
33
3
1
1
16
33
33
85
33
169
129
5
9
33
86
36
1
34
138
145
1
124
4
6
118
14
10
1
14
4
9
146
7
1
6
65
123
3
5
7
33


1
5
14
2
1
2
3
1
3
12
3
3
8
1
1
1
3
3
1
2
1
165
15
1
1
27
8
1
1
3
123
1
4
1
2
2
6
14
12
3
1
33
1
2
41
13
1
14
1
1
1
1
1
5
10
3
4
1
4
1
149
1
1
1
75
122
17
1
1
15
1
31
18
1
33
5
1
2
14
1
86
1
5
18
2
4
14
12
15
37
1
16
4
101
3
12
3
1
1
5
28
35
6
123
1
5
22
9
121
6
3
1
1
3
1
31
13
1
1
1
3
1
11
5
18
1
1
13
3
13
1
1
1
2
1
3
30
18
13
7
36
92
11
1
4
115
1
1
17
1
2
2
3
7
1
40
15
14
2
1
1
7
21
2
2
6
2
1
1
1
3
1
4
1
1
1
2
10
2
1
9
20
1
9
13
3
1
4
1
13
12
1
74
1
1
3
2
47
35
1
1
18
37
2
27
45
75
2
6
7
7
2
1
39
2
4
1
183
129
1
1
7
2
2
1
2
1
1
1
2
1
36
5
12
1
18
1
2
12
6
2
18
8
14
1
1
6
5
9
1
8
1
1
24
8
1
20
1
3
2
3
2
12
106
1
1
1
1
1
37
132
2
5
2
4
1
50
1
50
1
122
15
27
56
1
1
1
8
1
39
1
40
4
1
1


- VISUALIZATION PROJECT Geospatial Business Intelligence (BI)
    * Make a geospartial analysis of the `companies` dataset
    * Things you know:
        - You have a software company with 50 employees
        - The company creates video games
        - Roles in your company: 20 developers, 20 Designers/Creatieves/UX/UI and 10 executives/managers
    * Do an analysis about placing the new company offices in the best environment based on the following criteria:
        - There should be software engineers working around
        - The surroundings must have a good ratio of big companies vs startups
        - Ensure you have in your surroundings companies that cover the interests of your team
        - Avoid old companies, prefer recently created ones

- Para cada cluster:
        - Aplicamos solo si el número de empresas cercanas es superior a 30.
        - Necesario: compañía software cerca: ['web', 'software', 'social', 'games_video','network_hosting', 'search', 'ecommerce']
        - Otorgamos puntos según el sector de la empresa:
{'web': 5, 'software': 5, 'social': 5, 'games_video': 10,'network_hosting': 2, 'photo_video': 3, 'mobile': 2, 'search': 5, 'ecommerce': 5, 'consulting': 2, 'nanotech': 2}
        - Otorgamos 5 puntos por cada empresa dentro del cluster.
        - Ratio entre startups y empresas grandes: debe ser entre 0,4 y 0,6:
            -Definimos como startup aquellas posteriores o iguales a 2010 y con un máximo de      300 empleados.
            - Definimos como empresa grande aquellas fundadas entre 1990 y 2009, o con más de 300 empleados.
        - Si la empresa tiene IPO, investments, money raised, acquisition.price amount, le sumamos 10.
        Multiplicamos todos esos puntos por 0,5
    - Cogemos el money raised total y lo dividimos entre el número de empresas. Lo multiplicamos por 0,5 y lo sumamos con el otro valor.

In [69]:
def money_raised(money):
  factors = {'k': 1000,
  'M': 1000000,
  'B': 1000000000}
  return(float(money[1:-1])* factors[money[-1]])

mandatory_cats = ['web', 'software', 'social', 'games_video',
                  'network_hosting', 'search', 'ecommerce']



In [65]:
display(prospects)

Unnamed: 0,_id,acquisition,address1,address2,category_code,city,country_code,description,founded_year,ipo,latitude,loc,longitude,name,number_of_employees,state_code,total_money_raised,zip_code
0,5cda96da3370c10c1579b721,,1 Angel Court,,web,London,GBR,,2007,,51.514735,"[-0.0874239, 51.5147349]",-0.087424,Greenvoice,3.0,,$0,EC2R 7HJ
1,5cda96da3370c10c1579ac12,,107 Cheapside,,web,London,GBR,London Office,2007,,51.514157,"[-0.0934254, 51.5141566]",-0.093425,Zemanta,30.0,,$7.35M,EC2V 6DY
2,5cda96da3370c10c1579b3b4,,16 St Martin's Le Grand,,software,London,GBR,London Office,2002,,51.515768,"[-0.0970749, 51.5157682]",-0.097075,BrightTALK,,,$20.5M,EC1A 4NA
3,5cda96da3370c10c1579b6f5,,33-37 Charterhouse Sq.,,web,London,GBR,London Office,2008,,51.520327,"[-0.0994596, 51.5203268]",-0.099460,Zookel,10.0,,$0,EC1M 6EA
4,5cda96da3370c10c1579b447,"{'price_amount': None, 'price_currency_code': ...","11 Curtain Road,","2nd Floor, THe Courtyard Building",ecommerce,London,GBR,LaunchPad,2009,,51.519613,"[-0.1020265, 51.5196135]",-0.102027,Shutl,10.0,,£7.66M,EC2A3LT
5,5cda96da3370c10c1579ab66,,17 Blossom Street,,web,London,GBR,HQ,2007,,51.521116,"[-0.0778345, 51.5211159]",-0.077835,Tipped,2.0,,$0,E1 6PL
6,5cda96da3370c10c1579ac08,,57-63 Scrutton Street,,web,London,GBR,London Office,2007,,51.523537,"[-0.0808503, 51.5235368]",-0.080850,HelloTxt,2.0,,$0,EC2A 4PF
7,5cda96da3370c10c1579b795,,6 Snow Hill,,software,London,GBR,TestPlant,2008,,51.517356,"[-0.1037742, 51.517356]",-0.103774,Testplant,50.0,,$2.56M,EC1A 2AY
8,5cda96da3370c10c1579b1e8,,77 Leonard Street,,web,London,GBR,London,2002,,51.524662,"[-0.0830772, 51.5246619]",-0.083077,Solid State Group,18.0,,$0,EC2A 4QS
9,5cda96da3370c10c1579b718,,61 Charlotte Road,,search,London,GBR,Headquarters,2006,,51.524662,"[-0.0830772, 51.5246619]",-0.083077,Tug,25.0,,$0,EC2A 3QT


In [66]:
display(len(prospects) > 30)

True

In [70]:
def check_mandatory_sectors(df):
    for index, row in df.iterrows():
        if row['category_code'] in mandatory_cats:
            print(row['category_code'])
            return True
    return False

display(check_mandatory_sectors(prospects))

web


True

In [72]:
def get_sector_points(df):
    ac = 0
    category_points = {'web': 5, 'software': 5, 'social': 5, 'games_video': 10,
                    'network_hosting': 2, 'photo_video': 3, 'mobile': 2, 'search': 5, 
                    'ecommerce': 5, 'consulting': 2, 'nanotech': 2}
    for index, row in df.iterrows():
        ac += category_points[row['category_code']]
        print(ac)
    return ac

display(get_sector_points(prospects))
    

5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
100
105
107
112
117
122
127
129
134
139
144
149
154
159
164
166
171
176
181
186
191
196
201
206
211
213
218
220
225
230
235
240
242
247
252
257
262
267
272
277
282
287
289
294
299
304
306
311
316
318
323
328
333
335
337
339
344
349
351
356
361
366
371
373
378
383
388
390
395
400
405
410
415
417
422
427
432
437
439
444
449
454
459
461
463


463