In [1]:
from pymongo import MongoClient
import pandas as pd
from pandas.io.json import json_normalize

In [2]:
# Importing new DataFrame
client = MongoClient('localhost', 27017)
collection = client['companies'].companies_v2
data = pd.DataFrame(list(collection.find()))
data = data[['name', 'founded_year', 'category_code', 'number_of_employees', 'rounds', 'total_raised_MnUSD', 'round_average_MnUSD', 
             'type: 1: startup, 2: small/mid corp, 3:big corp', 'latitude', 'longitude', 'geojson']]
data.head()

Unnamed: 0,name,founded_year,category_code,number_of_employees,rounds,total_raised_MnUSD,round_average_MnUSD,"type: 1: startup, 2: small/mid corp, 3:big corp",latitude,longitude,geojson
0,Geni,2006,web,18,3.0,16.5,5.5,2,34.090368,-118.393064,"{'type': 'Point', 'coordinates': [-118.393064,..."
1,Facebook,2004,social,5299,10.0,2325.7,232.6,3,37.41605,-122.151801,"{'type': 'Point', 'coordinates': [-122.151801,..."
2,Twitter,2006,social,1300,7.0,760.2,108.6,3,37.776805,-122.416924,"{'type': 'Point', 'coordinates': [-122.4169244..."
3,Plaxo,2002,web,50,4.0,28.3,7.1,2,37.387845,-122.055197,"{'type': 'Point', 'coordinates': [-122.055197,..."
4,Facebook,2004,social,5299,10.0,2325.7,232.6,3,40.755716,-73.979247,"{'type': 'Point', 'coordinates': [-73.9792469,..."


In [3]:
# Defining a function geo query to get number of near companies of each company (2km around)

def get_near_companies(geojson):
    nearcomps = pd.DataFrame(list(collection.find(
        {'geojson': 
         { '$near': 
          {'$geometry': geojson, '$maxDistance': 3000}}}, 
        {'name'})))
    return len(nearcomps)


In [4]:
# Function geo query to get the sum of the total number employees of the companies around each company of the DataFrame

def get_near_employees(geojson):
    nearemps = pd.DataFrame(list(collection.find(
        {'geojson': 
         { '$near': 
          {'$geometry': geojson, '$maxDistance': 3000}}})))
    return sum(nearemps['number_of_employees'])


In [5]:
# Function to get ratio startup/big corp of each company of DataFrame. This is a special ratio where equal number of startups for each big corp is valued at '2'. I want 4 startups per big corp so my desire is to get a 1.4 ratio or aprox there

def get_ratio(geojson):
    ratio = pd.DataFrame(list(collection.find(
        {'geojson': 
         { '$near': 
          {'$geometry': geojson, '$maxDistance': 3000}}})))
    return sum(ratio['type: 1: startup, 2: small/mid corp, 3:big corp'])/len(ratio)


In [6]:
# Function to get ratio startup/big corp of each company of DataFrame. This is a special ratio where equal number of startups for each big corp is valued at '2'

def get_investments(geojson):
    investments = pd.DataFrame(list(collection.find(
        {'geojson': 
         { '$near': 
          {'$geometry': geojson, '$maxDistance': 3000}}})))
    invest=0
    for i in range(len(investments)):
        if data['type: 1: startup, 2: small/mid corp, 3:big corp'][i]==1:
            invest+=investments['total_raised_MnUSD'][i]
    return invest

In [7]:
# Calling all functions

data['near_companies'] = data['geojson'].apply(lambda x: get_near_companies(x))
data['employees'] = data['geojson'].apply(lambda x: get_near_employees(x))
data['ratio'] = data['geojson'].apply(lambda x: get_ratio(x))
data['investments'] = data['geojson'].apply(lambda x: get_investments(x))

data.head()

Unnamed: 0,name,founded_year,category_code,number_of_employees,rounds,total_raised_MnUSD,round_average_MnUSD,"type: 1: startup, 2: small/mid corp, 3:big corp",latitude,longitude,geojson,near_companies,employees,ratio,investments
0,Geni,2006,web,18,3.0,16.5,5.5,2,34.090368,-118.393064,"{'type': 'Point', 'coordinates': [-118.393064,...",2,118,2.0,0.0
1,Facebook,2004,social,5299,10.0,2325.7,232.6,3,37.41605,-122.151801,"{'type': 'Point', 'coordinates': [-122.151801,...",6,18830,2.333333,0.0
2,Twitter,2006,social,1300,7.0,760.2,108.6,3,37.776805,-122.416924,"{'type': 'Point', 'coordinates': [-122.4169244...",68,4969,1.838235,4.0
3,Plaxo,2002,web,50,4.0,28.3,7.1,2,37.387845,-122.055197,"{'type': 'Point', 'coordinates': [-122.055197,...",10,452,2.0,0.0
4,Facebook,2004,social,5299,10.0,2325.7,232.6,3,40.755716,-73.979247,"{'type': 'Point', 'coordinates': [-73.9792469,...",37,6446,1.783784,2.8


In [8]:
# Here we drop all the points with less than 50 companies around and we create the column ratio2 with the distance to the startup/big corp ratio desired. 

data_final = data[(data['near_companies'] > 20)]
data_final['ratio_distance'] = ((data_final['ratio']-2)**2)**(1/2)
data_final.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,name,founded_year,category_code,number_of_employees,rounds,total_raised_MnUSD,round_average_MnUSD,"type: 1: startup, 2: small/mid corp, 3:big corp",latitude,longitude,geojson,near_companies,employees,ratio,investments,ratio_distance
2,Twitter,2006,social,1300,7.0,760.2,108.6,3,37.776805,-122.416924,"{'type': 'Point', 'coordinates': [-122.4169244...",68,4969,1.838235,4.0,0.161765
4,Facebook,2004,social,5299,10.0,2325.7,232.6,3,40.755716,-73.979247,"{'type': 'Point', 'coordinates': [-73.9792469,...",37,6446,1.783784,2.8,0.216216
8,Kyte,2006,games_video,40,4.0,23.3,5.8,2,37.788482,-122.409173,"{'type': 'Point', 'coordinates': [-122.409173,...",63,4933,1.84127,0.6,0.15873
9,Meetup,2002,web,75,4.0,18.3,4.6,2,40.72604,-73.995722,"{'type': 'Point', 'coordinates': [-73.995722, ...",32,1454,1.8125,1.2,0.1875
10,MeetMoi,2007,social,15,3.0,5.6,1.9,2,40.757929,-73.985506,"{'type': 'Point', 'coordinates': [-73.985506, ...",35,6435,1.8,3.7,0.2


In [9]:
# Getting a new DataFrame with all values with a distance to perfect equallity of more than 1 points.
# 1 point ratio distance means that # startups or # big corps doubles the other.

data_final = data_final[(data_final['ratio_distance'] < 0.5)]


In [10]:
data_final = data_final.sort_values(by='investments', ascending=False)
data_final.head()

Unnamed: 0,name,founded_year,category_code,number_of_employees,rounds,total_raised_MnUSD,round_average_MnUSD,"type: 1: startup, 2: small/mid corp, 3:big corp",latitude,longitude,geojson,near_companies,employees,ratio,investments,ratio_distance
104,GoLark,2007,web,3,,0.0,0.0,2,40.7508,-73.996122,"{'type': 'Point', 'coordinates': [-73.996122, ...",43,6974,1.790698,2325.7,0.209302
635,StyleCaster,2008,web,60,2.0,5.0,2.5,2,40.74505,-73.990098,"{'type': 'Point', 'coordinates': [-73.9900984,...",41,6897,1.829268,2325.7,0.170732
98,Splunk,2003,software,500,3.0,40.0,13.3,2,37.78274,-122.390945,"{'type': 'Point', 'coordinates': [-122.390945,...",63,4897,1.825397,181.0,0.174603
28,GoingOn,2008,software,40,3.0,8.6,2.9,1,37.782263,-122.392142,"{'type': 'Point', 'coordinates': [-122.392142,...",65,4936,1.815385,175.1,0.184615
425,DocVerse,2007,web,5,1.0,1.3,1.3,2,37.788115,-122.397141,"{'type': 'Point', 'coordinates': [-122.397141,...",63,4927,1.84127,148.0,0.15873
