In [1]:
from pymongo import MongoClient
import pandas as pd
from pandas.io.json import json_normalize

In [2]:
# Importing new DataFrame
client = MongoClient('localhost', 27017)
collection = client['companies'].companies_2
data = pd.DataFrame(list(collection.find()))
data = data[['name', 'founded_year', 'category_code', 'number_of_employees', 'type: 1: startup, 2: small/mid corp, 3:big corp',
                    'latitude', 'longitude', 'geojson']]
data.head()

Unnamed: 0,name,founded_year,category_code,number_of_employees,"type: 1: startup, 2: small/mid corp, 3:big corp",latitude,longitude,geojson
0,MeetMoi,2007,social,15,1,40.757929,-73.985506,"{'type': 'Point', 'coordinates': [-73.985506, ..."
1,Geni,2006,web,18,1,34.090368,-118.393064,"{'type': 'Point', 'coordinates': [-118.393064,..."
2,Facebook,2004,social,5299,3,37.41605,-122.151801,"{'type': 'Point', 'coordinates': [-122.151801,..."
3,Facebook,2004,social,5299,3,40.755716,-73.979247,"{'type': 'Point', 'coordinates': [-73.9792469,..."
4,Facebook,2004,social,5299,3,53.344104,-6.267494,"{'type': 'Point', 'coordinates': [-6.267494, 5..."


In [3]:
# Defining a function geo query to get number of near companies of each company (2km around)

def get_near_companies(geojson):
    nearcomps = pd.DataFrame(list(collection.find(
        {'geojson': 
         { '$near': 
          {'$geometry': geojson, '$maxDistance': 2000}}}, 
        {'name'})))
    return len(nearcomps)


In [4]:
# Function geo query to get the sum of the total number employees of the companies around each company of the DataFrame

def get_near_employees(geojson):
    nearemps = pd.DataFrame(list(collection.find(
        {'geojson': 
         { '$near': 
          {'$geometry': geojson, '$maxDistance': 2000}}})))
    return sum(nearemps['number_of_employees'])


In [5]:
# Function to get ratio startup/big corp of each company of DataFrame. This is a special ratio where equal number of startups for each big corp is valued at '2'

def get_ratio(geojson):
    ratio = pd.DataFrame(list(collection.find(
        {'geojson': 
         { '$near': 
          {'$geometry': geojson, '$maxDistance': 2000}}})))
    return sum(ratio['type: 1: startup, 2: small/mid corp, 3:big corp'])/len(ratio)


In [6]:
# Calling all functions

data['near_companies'] = data['geojson'].apply(lambda x: get_near_companies(x))
data['employees'] = data['geojson'].apply(lambda x: get_near_employees(x))
data['ratio'] = data['geojson'].apply(lambda x: get_ratio(x))

In [7]:
# Here we drop all the points with less than 50 companies around and we create the column ratio2 with the distance to the startup/big corp equallity

data_final = data[(data['near_companies'] > 50)]
data_final['ratio_distance'] = ((data_final['ratio']-2)**2)**(1/2)
data_final.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,name,founded_year,category_code,number_of_employees,"type: 1: startup, 2: small/mid corp, 3:big corp",latitude,longitude,geojson,near_companies,employees,ratio,ratio_distance
0,MeetMoi,2007,social,15,1,40.757929,-73.985506,"{'type': 'Point', 'coordinates': [-73.985506, ...",56,6911,1.267857,0.732143
3,Facebook,2004,social,5299,3,40.755716,-73.979247,"{'type': 'Point', 'coordinates': [-73.9792469,...",56,6976,1.285714,0.714286
8,Twitter,2006,social,1300,3,37.776805,-122.416924,"{'type': 'Point', 'coordinates': [-122.4169244...",56,3389,1.267857,0.732143
9,Kyte,2006,games_video,40,1,37.788482,-122.409173,"{'type': 'Point', 'coordinates': [-122.409173,...",94,5482,1.287234,0.712766
35,GoingOn,2008,software,40,1,37.782263,-122.392142,"{'type': 'Point', 'coordinates': [-122.392142,...",74,3800,1.283784,0.716216


In [10]:
# Getting a new DataFrame with all values with a distance to perfect equallity of more than 1 points.
# 1 point ratio distance means that # startups or # big corps doubles the other.

data_final = data_final[(data_final['ratio_distance'] < 1)]
data_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 135 entries, 0 to 2153
Data columns (total 12 columns):
name                                               135 non-null object
founded_year                                       135 non-null int64
category_code                                      135 non-null object
number_of_employees                                135 non-null int64
type: 1: startup, 2: small/mid corp, 3:big corp    135 non-null int64
latitude                                           135 non-null float64
longitude                                          135 non-null float64
geojson                                            135 non-null object
near_companies                                     135 non-null int64
employees                                          135 non-null int64
ratio                                              135 non-null float64
ratio_distance                                     135 non-null float64
dtypes: float64(4), int64(5), object(3)
mem

In [11]:
final = [(data_final['ratio_distance'].min())]
df_final = pd.DataFrame(final)
df_final.head(50)


Unnamed: 0,0
0,0.662162
