In [1]:
from pymongo import MongoClient
import pandas as pd
from pandas.io.json import json_normalize

Importing data from MongoDB and getting the list of categories


In [2]:
# Connect to the MongoDB server at localhost:27017 and select the `companies` database.
client = MongoClient(host='localhost', port=27017)
db = client['companies']

In [3]:
# Ask MongoDB for every distinct `category_code` value in the `companies` collection.
categories = db.companies.distinct('category_code')
print(list(categories))

['enterprise', 'software', 'web', 'network_hosting', 'news', 'games_video', 'music', 'mobile', 'social', 'search', 'advertising', 'messaging', 'security', 'photo_video', 'finance', 'ecommerce', 'hardware', 'travel', 'public_relations', 'other', 'real_estate', 'semiconductor', 'analytics', 'health', 'legal', 'sports', 'biotech', 'cleantech', 'education', 'consulting', 'transportation', None, 'hospitality', 'fashion', 'nonprofit', 'nanotech', 'automotive', 'design', 'manufacturing', 'government', 'local', 'medical']


Querying the companies that match the selected criteria and normalizing the result to one row per office

In [4]:
# Categories kept for this analysis.
TARGET_CATEGORIES = ['games_video', 'software', 'web', 'social']

# Query filter. Top-level keys of a MongoDB filter are combined with an implicit
# AND, so no explicit `$and` is needed; `$in` replaces the chain of `$or`
# equality checks on `category_code`.
company_filter = {
    'number_of_employees': {'$gt': 1},
    # `$eq: None` matches documents where the field is null or missing.
    'deadpooled_year': {'$eq': None},
    'offices.latitude': {'$exists': True, '$ne': None},
    'offices.longitude': {'$exists': True, '$ne': None},
    'category_code': {'$in': TARGET_CATEGORIES},
}

# Projection: the fields to fetch (each key listed once; `_id` is excluded).
# `deadpooled_year` and `acquisition.price_amount` are fetched but are not part
# of the `meta` list below, so they do not appear in the normalized frame.
company_fields = {
    '_id': 0,
    'name': 1,
    'category_code': 1,
    'founded_year': 1,
    'number_of_employees': 1,
    'offices.latitude': 1,
    'offices.longitude': 1,
    'offices.zip_code': 1,
    'deadpooled_year': 1,
    'acquisition.price_amount': 1,
}

cursor = db.companies.find(company_filter, company_fields)

# Flatten to one row per entry of each company's `offices` array, repeating the
# company-level `meta` fields on every office row.
# NOTE(review): newer pandas releases expose this function as `pd.json_normalize`;
# confirm the installed version still supports the `pandas.io.json` import.
data = json_normalize(
    data=cursor,
    record_path='offices',
    meta=['name', 'category_code', 'number_of_employees', 'founded_year'],
)
# Discard every row that still has a missing value in any column.
data = data.dropna()
data.head()


Unnamed: 0,latitude,longitude,zip_code,name,category_code,number_of_employees,founded_year
0,34.090368,-118.393064,90069.0,Geni,web,18,2006
1,40.757929,-73.985506,,MeetMoi,social,15,2007
2,37.776805,-122.416924,94103.0,Twitter,social,1300,2006
3,37.41605,-122.151801,94025.0,Facebook,social,5299,2004
4,53.344104,-6.267494,,Facebook,social,5299,2004


Loading the DataFrame with the companies' investment rounds, which was built in the sub-data_investments document and exported as a .csv

In [5]:
# Path of the CSV file with the investment data (despite the name, this is a string, not a pandas DataFrame).
DataFrame = 'data_investments.csv'
def read(DataFrame):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    DataFrame : str or path-like
        Location of the CSV file (the name is historical; it is a path,
        not a DataFrame object).

    Returns
    -------
    pandas.DataFrame
        The parsed file contents, using `pd.read_csv` defaults.
    """
    return pd.read_csv(DataFrame)

# Load the investments CSV into `data2` and preview its first rows.
data2 = read(DataFrame)
data2.head()

Unnamed: 0.1,Unnamed: 0,name,last_raised_amount_MnUSD,total_raised_MnUSD,round_average_MnUSD,rounds
0,2,Geni,5.0,16.5,5.5,3.0
1,5,MeetMoi,2.6,5.6,1.9,3.0
2,12,Twitter,400.0,760.2,108.6,7.0
3,24,Facebook,1500.0,2325.7,232.6,10.0
4,28,Plaxo,9.0,28.3,7.1,4.0


Merging the two DataFrames

In [6]:
# Attach the funding figures to every office row, matching on the company name.
# This is an inner join: rows whose `name` is absent from either frame are dropped.
data = data.merge(data2, how='inner', on='name')
data.head()

Unnamed: 0.1,latitude,longitude,zip_code,name,category_code,number_of_employees,founded_year,Unnamed: 0,last_raised_amount_MnUSD,total_raised_MnUSD,round_average_MnUSD,rounds
0,34.090368,-118.393064,90069.0,Geni,web,18,2006,2,5.0,16.5,5.5,3.0
1,40.757929,-73.985506,,MeetMoi,social,15,2007,5,2.6,5.6,1.9,3.0
2,37.776805,-122.416924,94103.0,Twitter,social,1300,2006,12,400.0,760.2,108.6,7.0
3,37.41605,-122.151801,94025.0,Facebook,social,5299,2004,24,1500.0,2325.7,232.6,10.0
4,53.344104,-6.267494,,Facebook,social,5299,2004,24,1500.0,2325.7,232.6,10.0


Classifying the companies by founding year, number of employees, total amount raised and number of funding rounds

In [7]:
def get_company_type(data_base):
    """Classify one company record as 1 (startup), 2 (small/mid corp) or 3 (big corp).

    Parameters
    ----------
    data_base : mapping (e.g. a DataFrame row or a dict)
        Must provide `number_of_employees`; `founded_year`,
        `total_raised_MnUSD` and `rounds` are only read when the earlier
        checks require them.

    Returns
    -------
    int
        1 when the company has at most 50 employees, was founded in 2008 or
        later, raised at most 200 (MnUSD) and had at most 5 rounds;
        3 when it has more than 200 employees and raised at least 200 (MnUSD);
        2 otherwise.
    """
    employees = data_base['number_of_employees']

    # Startup: small, recent, modest funding and few rounds.
    if (employees <= 50
            and data_base['founded_year'] >= 2008
            and data_base['total_raised_MnUSD'] <= 200
            and data_base['rounds'] <= 5):
        return 1

    # Big corp: large headcount and large total funding.
    if employees > 200 and data_base['total_raised_MnUSD'] >= 200:
        return 3

    # Everything else is treated as a small/mid corp.
    return 2

def get_class(dataframe):
    """Add the company-type column to `dataframe` and return it.

    Each row is classified with `get_company_type` and the result is stored
    in the column 'type: 1: startup, 2: small/mid corp, 3:big corp'.

    The function now operates on the `dataframe` argument itself; it used to
    read from and write to the global `data`, so it only worked when the
    caller happened to pass that exact global.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose rows provide the fields `get_company_type` reads.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, modified in place with the new
        column.
    """
    dataframe['type: 1: startup, 2: small/mid corp, 3:big corp'] = dataframe.apply(
        get_company_type, axis=1
    )
    return dataframe

# Add the company-type column to the merged frame and preview the result.
data = get_class(data)
data.head()

Unnamed: 0.1,latitude,longitude,zip_code,name,category_code,number_of_employees,founded_year,Unnamed: 0,last_raised_amount_MnUSD,total_raised_MnUSD,round_average_MnUSD,rounds,"type: 1: startup, 2: small/mid corp, 3:big corp"
0,34.090368,-118.393064,90069.0,Geni,web,18,2006,2,5.0,16.5,5.5,3.0,2
1,40.757929,-73.985506,,MeetMoi,social,15,2007,5,2.6,5.6,1.9,3.0,2
2,37.776805,-122.416924,94103.0,Twitter,social,1300,2006,12,400.0,760.2,108.6,7.0,3
3,37.41605,-122.151801,94025.0,Facebook,social,5299,2004,24,1500.0,2325.7,232.6,10.0,3
4,53.344104,-6.267494,,Facebook,social,5299,2004,24,1500.0,2325.7,232.6,10.0,3


Dropping the columns I will not use: zip_code and Unnamed: 0 (the index of the second DataFrame)

In [8]:
# Columns that are not needed from here on.
unused_columns = ['zip_code', 'Unnamed: 0']
data = data.drop(columns=unused_columns)
data.head()

Unnamed: 0,latitude,longitude,name,category_code,number_of_employees,founded_year,last_raised_amount_MnUSD,total_raised_MnUSD,round_average_MnUSD,rounds,"type: 1: startup, 2: small/mid corp, 3:big corp"
0,34.090368,-118.393064,Geni,web,18,2006,5.0,16.5,5.5,3.0,2
1,40.757929,-73.985506,MeetMoi,social,15,2007,2.6,5.6,1.9,3.0,2
2,37.776805,-122.416924,Twitter,social,1300,2006,400.0,760.2,108.6,7.0,3
3,37.41605,-122.151801,Facebook,social,5299,2004,1500.0,2325.7,232.6,10.0,3
4,53.344104,-6.267494,Facebook,social,5299,2004,1500.0,2325.7,232.6,10.0,3


Creating a GeoJSON column so the DataFrame can be exported to MongoDB

In [11]:
def geojsn(a, b):
    """Build a GeoJSON-style Point dictionary.

    Parameters
    ----------
    a, b
        The two coordinate values, stored in that order.

    Returns
    -------
    dict
        {'type': 'Point', 'coordinates': [a, b]}
    """
    point = dict(type='Point', coordinates=[a, b])
    return point
# Build one Point per row; longitude is passed first, then latitude.
data['geojson'] = data.apply(
    lambda row: geojsn(row['longitude'], row['latitude']),
    axis=1,
)
data.head()


Unnamed: 0,latitude,longitude,name,category_code,number_of_employees,founded_year,last_raised_amount_MnUSD,total_raised_MnUSD,round_average_MnUSD,rounds,"type: 1: startup, 2: small/mid corp, 3:big corp",geojson
0,34.090368,-118.393064,Geni,web,18,2006,5.0,16.5,5.5,3.0,2,"{'type': 'Point', 'coordinates': [-118.393064,..."
1,40.757929,-73.985506,MeetMoi,social,15,2007,2.6,5.6,1.9,3.0,2,"{'type': 'Point', 'coordinates': [-73.985506, ..."
2,37.776805,-122.416924,Twitter,social,1300,2006,400.0,760.2,108.6,7.0,3,"{'type': 'Point', 'coordinates': [-122.4169244..."
3,37.41605,-122.151801,Facebook,social,5299,2004,1500.0,2325.7,232.6,10.0,3,"{'type': 'Point', 'coordinates': [-122.151801,..."
4,53.344104,-6.267494,Facebook,social,5299,2004,1500.0,2325.7,232.6,10.0,3,"{'type': 'Point', 'coordinates': [-6.267494, 5..."


Exporting the merged DataFrame as a .json

In [10]:
# Write the frame to disk as line-delimited JSON: one record (row) per line.
data.to_json(
    'data_v2.json',
    orient='records',
    lines=True,
)