In [1]:
from pymongo import MongoClient
import pandas as pd
from pandas.io.json import json_normalize

In [2]:
# Open a connection to the MongoDB server on localhost:27017 and pick the
# `companies` database that the following cells query.
client = MongoClient(host='localhost', port=27017)
db = client['companies']

In [3]:
# Collect every distinct `category_code` value found in the `companies`
# collection and print them as a plain list.
categories = db['companies'].distinct('category_code')
print(list(categories))

['enterprise', 'software', 'web', 'network_hosting', 'news', 'games_video', 'music', 'mobile', 'social', 'search', 'advertising', 'messaging', 'security', 'photo_video', 'finance', 'ecommerce', 'hardware', 'travel', 'public_relations', 'other', 'real_estate', 'semiconductor', 'analytics', 'health', 'legal', 'sports', 'biotech', 'cleantech', 'education', 'consulting', 'transportation', None, 'hospitality', 'fashion', 'nonprofit', 'nanotech', 'automotive', 'design', 'manufacturing', 'government', 'local', 'medical']


In [4]:
# Query the `companies` collection for companies that
#   * report more than one employee,
#   * have no dead-pool year recorded,
#   * have office latitude/longitude values present, and
#   * belong to one of the four categories of interest,
# then flatten the result into one row per office.

# Keys of a filter document are AND-ed implicitly, so no explicit `$and` is
# needed; `$in` replaces a four-way `$or` on the same field.
company_filter = {
    'number_of_employees': {'$gt': 1},
    'deadpooled_year': {'$eq': None},
    'offices.latitude': {'$exists': True, '$ne': None},
    'offices.longitude': {'$exists': True, '$ne': None},
    'category_code': {'$in': ['games_video', 'software', 'web', 'social']},
}

# Projection: each field is listed exactly once (the original repeated
# 'category_code'). The funding/IPO/acquisition fields are fetched but are not
# read by the normalisation below; they remain available in `company_docs`.
company_fields = {
    '_id': 0,
    'name': 1,
    'category_code': 1,
    'founded_year': 1,
    'number_of_employees': 1,
    'deadpooled_year': 1,
    'ipo': 1,
    'offices.latitude': 1,
    'offices.longitude': 1,
    'offices.zip_code': 1,
    'funding_rounds.raised_amount': 1,
    'funding_rounds.raised_currency_code': 1,
    'funding_rounds.funded_year': 1,
    'funding_rounds.round_code': 1,
    'acquisition.price_amount': 1,
    'acquisition.acquired_year': 1,
    'acquisition.price_currency_code': 1,
}

cursor = db.companies.find(company_filter, company_fields)

# A cursor can only be consumed once, so materialise it before normalising.
company_docs = list(cursor)

# `pd.json_normalize` is the public entry point since pandas 1.0; the
# `pandas.io.json.json_normalize` import path is deprecated.
offices_raw = pd.json_normalize(
    company_docs,
    record_path='offices',
    meta=['name', 'category_code', 'number_of_employees', 'founded_year'],
)

# Keep only office rows with no missing value in any column.
data = offices_raw.dropna()
data.head()


Unnamed: 0,latitude,longitude,zip_code,name,category_code,number_of_employees,founded_year
0,34.090368,-118.393064,90069.0,Geni,web,18,2006
1,40.757929,-73.985506,,MeetMoi,social,15,2007
2,37.776805,-122.416924,94103.0,Twitter,social,1300,2006
3,37.41605,-122.151801,94025.0,Facebook,social,5299,2004
4,53.344104,-6.267494,,Facebook,social,5299,2004


In [5]:
# Classifying the companies by their founding year and their number of employees

def get_company_type(data_base):
    """Return the company-type code for one record.

    `data_base` is any mapping-like row exposing 'number_of_employees' and
    (when the head count is at most 50) 'founded_year'.

    Returns:
        1 -- at most 50 employees and founded in 2008 or earlier
        3 -- more than 300 employees
        2 -- everything else

    NOTE(review): code 1 is labelled "startup" elsewhere in this notebook, yet
    it requires founded_year <= 2008 -- confirm the comparison direction is
    the intended one.
    """
    headcount = data_base['number_of_employees']
    if headcount <= 50:
        # 'founded_year' is only consulted for small companies.
        if data_base['founded_year'] <= 2008:
            return 1
        return 2
    return 3 if headcount > 300 else 2

def get_class(dataframe):
    """Add the company-type column to `dataframe` and return it.

    Every row is passed to `get_company_type`; the resulting code is stored in
    the column 'type: 1: startup, 2: small/mid corp, 3:big corp'.

    The column is added to the frame that was passed in (the previous version
    ignored its argument and read/wrote the notebook-level `data` instead), and
    that same frame object is returned.
    """
    dataframe['type: 1: startup, 2: small/mid corp, 3:big corp'] = dataframe.apply(
        get_company_type, axis=1
    )
    return dataframe

# Add the company-type column to the office table, then preview the first rows.
data = get_class(data)
data.head()

Unnamed: 0,latitude,longitude,zip_code,name,category_code,number_of_employees,founded_year,"type: 1: startup, 2: small/mid corp, 3:big corp"
0,34.090368,-118.393064,90069.0,Geni,web,18,2006,1
1,40.757929,-73.985506,,MeetMoi,social,15,2007,1
2,37.776805,-122.416924,94103.0,Twitter,social,1300,2006,3
3,37.41605,-122.151801,94025.0,Facebook,social,5299,2004,3
4,53.344104,-6.267494,,Facebook,social,5299,2004,3


In [6]:
# Creating a GeoJSON column so that the dataframe can be exported to MongoDB

def geojsn(a,b):
    """Return a GeoJSON-style Point mapping whose coordinates are [a, b]."""
    point = {'type': 'Point'}
    point['coordinates'] = [a, b]
    return point
# GeoJSON positions are ordered [longitude, latitude], hence the argument order.
# Pairing the two columns directly avoids materialising a Series for every row
# (as DataFrame.apply(axis=1) does) and is also well-defined when `data` has
# no rows.
data['geojson'] = [
    geojsn(lon, lat)
    for lon, lat in zip(data['longitude'], data['latitude'])
]
data.head()


Unnamed: 0,latitude,longitude,zip_code,name,category_code,number_of_employees,founded_year,"type: 1: startup, 2: small/mid corp, 3:big corp",geojson
0,34.090368,-118.393064,90069.0,Geni,web,18,2006,1,"{'type': 'Point', 'coordinates': [-118.393064,..."
1,40.757929,-73.985506,,MeetMoi,social,15,2007,1,"{'type': 'Point', 'coordinates': [-73.985506, ..."
2,37.776805,-122.416924,94103.0,Twitter,social,1300,2006,3,"{'type': 'Point', 'coordinates': [-122.4169244..."
3,37.41605,-122.151801,94025.0,Facebook,social,5299,2004,3,"{'type': 'Point', 'coordinates': [-122.151801,..."
4,53.344104,-6.267494,,Facebook,social,5299,2004,3,"{'type': 'Point', 'coordinates': [-6.267494, 5..."


In [9]:
# Write the table to data_2.json as line-delimited JSON: one record per line.
data.to_json(
    path_or_buf='data_2.json',
    orient='records',
    lines=True,
)