In [1]:
from pymongo import MongoClient
import pandas as pd
from pandas.io.json import json_normalize

In [2]:
# Connect to the local MongoDB server and grab the `companies` collection
# that lives inside the database of the same name.
client = MongoClient(host = 'localhost', port = 27017)
db = client.companies
collection = db['companies']

In [3]:
# Company must belong to one of the four categories of interest.
category_filter = {'$or': [
    {'category_code': 'web'},
    {'category_code': 'software'},
    {'category_code': 'games_video'},
    {'category_code': 'social'}]}

# Keep only small (<= 50 employees) or big (>= 250 employees) companies;
# everything in between is deliberately left out by this filter.
size_filter = {'$or': [
    {'number_of_employees': {'$lte': 50}},
    {'number_of_employees': {'$gte': 250}}]}

# All clauses must hold at once: category, size, non-null office location
# fields, a non-null name, founded in 2005 or later, and a non-empty
# `offices` array.
query = {'$and': [
    category_filter,
    size_filter,
    {'offices.latitude': {'$ne': None}},
    {'offices.longitude': {'$ne': None}},
    {'offices.city': {'$ne': None}},
    {'offices.country_code': {'$ne': None}},
    {'name': {'$ne': None}},
    {'founded_year': {'$gte': 2005}},
    {'offices': {'$ne': []}}]}

# Return only the fields used downstream and drop Mongo's `_id`.
projection = {
    'name': 1,
    'category_code': 1,
    'number_of_employees': 1,
    'founded_year': 1,
    'offices.country_code': 1,
    'offices.city': 1,
    'offices.latitude': 1,
    'offices.longitude': 1,
    '_id': 0}

companies = collection.find(query, projection)

# Printing shows only the cursor object's repr, not the matching documents.
print(companies)

<pymongo.cursor.Cursor object at 0x7f6296346240>


In [4]:
# Flatten the query results into one row per office: every entry of a
# company's `offices` list becomes a row, and the company-level fields listed
# in `meta_fields` are repeated on each of those rows.
# NOTE(review): importing json_normalize from pandas.io.json is deprecated in
# newer pandas releases in favour of pd.json_normalize - confirm the installed
# pandas version before upgrading.
meta_fields = ['name', 'category_code', 'number_of_employees', 'founded_year']
data = json_normalize(data = companies,
                      record_path = 'offices',
                      meta = meta_fields)
data.head()

Unnamed: 0,city,country_code,latitude,longitude,name,category_code,number_of_employees,founded_year
0,Seattle,USA,47.603122,-122.333253,Wetpaint,web,47,2005
1,New York,USA,40.723731,-73.996431,Wetpaint,web,47,2005
2,Pleasanton,USA,37.692934,-121.904945,Zoho,software,1600,2005
3,West Hollywood,USA,34.090368,-118.393064,Geni,web,18,2006
4,New York City,USA,40.757929,-73.985506,MeetMoi,social,15,2007


In [5]:
def get_type(number, startup_max_employees = 50):
    """Classify a company as a startup or a large company by headcount.

    Parameters
    ----------
    number :
        Number of employees; any value that supports ``<=`` against the
        threshold.
    startup_max_employees :
        Inclusive upper bound for the 'startup' label. Defaults to 50, the
        value that was previously hard-coded, so existing one-argument
        calls behave exactly as before.

    Returns
    -------
    str
        'startup' when ``number <= startup_max_employees``, otherwise
        'large company'.

    Notes
    -----
    NaN compares False against any threshold, so a NaN headcount is
    labelled 'large company'.
    """
    if number <= startup_max_employees:
        return 'startup'
    return 'large company'

In [6]:
# Label every office row with its company's size class, derived from the
# employee count via get_type.
employee_counts = data['number_of_employees']
data['type'] = employee_counts.apply(get_type)
data.head()

Unnamed: 0,city,country_code,latitude,longitude,name,category_code,number_of_employees,founded_year,type
0,Seattle,USA,47.603122,-122.333253,Wetpaint,web,47,2005,startup
1,New York,USA,40.723731,-73.996431,Wetpaint,web,47,2005,startup
2,Pleasanton,USA,37.692934,-121.904945,Zoho,software,1600,2005,large company
3,West Hollywood,USA,34.090368,-118.393064,Geni,web,18,2006,startup
4,New York City,USA,40.757929,-73.985506,MeetMoi,social,15,2007,startup


In [7]:
def get_geo(a, b):
    """Wrap two coordinate values in a GeoJSON-style Point dictionary.

    `a` is stored first and `b` second in the 'coordinates' list; the
    notebook calls this with longitude as `a` and latitude as `b`.

    Returns a dict with the keys 'type' (always 'Point') and 'coordinates'.
    """
    coordinates = [a, b]
    return dict(type = 'Point', coordinates = coordinates)

In [8]:
# Build one GeoJSON-style Point per office row, longitude first and latitude
# second. Zipping the two columns avoids the per-row Series that
# DataFrame.apply(axis = 1) constructs, and it also works when `data` has no
# rows: the list is then simply empty, whereas apply on an empty frame hands
# back a DataFrame that cannot be assigned to a single column.
data['geopoint'] = [
    get_geo(longitude, latitude)
    for longitude, latitude in zip(data['longitude'], data['latitude'])
]

In [9]:
# Preview the first rows of `data`, including the 'geopoint' column.
data.head()

Unnamed: 0,city,country_code,latitude,longitude,name,category_code,number_of_employees,founded_year,type,geopoint
0,Seattle,USA,47.603122,-122.333253,Wetpaint,web,47,2005,startup,"{'type': 'Point', 'coordinates': [-122.333253,..."
1,New York,USA,40.723731,-73.996431,Wetpaint,web,47,2005,startup,"{'type': 'Point', 'coordinates': [-73.9964312,..."
2,Pleasanton,USA,37.692934,-121.904945,Zoho,software,1600,2005,large company,"{'type': 'Point', 'coordinates': [-121.904945,..."
3,West Hollywood,USA,34.090368,-118.393064,Geni,web,18,2006,startup,"{'type': 'Point', 'coordinates': [-118.393064,..."
4,New York City,USA,40.757929,-73.985506,MeetMoi,social,15,2007,startup,"{'type': 'Point', 'coordinates': [-73.985506, ..."


In [10]:
# Export the cleaned table as line-delimited JSON: one record (row) per line.
output_path = 'data_clean.json'
data.to_json(path_or_buf = output_path, orient = 'records', lines = True)