## Python BI Location project

In [1]:
#Importing libraries

import pandas as pd
import re
from pymongo import MongoClient

In [2]:
# Open a MongoDB client with the driver's default connection settings and
# keep handles to the `db_companies` database and its `companies` collection.

client = MongoClient()
db = client['db_companies']
collection_companies = db['companies']

In [3]:
# Some categories available (taken from the first 2000 matching documents)

import itertools  # not needed by this cell any more; kept in case other cells use it
from operator import itemgetter

# `$exists` is documented as taking a boolean, so pass True rather than the
# string 'true'.
categories = list(
    collection_companies.find(
        {'category_code': {'$exists': True}},
        {'_id': 0, 'category_code': 1},
    ).limit(2000)
)

# Group the documents by category in first-seen order.
# itertools.groupby only merges *consecutive* equal keys, so on this unsorted
# result it left each category holding just its last run of documents; a plain
# dict collects every document. Sorting first is not an option because the
# keys mix strings and None.
get_category = itemgetter('category_code')
res = {}
for doc in categories:
    res.setdefault(get_category(doc), []).append(doc)

list(res.keys())

['web',
 'enterprise',
 'software',
 'news',
 'social',
 'network_hosting',
 'games_video',
 'music',
 'mobile',
 'search',
 'advertising',
 'messaging',
 'security',
 'photo_video',
 'finance',
 'hardware',
 'ecommerce',
 'travel',
 'public_relations',
 'other',
 'real_estate',
 'semiconductor',
 'analytics',
 'health',
 'legal',
 'sports',
 'biotech',
 'cleantech',
 'education',
 'consulting',
 'transportation',
 None,
 'hospitality',
 'fashion',
 'nonprofit',
 'nanotech',
 'automotive',
 'design']

In [4]:
# Query companies in the three target categories with fewer than 1000
# employees, a non-empty `offices` array and a non-null office latitude.
# Only name, category_code and offices are returned.

target_categories = ('ecommerce', 'hardware', 'software')

company_filter = {
    '$or': [{'category_code': code} for code in target_categories],
    'number_of_employees': {'$lt': 1000},
    'offices': {'$not': {'$size': 0}},
    'offices.latitude': {'$ne': None},
}
wanted_fields = {'_id': 0, 'name': 1, 'category_code': 1, 'offices': 1}

filtering = list(collection_companies.find(company_filter, wanted_fields))

In [5]:
# Number of company documents returned by the filter query

len(filtering)

1201

In [6]:
# Load the filtered company documents into a DataFrame and preview the first rows

df = pd.DataFrame(data=filtering)
df.head(5)

Unnamed: 0,name,category_code,offices
0,ooma,hardware,"[{'description': '', 'address1': '1840 Embarca..."
1,GoingOn,software,"[{'description': 'GoingOn Networks, Inc.', 'ad..."
2,Bazaarvoice,software,"[{'description': 'Head Office', 'address1': '3..."
3,spigit,software,"[{'description': '', 'address1': '311 Ray Stre..."
4,JumpBox,software,"[{'description': '', 'address1': 'PO Box 15265..."


In [7]:
# Discard every row that has at least one missing value

df = df.dropna(axis=0, how='any')

In [8]:
#Function to get lat and lng

def get_first(data):
    """Summarise a company's offices using its first listed office.

    Parameters
    ----------
    data : mapping or pandas.Series
        Row exposing an ``'offices'`` entry: a list of dicts that are read
        for their ``'latitude'`` and ``'longitude'`` keys.

    Returns
    -------
    dict
        ``totalOffices`` -- number of entries in the office list.
        ``lat`` / ``lng`` -- coordinates of the first office (``None`` when
        absent).
        ``principal`` -- ``{'type': 'Point', 'coordinates': [lng, lat]}``
        when both coordinates are present, otherwise ``None``.
    """
    offices = data['offices']

    # Empty (or None) office list: report zero offices instead of raising
    # IndexError on offices[0].
    if not offices:
        return {'totalOffices': 0, 'lat': None, 'lng': None, 'principal': None}

    first_office = offices[0]
    lat = first_office.get('latitude')
    lng = first_office.get('longitude')

    principal = None
    # Compare against None instead of relying on truthiness: 0.0 is a valid
    # latitude/longitude and must not be treated as missing.
    if lat is not None and lng is not None:
        # Point in GeoJSON form, ready for geo queries; longitude goes first.
        principal = {
            'type': 'Point',
            'coordinates': [lng, lat],
        }

    return {'totalOffices': len(offices),
            'lat': lat,
            'lng': lng,
            'principal': principal}

In [9]:
# Newly structured data: expand each row's office list into the
# totalOffices / lat / lng / principal columns produced by get_first

offices_only = df.loc[:, ['offices']]
New_office = offices_only.apply(get_first, axis=1, result_type='expand')

In [10]:
# Overview: discard rows with any missing value, then preview the result

New_office = New_office.dropna(axis=0, how='any')
New_office.head(5)

Unnamed: 0,totalOffices,lat,lng,principal
0,1,37.451958,-122.116026,"{'type': 'Point', 'coordinates': [-122.116026,..."
1,1,37.782263,-122.392142,"{'type': 'Point', 'coordinates': [-122.392142,..."
2,1,30.407545,-97.717667,"{'type': 'Point', 'coordinates': [-97.717667, ..."
3,3,37.663728,-121.873181,"{'type': 'Point', 'coordinates': [-121.8731805..."
4,1,33.429864,-111.944967,"{'type': 'Point', 'coordinates': [-111.944967,..."


In [11]:
# New data frame: company columns joined side by side with the extracted
# office columns, without the raw `offices` list and without incomplete rows

df = (
    pd.concat([df, New_office], axis=1)
    .drop(columns=['offices'])
    .dropna()
)
df.head()

Unnamed: 0,name,category_code,totalOffices,lat,lng,principal
0,ooma,hardware,1,37.451958,-122.116026,"{'type': 'Point', 'coordinates': [-122.116026,..."
1,GoingOn,software,1,37.782263,-122.392142,"{'type': 'Point', 'coordinates': [-122.392142,..."
2,Bazaarvoice,software,1,30.407545,-97.717667,"{'type': 'Point', 'coordinates': [-97.717667, ..."
3,spigit,software,3,37.663728,-121.873181,"{'type': 'Point', 'coordinates': [-121.8731805..."
4,JumpBox,software,1,33.429864,-111.944967,"{'type': 'Point', 'coordinates': [-111.944967,..."


In [12]:
# Store the cleaned rows as documents in the `New_office` collection of the
# same database

office_documents = df.to_dict(orient='records')
db['New_office'].insert_many(office_documents)

<pymongo.results.InsertManyResult at 0x1fb39fb0248>

In [13]:
# Create a 2dsphere index on the `principal` field of the New_office collection

geo_index_spec = [('principal', '2dsphere')]
db['New_office'].create_index(geo_index_spec)

'principal_2dsphere'