# Retrieving Info from Companies database

In [1]:
from pymongo import MongoClient
import pymongo
import pandas as pd
import numpy as np
import geojson,json,os
from geojson import Point, Feature, FeatureCollection, dump
import geocoder

TODO:
1. Import the companies.json dataset into a mongodb collection
2. Using pymongo connect to the database and:
    1. From all the companies get first office, number of people and company name
    2. Convert it to GeoJSON format
    3. Insert it back to another mongodb collection
3. In another notebook or python file
    1. Query the newly created collection geospatially using $near operator and for each company calculate the nearest companies with radius R meters. Use R=100, R=1000 and R=10000
    2. For each company sum the results of the nearest companies and save all results in a JSON file containing an array with the following format: 
[ 
    {
        id: XXXX
        name: <company_name>, 
        office_location:<geojson_point_office_location>,
        nearest_workers:{ 
            “r=100”:9999,
            “r=1000”:9999,
            “r=10000”:9999,
        }
    },
    {…},
    {...}
]

4. Using the created json file, add it to tableau and create 3 sheets using geospartial plot.
5. Create a tableau story and present the results you’ve found

In [2]:
def geoQueryNear(geoJsonField,lng,lat,distance=10000):
    return {
        geoJsonField: {
            "$near": {
                "$geometry": {
                    "type": "Point", 
                    "coordinates": [lng, lat]
                },
                "$maxDistance":distance
            }
        }
    }

In [3]:
client = MongoClient("mongodb://admin:admin@localhost:32773")
db = client.mybbdd

___

# First step
Create the JSON geo dataset for the companies. Avoid this step if the dataset is already finished. The collection is named as `docs` in my `mybbdd` database

In [None]:
q1=db.docs.find({"$and":[{"number_of_employees":{"$ne":None}},{"offices":{"$ne":[]}}]}
                ,projection={ "name": 1 ,"number_of_employees":1, "offices": 1,"_id":0})
df1=list(q1)
print('Number of companies: ',len(df1))

In [None]:
listed_companies=[]
for item in df1:
    if item['offices'][0]['longitude']!=None and item['offices'][0]['latitude']!=None and item['number_of_employees']!=0:
        listed_companies.append([item['name'],
                                 item['number_of_employees'],
                                 item['offices'][0]['latitude'],
                                 item['offices'][0]['longitude']])

In [None]:
df_companies=pd.DataFrame(listed_companies)
df_companies.columns=['name','employees','latitude','longitude']
print('Number of companies: ',len(df_companies))
display(df_companies.head())

In [None]:
def get_vals_for_geojson(df):
    list_points=[]
    for i in range(len(df)):
        point=Point((df['latitude'].iloc[i],df['longitude'].iloc[i]))
        at={"name":df['name'].iloc[i],
            "employees":df['employees'].iloc[i],
            "location":{"type":point['type'],
                       "coordinates":point['coordinates'][::-1]}}
        list_points.append(at)
    return list_points

In [None]:
final=get_vals_for_geojson(df_companies)

In [None]:
df_final_A=pd.DataFrame(final)
df_final_A=df_final_A[['name','employees','location']]
display(df_final_A.head())

### Saving the JSON file for companies and its location

In [None]:
if os.path.exists('companies_1_final.json'):
    os.remove('companies_1_final.json')

with open('companies_1_final.json','a') as f:
    for i,e in df_final_A.iterrows():
        f.write(e.to_json()+'\n')
f.close()

___

# Second Step
Here we are going to make some queries to the dataset we have created before to obtain the number of employees around each company for 100, 1000 and 10000 meters radius. The collection is named as `c1` in my `mybbdd` database

In [4]:
# Query to retrieve the well structured data
q2a=db.c1.find(projection={'_id':0,'name':1,'employees':1,'location':1})
lsq2a=list(q2a)
display(lsq2a[0])
print('Just checking everything is OK (5520 registers)')
print('Number of registers: ',len(lsq2a))

{'name': 'AdventNet',
 'employees': 600,
 'location': {'type': 'Point', 'coordinates': [-121.904945, 37.692934]}}

Just checking everything is OK (5520 registers)
Number of registers:  5520


### We only want USA companies so we have to filter them by the next coordinates
You can see more [in this link](http://en.wikipedia.org/wiki/Extreme_points_of_the_United_States#Westernmost)

In [5]:
top = 49.3457868 # north lat
left = -124.7844079 # west long
right = -66.9513812 # east long
bottom = 24.7433195 # south lat

### Here comes the worst part. The calculation for each company of how many employees are in a radius of 100, 1000 and 10000 meters. This could take many time

In [6]:
%%time

q2a=db.c1.find(projection={'_id':0,'name':1,'employees':1,'location':1})
lsq2a=list(q2a)
lsq2a_loop=lsq2a.copy()

lsq2a_final=[]
for company in lsq2a_loop:
    
    # Calculating employees by distance
    
    indiv=company
    cemp=indiv['employees']
    cord=indiv['location']['coordinates']
    
    distances=[100,1000,10000]
    for item in distances:

        q2b=db.c1.find(geoQueryNear('location',cord[0],cord[1], item),
                      projection={'_id':0,'name':1,'employees':1,'location':1})

        near_comp=list(q2b)
        num_comp=len(near_comp)
        for elem in near_comp:
            cemp+=elem['employees']

        indiv.update({str(item)+"m":cemp})
    indiv.update({'longitude':cord[0]})
    indiv.update({'latitude':cord[1]})
    
    # Checking if the are in the USA
    if cord[0]<=right and cord[0]>=left and cord[1]<=top and cord[1]>=bottom:
        lsq2a_final.append(indiv)
    

CPU times: user 7.54 s, sys: 469 ms, total: 8.01 s
Wall time: 37.1 s


In [7]:
# Storaging data as DataFrame

lsq2a_final_df=pd.DataFrame(lsq2a_final)
lsq2a_final_df=lsq2a_final_df.drop(['employees','location'],axis=1)
lsq2a_final_df=lsq2a_final_df[lsq2a_final_df.columns[::-1]]
lsq2a_final_df=lsq2a_final_df.drop_duplicates(subset='name')
print('Number of registers: ',len(lsq2a_final_df))
display(lsq2a_final_df.head())


Number of registers:  3731


Unnamed: 0,name,longitude,latitude,100m,1000m,10000m
0,AdventNet,-121.904945,37.692934,2800,5532,9655
1,Wetpaint,-122.333253,47.603122,399,2121,7025
2,Zoho,-121.904945,37.692934,3800,6532,10655
3,Geni,-118.393064,34.090368,36,54,27773
4,Digg,-122.394523,37.764726,120,883,23148


In [13]:
indexes=[]
for item in distances:
    argument=str(item)+'m'
    lsq2a_final_df_A=lsq2a_final_df.sort_values(argument,ascending=False)[:10]
    indexes.append(lsq2a_final_df_A.index)
indexes=np.unique(indexes).tolist()

lsq2a_final_df=lsq2a_final_df.loc[indexes]

display(lsq2a_final_df)

Unnamed: 0,name,longitude,latitude,100m,1000m,10000m
10,eBay,-121.930035,37.295005,30000,345470,760942
111,Sony,-95.712891,37.09024,371666,562832,753998
119,PayPal,-121.927696,37.294465,600000,915470,1330867
161,TechTarget,-95.712891,37.09024,191766,382932,574098
164,Sportsline,-95.712891,37.09024,191430,382596,573762
166,Espotting,-95.712891,37.09024,191416,382582,573748
211,ValueClick,-95.712891,37.09024,192278,383444,574610
348,Apple,-122.028961,37.330534,160000,240040,821473
360,NetApp,-95.712891,37.09024,199166,390332,581498
650,IBM,-73.723999,41.109534,776000,1164000,1552059


In [14]:
# Saving data as JSON file

filename='companies_2_final.json'
print(filename)
if os.path.exists(filename):
    os.remove(filename)

with open(filename,'a') as f:
    for i,e in lsq2a_final_df.iterrows():
        f.write(e.to_json()+'\n')
f.close()

companies_2_final.json


### DONE

## NOW YOU CAN SEE MY WORK AT MY TABLEAU PUBLIC PROFILE
[MY TABLEAU STORY FOR THIS LAB](https://public.tableau.com/views/Geo-Mongo-M2/Geo-MongoUSACompaniesStory?:embed=y&:display_count=yes&publish=yes)

___