In [59]:
from pymongo import MongoClient
import pandas as pd
import requests
import src.func as fc


## Dataset set-up

Initial query to filter by the most important parameter: companies created in the 2000s; that have not closed and with coordinates of their offices.

In [28]:
client = MongoClient("mongodb://localhost/companies")
db = client.get_database()

In [29]:
query = {"$and": [{
    "founded_year": {"$gte":2000}}, {
    "founded_year": {"$lte": 2010}}, {
    "deadpooled_year": None}, {
    "offices": {
        "$not": {
            "$size": 0
        }
    }
}]}

mydb = list(db["companies"].find(query,{"name":1,
                                        "founded_year":1,
                                        "offices":1, 
                                        "category_code": 1,
                                        "number_of_employees":1, 
                                        "_id":0}))

In [30]:
df = pd.DataFrame(mydb)

df.head()

Unnamed: 0,name,category_code,number_of_employees,founded_year,offices
0,Digg,news,60.0,2004,"[{'description': None, 'address1': '135 Missis..."
1,Geni,web,18.0,2006,"[{'description': 'Headquarters', 'address1': '..."
2,StumbleUpon,web,,2002,"[{'description': '', 'address1': '', 'address2..."
3,Gizmoz,web,,2003,"[{'description': None, 'address1': None, 'addr..."
4,Facebook,social,5299.0,2004,"[{'description': 'Headquarters', 'address1': '..."


Processing to obtain the "location" column of the "point" type to generate GeoIndex in MongoDB.

In [31]:
df = df.explode("offices")
dfOfficeData = df[["offices"]].apply(lambda r: r.offices, result_type="expand", axis=1)


In [32]:
cleanData = pd.concat([df,dfOfficeData], axis=1)
cleanData.head()

Unnamed: 0,name,category_code,number_of_employees,founded_year,offices,description,address1,address2,zip_code,city,state_code,country_code,latitude,longitude
0,Digg,news,60.0,2004,"{'description': None, 'address1': '135 Mississ...",,135 Mississippi St,,94107.0,San Francisco,CA,USA,37.764726,-122.394523
1,Geni,web,18.0,2006,"{'description': 'Headquarters', 'address1': '9...",Headquarters,9229 W. Sunset Blvd.,,90069.0,West Hollywood,CA,USA,34.090368,-118.393064
2,StumbleUpon,web,,2002,"{'description': '', 'address1': '', 'address2'...",,,,,San Francisco,CA,USA,37.775196,-122.419204
2,StumbleUpon,web,,2002,"{'description': '', 'address1': '', 'address2'...",,,,,New York City,NY,USA,,
3,Gizmoz,web,,2003,"{'description': None, 'address1': None, 'addre...",,,,,Menlo Park,CA,USA,37.48413,-122.169472


The records with NaN values in their coordinates and the columns that are not interesting me are removed.

In [33]:
cleanData = cleanData.copy().dropna(subset=["latitude", "longitude"]) 
cleanData = cleanData.drop(columns=["offices", "address1", "address2"]) #¿Por que hay que eliminarla en este paso y no después?

In [36]:
def asGeoJSON(lat,lng):
    try:
        lat = float(lat)
        lng = float(lng)
        if not math.isnan(lat) and not math.isnan(lng):
            return {
                "type":"Point",
                "coordinates":[lng,lat]
            }
    except Exception:
        print("Invalid data")
        return None
        
cleanData["location"] = cleanData[["latitude","longitude"]].apply(lambda x:asGeoJSON(x.latitude,x.longitude), axis=1)
cleanData[["latitude","longitude","location"]]

Unnamed: 0,latitude,longitude,location
0,37.764726,-122.394523,"{'type': 'Point', 'coordinates': [-122.394523,..."
1,34.090368,-118.393064,"{'type': 'Point', 'coordinates': [-118.393064,..."
2,37.775196,-122.419204,"{'type': 'Point', 'coordinates': [-122.419204,..."
3,37.484130,-122.169472,"{'type': 'Point', 'coordinates': [-122.169472,..."
4,37.416050,-122.151801,"{'type': 'Point', 'coordinates': [-122.151801,..."
...,...,...,...
8352,29.848779,-95.504545,"{'type': 'Point', 'coordinates': [-95.5045451,..."
8353,37.701940,-121.936698,"{'type': 'Point', 'coordinates': [-121.9366984..."
8354,47.088219,8.437163,"{'type': 'Point', 'coordinates': [8.4371634, 4..."
8356,40.793024,-74.323554,"{'type': 'Point', 'coordinates': [-74.3235539,..."


In [37]:
cleanData.head()

Unnamed: 0,name,category_code,number_of_employees,founded_year,description,zip_code,city,state_code,country_code,latitude,longitude,location
0,Digg,news,60.0,2004,,94107.0,San Francisco,CA,USA,37.764726,-122.394523,"{'type': 'Point', 'coordinates': [-122.394523,..."
1,Geni,web,18.0,2006,Headquarters,90069.0,West Hollywood,CA,USA,34.090368,-118.393064,"{'type': 'Point', 'coordinates': [-118.393064,..."
2,StumbleUpon,web,,2002,,,San Francisco,CA,USA,37.775196,-122.419204,"{'type': 'Point', 'coordinates': [-122.419204,..."
3,Gizmoz,web,,2003,,,Menlo Park,CA,USA,37.48413,-122.169472,"{'type': 'Point', 'coordinates': [-122.169472,..."
4,Facebook,social,5299.0,2004,Headquarters,94025.0,Menlo Park,CA,USA,37.41605,-122.151801,"{'type': 'Point', 'coordinates': [-122.151801,..."


In [38]:
dfCoordenates = cleanData.copy()
cleanData = cleanData.drop(columns=["latitude", "longitude"])
cleanData.to_json("output/cleaned_offices.json", orient="records")

## Where in Europe?

In [39]:
import geopandas as gpd
from geopandas import GeoDataFrame
from shapely.geometry import Point

from cartoframes.viz import Map, Layer
from cartoframes.viz.helpers import size_continuous_layer
from cartoframes.viz.widgets import histogram_widget

In [51]:
def geoDataFrame(df):
    geometry = [Point(xy) for xy in zip(df.longitude, df.latitude)]
    crs = {"init": "epsg:4326"}
    gdf = GeoDataFrame(df, crs=crs, geometry=geometry)
    return gdf

geodf = geoDataFrame(dfCoordenates)

type(geodf)

geopandas.geodataframe.GeoDataFrame

In [52]:
Map(Layer(geodf), viewport={'zoom': 3.26, 'lat': 50.556628, 'lng': 10.135056})

In [56]:
londondf = geodf[geodf["city"] == "London"]
berlindf = geodf[geodf["city"] == "Berlin"]
amsterdamdf = geodf[geodf["city"] == "Amsterdam"]

len(londondf), len(berlindf), len(amsterdamdf)

(213, 62, 41)

I want to do the analysis in Europe because of their higher standard of living. Although not anymore, London represents the European city and has a good pool of companies compared to the second in the ranking, Berlin.

In [57]:
display(londondf.head())

Unnamed: 0,name,category_code,number_of_employees,founded_year,description,zip_code,city,state_code,country_code,latitude,longitude,location,geometry
61,KickApps,enterprise,72.0,2004,UK Office,W1J 9HF,London,,GBR,51.509473,-0.135624,"{'type': 'Point', 'coordinates': [-0.1356235, ...",POINT (-0.13562 51.50947)
117,spigit,software,120.0,2006,,W1W 8RL,London,,GBR,51.517038,-0.139476,"{'type': 'Point', 'coordinates': [-0.1394759, ...",POINT (-0.13948 51.51704)
198,Curverider,network_hosting,5.0,2005,,W1J 7BU,London,,GBR,51.785428,-1.197534,"{'type': 'Point', 'coordinates': [-1.197534, 5...",POINT (-1.19753 51.78543)
222,Zopa,finance,,2005,,W1T 1QD,London,,GBR,51.517904,-0.139947,"{'type': 'Point', 'coordinates': [-0.139947, 5...",POINT (-0.13995 51.51790)
237,Covestor,finance,,2005,London,W1W 6AN,London,,GBR,51.51942,-0.143369,"{'type': 'Point', 'coordinates': [-0.1433689, ...",POINT (-0.14337 51.51942)


## Google API

In [66]:
import os
import json
from dotenv import load_dotenv
load_dotenv()

Python-dotenv could not parse statement starting at line 2


True

How many Starbucks are near every company?

In [72]:
def starbucks250m(val):
    API_key = os.getenv('API_KEY')
    lat = val["coordinates"][1]
    lon = val["coordinates"][0]
    base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json?"
    endpoint = "query=starbucks&location={0},{1}&radius=250&key={2}".format(lat, lon, API_key)
    res = requests.get(base_url+endpoint).json()
    number = len(res["results"])
    return number

In [74]:
londondf['Starbucks'] = londondf.apply(lambda x: starbucks250m(x["location"]), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Design agencies in London

In [76]:
def design():
    API_key = os.getenv('API_KEY')
    base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json?"
    endpoint = "query=design+agency+in+london&key={}".format(API_key)
    res = requests.get(base_url+endpoint).json()
    return res

In [82]:
# The whole JSON document
design = design()
with open('output/design.json', 'w', encoding='utf-8') as f:
    json.dump(design, f, ensure_ascii=False, indent=4)

In [83]:
# The "results" array --> My collection

design2 = design["results"]
with open('output/design2.json', 'w', encoding='utf-8') as f:
    json.dump(design2, f, ensure_ascii=False, indent=4)

In [84]:
client = MongoClient("mongodb://localhost/design2")
db = client.get_database()

In [85]:
mydb = list(db["design2"].find(projection = {"name":1,
                                        "geometry":1,
                                        }))

In [86]:
df = pd.DataFrame(mydb)

df.head()

Unnamed: 0,_id,geometry,name
0,5e4251eabc5847a39c59a485,"{'location': {'lat': 51.5238508, 'lng': -0.076...",Evolve Branding and Creative Agency in London
1,5e4251eabc5847a39c59a486,"{'location': {'lat': 51.52266119999999, 'lng':...",Brand Design in London
2,5e4251eabc5847a39c59a487,"{'location': {'lat': 51.5041254, 'lng': -0.079...",Top Branding and Design Agency in London- Spur...
3,5e4251eabc5847a39c59a488,"{'location': {'lat': 51.4790202, 'lng': -0.199...",Consultants in Design
4,5e4251eabc5847a39c59a489,"{'location': {'lat': 51.5372076, 'lng': -6.879...",London Design Agency


In [88]:
dfGeometry = df[["geometry"]].apply(lambda r: r.geometry, result_type="expand", axis=1)

In [89]:
dfGeometry

Unnamed: 0,location,viewport
0,"{'lat': 51.5238508, 'lng': -0.07681099999999999}","{'northeast': {'lat': 51.52510042989272, 'lng'..."
1,"{'lat': 51.52266119999999, 'lng': -0.1031967}","{'northeast': {'lat': 51.52396932989272, 'lng'..."
2,"{'lat': 51.5041254, 'lng': -0.07981189999999999}","{'northeast': {'lat': 51.50541327989273, 'lng'..."
3,"{'lat': 51.4790202, 'lng': -0.1995988}","{'northeast': {'lat': 51.48040902989272, 'lng'..."
4,"{'lat': 51.5372076, 'lng': -6.879999999999999e...","{'northeast': {'lat': 51.53851732989272, 'lng'..."
5,"{'lat': 51.5266198, 'lng': -0.0800414}","{'northeast': {'lat': 51.52796302989272, 'lng'..."
6,"{'lat': 51.5464922, 'lng': -0.0511709}","{'northeast': {'lat': 51.54785057989272, 'lng'..."
7,"{'lat': 51.5273195, 'lng': -0.0888947}","{'northeast': {'lat': 51.52873597989272, 'lng'..."
8,"{'lat': 51.5147519, 'lng': -0.0722078}","{'northeast': {'lat': 51.51618442989272, 'lng'..."
9,"{'lat': 51.5267875, 'lng': -0.0965553}","{'northeast': {'lat': 51.52812787989271, 'lng'..."


In [90]:
cleanData = pd.concat([df,dfGeometry], axis=1)
cleanData.head()

Unnamed: 0,_id,geometry,name,location,viewport
0,5e4251eabc5847a39c59a485,"{'location': {'lat': 51.5238508, 'lng': -0.076...",Evolve Branding and Creative Agency in London,"{'lat': 51.5238508, 'lng': -0.07681099999999999}","{'northeast': {'lat': 51.52510042989272, 'lng'..."
1,5e4251eabc5847a39c59a486,"{'location': {'lat': 51.52266119999999, 'lng':...",Brand Design in London,"{'lat': 51.52266119999999, 'lng': -0.1031967}","{'northeast': {'lat': 51.52396932989272, 'lng'..."
2,5e4251eabc5847a39c59a487,"{'location': {'lat': 51.5041254, 'lng': -0.079...",Top Branding and Design Agency in London- Spur...,"{'lat': 51.5041254, 'lng': -0.07981189999999999}","{'northeast': {'lat': 51.50541327989273, 'lng'..."
3,5e4251eabc5847a39c59a488,"{'location': {'lat': 51.4790202, 'lng': -0.199...",Consultants in Design,"{'lat': 51.4790202, 'lng': -0.1995988}","{'northeast': {'lat': 51.48040902989272, 'lng'..."
4,5e4251eabc5847a39c59a489,"{'location': {'lat': 51.5372076, 'lng': -6.879...",London Design Agency,"{'lat': 51.5372076, 'lng': -6.879999999999999e...","{'northeast': {'lat': 51.53851732989272, 'lng'..."


In [91]:
cleanData = cleanData.drop(columns=["geometry", "_id",  "viewport"])

Unnamed: 0,name,location
0,Evolve Branding and Creative Agency in London,"{'lat': 51.5238508, 'lng': -0.07681099999999999}"
1,Brand Design in London,"{'lat': 51.52266119999999, 'lng': -0.1031967}"
2,Top Branding and Design Agency in London- Spur...,"{'lat': 51.5041254, 'lng': -0.07981189999999999}"
3,Consultants in Design,"{'lat': 51.4790202, 'lng': -0.1995988}"
4,London Design Agency,"{'lat': 51.5372076, 'lng': -6.879999999999999e..."
5,Klinical - Design Agency London,"{'lat': 51.5266198, 'lng': -0.0800414}"
6,Spinach Branding Agency,"{'lat': 51.5464922, 'lng': -0.0511709}"
7,ikon | Boutique Branding Agency in London,"{'lat': 51.5273195, 'lng': -0.0888947}"
8,London Creative Designs,"{'lat': 51.5147519, 'lng': -0.0722078}"
9,Fabrik Brands,"{'lat': 51.5267875, 'lng': -0.0965553}"
