# Getting Openings from Indeed.com to Map

These are openings for full-time data science jobs from March 1 from Indeed.com. We run them through Google's geolocating API to get the data for a heat map

In [1]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib as plt
import gmaps

import requests
import json

# Google developer API key
from config import gkey

In [2]:
# load in CSV
openings_df = pd.read_csv("resources/input/entry_level_openings_indeed.csv")
openings_df.head()
openings_df['location'] = openings_df['job_title_city'] + ", "+ openings_df['job_title_state']
openings_df.head()

Unnamed: 0,job_title_name,job_title_company,job_title_city,job_title_state,location
0,Intermediate Data Scientist,Citi,Irving,TX,"Irving, TX"
1,Data Science Intern,The Hartford,Charlotte,NC,"Charlotte, NC"
2,"Intern, Data Science",Vertex Pharmaceuticals,Boston,MA,"Boston, MA"
3,"Data Science QA Engineer - Conshohocken, PA",RS Energy Group,Conshohocken,PA,"Conshohocken, PA"
4,Data Science Summer Intern,Commonwealth Care Alliance,Boston,MA,"Boston, MA"


In [3]:
#sort by city
by_city_df = openings_df.sort_values(by=['location'])
by_city_df.head()
# remove "united states"
by_city_df = by_city_df[by_city_df.job_title_city != "United States"]
by_city_df.head()

Unnamed: 0,job_title_name,job_title_company,job_title_city,job_title_state,location
950,Computer Science and Data Science,"BGI, LLC",Akron,OH,"Akron, OH"
1110,Data Scientist,VoiceGlance,Albany,NY,"Albany, NY"
511,Data Research & Entry Analyst,First Light Fiber,Albany,NY,"Albany, NY"
773,Summer Internship/Data Science,Systems Planning and Analysis,Alexandria,VA,"Alexandria, VA"
848,Summer Internship/Data Science,Systems Planning and Analysis,Alexandria,VA,"Alexandria, VA"


In [4]:
# get top states
state_counts = by_city_df['job_title_state'].value_counts()
state_counts.head(10)

#make it a df

state_counts_df = state_counts.rename_axis('state').reset_index(name='openings')

state_counts_df.head()

#send it to .CSV

output_states = state_counts_df.to_csv('state_openings.csv', index = None, header=True)

In [5]:
# get top cities
city_counts = by_city_df['location'].value_counts()
city_counts.head(10)


New York, NY         169
Boston, MA           158
Charlotte, NC        106
Philadelphia, PA     102
Irving, TX           101
Yakima, WA            99
San Francisco, CA     60
Arlington, VA         48
Chicago, IL           31
Conshohocken, PA      23
Name: location, dtype: int64

In [6]:
# make a df from the counts
city_counts_df = city_counts.rename_axis('city').reset_index(name='openings')

city_counts_df.head()


Unnamed: 0,city,openings
0,"New York, NY",169
1,"Boston, MA",158
2,"Charlotte, NC",106
3,"Philadelphia, PA",102
4,"Irving, TX",101


In [7]:
city_lat = []
city_long = []
city_state = []

#loop through df and get array of lat and long

for index, row in city_counts_df.iterrows():
    target_city = row["city"]
    target_url = ('https://maps.googleapis.com/maps/api/geocode/json?'
    'address={0}&key={1}').format(target_city, gkey)
    
    # Run a request to endpoint and convert result to json
    geo_data = requests.get(target_url).json()
    
    #append the lat and long to array
    city_lat.append(geo_data["results"][0]["geometry"]["location"]["lat"])
    city_long.append(geo_data["results"][0]["geometry"]["location"]["lng"])
    city_state.append(geo_data["results"][0]["address_components"][2]["long_name"])
    
   

In [26]:
len(city_long)

283

In [27]:
len(city_lat)

283

In [28]:
len(city_counts_df)

283

In [29]:
city_counts_df['lat'] = city_lat
city_counts_df['long'] = city_long
city_counts_df['g_metro'] = city_state
city_counts_df['state_1'] = city_counts_df.city.str[-2]+city_counts_df.city.str[-1]

In [30]:
city_counts_df.head()

Unnamed: 0,city,openings,lat,long,g_metro,state_1
0,"New York, NY",169,40.712775,-74.005973,United States,NY
1,"Boston, MA",158,42.360082,-71.05888,Massachusetts,MA
2,"Charlotte, NC",106,35.227087,-80.843127,Mecklenburg County,NC
3,"Philadelphia, PA",102,39.952584,-75.165222,Pennsylvania,PA
4,"Irving, TX",101,32.814018,-96.948894,Texas,TX


In [31]:
city_counts_df.columns

Index(['city', 'openings', 'lat', 'long', 'g_metro', 'state_1'], dtype='object')

In [44]:
# save this so I don't have to keep hitting Google 

city_counts_df.to_csv("resources/output/indeed_ds_city_openings_to_heatmap.csv")