# Data Scientist Open Positions by Metro Area

In [208]:
# Import dependencies
import pandas as pd
import numpy as np
import requests

### Define Function to Calculate Distance Between Two Sets of Latitude & Longitude

In [209]:
def gc_dist_np(lat1, lon1, lat2, lon2):
    """
    Return the great circle distance in miles between two points.

    Uses the Haversine Formula and presumes a spherical Earth with radius R
    (elevation not taken into account), so the result is an approximation.

    Each argument may be a scalar or an array-like; the four arguments are
    converted independently and then broadcast against each other, so a single
    point can be compared against whole arrays/columns of points.

    Args:
        lat1 - Latitude of point 1, in decimal degrees
        lon1 - Longitude of point 1, in decimal degrees
        lat2 - Latitude of point 2, in decimal degrees
        lon2 - Longitude of point 2, in decimal degrees

    Returns:
        Distance in statute miles (a NumPy scalar for scalar inputs, otherwise
        an array-like with the broadcast shape of the inputs)
    """
    R = 3956  # Approximate radius of the earth in statute miles

    # Convert each coordinate on its own: packing all four into one list would
    # fail (or build a ragged array) when scalars and arrays are mixed.
    lat1, lon1, lat2, lon2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))

    a = (np.sin((lat2 - lat1) / 2.0) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2)
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))  # great circle distance in radians

    return R * c

### Read Raw Data CSV Files into Two DataFrames

In [210]:
# Job openings per city: load only the four named columns
openings_df = pd.read_csv(
    'openings_by_city_lat_long.csv',
    sep=',',
    usecols=['city', 'openings', 'lat', 'long'],
)

# Metro-area population data: read with header=None, select the columns
# by position and assign descriptive names to them
metro_area_df = pd.read_csv(
    'metro_area_pop_data.csv',
    sep=',',
    header=None,
    usecols=[1, 2, 6, 7],
    names=['metro', 'population', 'lat', 'lon'],
)

In [211]:
# Preview the first rows of the raw openings data
openings_df.head()

Unnamed: 0,city,openings,lat,long
0,"New York, NY",169,40.712775,-74.005973
1,"Boston, MA",158,42.360083,-71.05888
2,"Charlotte, NC",106,35.227087,-80.843127
3,"Philadelphia, PA",102,39.952584,-75.165222
4,"Irving, TX",101,32.814018,-96.948894


In [212]:
# Preview the first rows of the raw metro-area data
metro_area_df.head()

Unnamed: 0,metro,population,lat,lon
0,"New York-Newark-Jersey City, NY-NJ-PA",19949502,40.712784,-74.005941
1,"Los Angeles-Long Beach-Anaheim, CA",13131431,34.052234,-118.243685
2,"Chicago-Naperville-Elgin, IL-IN-WI",9537289,41.878114,-87.629798
3,"Dallas-Fort Worth-Arlington, TX",6810913,32.78014,-96.800451
4,"Houston-The Woodlands-Sugar Land, TX",6313158,29.760193,-95.36939


### Clean, Organize and Prepare Data for Processing

In [213]:
# Keep only these four columns, in this order
metro_area_df = metro_area_df[['metro', 'population', 'lat', 'lon']]

In [214]:
# Preview the metro-area data after column selection
metro_area_df.head()

Unnamed: 0,metro,population,lat,lon
0,"New York-Newark-Jersey City, NY-NJ-PA",19949502,40.712784,-74.005941
1,"Los Angeles-Long Beach-Anaheim, CA",13131431,34.052234,-118.243685
2,"Chicago-Naperville-Elgin, IL-IN-WI",9537289,41.878114,-87.629798
3,"Dallas-Fort Worth-Arlington, TX",6810913,32.78014,-96.800451
4,"Houston-The Woodlands-Sugar Land, TX",6313158,29.760193,-95.36939


In [215]:
# Rename "long" to "lon" in place so the column name matches metro_area_df
openings_df.rename(columns={"long": "lon"}, inplace=True)

In [216]:
# Split "City, ST" on the first ", " only; column 0 holds the city name and
# column 1 holds whatever follows (the state abbreviation)
city_state_df = openings_df['city'].str.split(pat=", ", n=1, expand=True)
openings_df['city'] = city_state_df[0]
openings_df['state'] = city_state_df[1]

In [217]:
# Reorder the columns so the new state column sits next to city
openings_df = openings_df[['city', 'state', 'openings', 'lat', 'lon']]

In [218]:
# Preview the openings data after the city/state split and column reorder
openings_df.head()

Unnamed: 0,city,state,openings,lat,lon
0,New York,NY,169,40.712775,-74.005973
1,Boston,MA,158,42.360083,-71.05888
2,Charlotte,NC,106,35.227087,-80.843127
3,Philadelphia,PA,102,39.952584,-75.165222
4,Irving,TX,101,32.814018,-96.948894


### Create New Dataframe with Final Results

In [219]:
# Initialize dict to hold openings by metro area data
openings_by_metro_dict = {}
for i_pop, r_pop in metro_area_df.iterrows():
    # Metro areas with fewer than 500,000 inhabitants are disregarded. The check
    # depends only on the metro row, so it is done once per metro area instead
    # of once per (metro area, city) pair.
    if r_pop['population'] < 500000:
        continue

    metro_cnt = 0
    for i_open, r_open in openings_df.iterrows():
        # Use Haversine Formula to calculate distance from metro area to city
        dist = gc_dist_np(r_pop['lat'], r_pop['lon'], r_open['lat'], r_open['lon'])
        # Only cities within 50 miles of the metro area are counted
        if dist <= 50:
            # Maintain an accumulator of openings per metro area. The dict entry
            # is written only on a match, so metro areas with no city in range
            # never appear in the result.
            # NOTE(review): a city within 50 miles of several metro areas is
            # counted toward each of them - confirm this is intended.
            metro_cnt += int(r_open['openings'])
            openings_by_metro_dict[r_pop['metro']] = metro_cnt

In [220]:
# Turn the {metro area: openings} dict into a one-column DataFrame indexed by
# metro area name
openings_by_metro_area_df = pd.DataFrame.from_dict(
    openings_by_metro_dict,
    orient='index',
    columns=['openings'],
)

In [222]:
# Display metro areas ranked by openings, highest first (the sorted result is
# shown but not assigned back to the variable)
openings_by_metro_area_df.sort_values('openings', ascending=False)

Unnamed: 0,openings
"New York-Newark-Jersey City, NY-NJ-PA",209
"Worcester, MA-CT",187
"Boston-Cambridge-Newton, MA-NH",180
"Providence-Warwick, RI-MA",179
"San Francisco-Oakland-Hayward, CA",172
"San Jose-Sunnyvale-Santa Clara, CA",170
"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",140
"Allentown-Bethlehem-Easton, PA-NJ",138
"Dallas-Fort Worth-Arlington, TX",120
"Charlotte-Concord-Gastonia, NC-SC",106
