## Importing Data & Libraries

In [1]:
import pandas as pd
import regex as re
import os

In [2]:
## Reads all csv's in granular folder
df = pd.DataFrame()
for file in os.listdir('../data/exports/granular/'):
    filename = os.fsdecode(file)
    if filename.endswith('.csv'):
        df = pd.concat([df, pd.read_csv('../data/exports/granular/'+filename)])
    else:
        pass

In [3]:
## Renames addresses where the country is in its full name rather than the 3 letter code
df['end_address'] = [address.replace('United States', 'USA') for address in df['end_address']]
df['end_address'] = [address.replace('Canada', 'CAN') for address in df['end_address']]

In [4]:
## Creates a country column
country = []
for address in df['end_address']:
    if 'USA' in address:
        country.append('USA')
    elif 'CAN' in address:
        country.append('CAN')
    else:
        country.append('Unidentified')
df['country'] = country

In [5]:
## Drops row where the country isn't usa or CAN
df.drop(df[((df['country'] != 'USA') & (df['country'] != 'CAN'))].index, inplace=True)
df.reset_index(inplace=True, drop=True)

In [6]:
## Creates a state column
state = []
for address in df['end_address']:
    try:
         state.append(re.search('(A[KLRZ]|C[AOT]|D[CE]|FL|GA|HI|I[ADLN]|K[SY]|LA|M[ADEINOST]|N[CDEHJMVY]|O[HKR]'
                                '|P[AR]|RI|S[CD]|T[NX]|UT|V[AIT]|W[AIVY])', address).group(0))
    except:
        state.append('Unidentified')
df['state'] = state

In [7]:
## Resets duration and distance to floats
df['duration'] = df['duration'].astype('float')
df['distance'] = df['distance'].astype('float')

## Calculate Swamp Score

In [8]:
def calculate_points(df):
    for i, row in df.iterrows():
        if row['duration'] < 600:
            df.loc[i, 'proximity'] = 'close'
            df.loc[i, 'points'] = 3
        elif row['duration'] < 1200:
            df.loc[i, 'proximity'] = 'medium'
            df.loc[i, 'points'] = 2
        elif row['duration'] < 3600:
            df.loc[i, 'proximity'] = 'far'
            df.loc[i, 'points'] = 1
        else:
            df.loc[i, 'proximity'] = 'over_hour'
            df.loc[i, 'points'] = 0
    return df

In [9]:
def calculate_gradient_points(df):
    for i, row in df.iterrows():
        if row['duration'] <= 3600:
            df.loc[i, 'gradient_points'] = abs(row['duration'] - 3600)
        else:
            df.loc[i, 'gradient_points'] = 0
    return df

In [10]:
def calculate_swamp_score(df):
    iter_address = set(df['start_address'])
    swamp_df = pd.DataFrame()
    for i, i_address in enumerate(iter_address):
        swamp_df.loc[i,'start_address'] = i_address
        
        conv_store_points = df[df['start_address'] == i_address].groupby('label').sum()['points']['conv_store']
        swamp_df.loc[i,'conv_store_points'] = conv_store_points
        
        fast_food_points = df[df['start_address'] == i_address].groupby('label').sum()['points']['fast_food_rest']
        swamp_df.loc[i,'fast_food_points'] = fast_food_points
        
        groc_store_points = df[df['start_address'] == i_address].groupby('label').sum()['points']['groc_store']
        swamp_df.loc[i,'groc_store_points'] = groc_store_points
        
        swamp_df.loc[i, 'swamp_score'] = (conv_store_points + fast_food_points) / groc_store_points
        
        
        conv_store_grad_points = df[df['start_address'] == i_address].groupby('label')\
            .sum()['gradient_points']['conv_store']
        swamp_df.loc[i,'conv_store_grad_points'] = conv_store_grad_points
        fast_food_grad_points = df[df['start_address'] == i_address].groupby('label')\
            .sum()['gradient_points']['fast_food_rest']
        swamp_df.loc[i,'fast_food_grad_points'] = fast_food_grad_points
        groc_store_grad_points = df[df['start_address'] == i_address]\
            .groupby('label').sum()['gradient_points']['groc_store']
        swamp_df.loc[i,'groc_store_grad_points'] = groc_store_points
        swamp_df.loc[i, 'grad_swamp_score'] = (conv_store_grad_points + fast_food_grad_points)\
            / groc_store_grad_points
        
    return swamp_df

In [11]:
df = calculate_points(df)

In [12]:
df = calculate_gradient_points(df)

In [15]:
swamp_df = calculate_swamp_score(df)


Unnamed: 0,start_address,swamp_score,grad_swamp_score
0,803-Sheep-Farm-Rd-Weybridge-VT,1.388889,1.18305
1,410-Delaware-St-Denver-CO,3.689362,4.052134
2,351-California-St-Ste-450-San-Francisco-CA,3.838983,3.865289


In [18]:
swamp_df

Unnamed: 0,start_address,conv_store_points,fast_food_points,groc_store_points,swamp_score,conv_store_grad_points,fast_food_grad_points,groc_store_grad_points,grad_swamp_score
0,803-Sheep-Farm-Rd-Weybridge-VT,6.0,69.0,54.0,1.388889,1485.0,60747.0,54.0,1.18305
1,410-Delaware-St-Denver-CO,210.0,1524.0,470.0,3.689362,283283.0,2316882.0,470.0,4.052134
2,351-California-St-Ste-450-San-Francisco-CA,103.0,1256.0,354.0,3.838983,167376.0,1714811.0,354.0,3.865289


In [17]:
df.to_csv('../data/exports/granular_master.csv', index=False)