In [1]:
import numpy as np
import pandas as pd
import glob
import pickle
from tqdm import tqdm
from datetime import datetime
from collections import Counter

In [2]:
from google.colab import drive as mountGoogleDrive 
mountGoogleDrive.mount('/content/drive')

Mounted at /content/drive


In [3]:
in_dir = r'/content/drive/MyDrive/Research/Safegraph/scratch/20221205/batch'
out_dir = r'/content/drive/MyDrive/Research/Safegraph/scratch/20221205/'

# Data Processing

In [4]:
def print_bad_files(in_dir):
    """Combines pickle files of lists of json files that didn't load
    """
    pickle_files = glob.glob(f'{in_dir}/*.pickle')
    bad_files = []
    for picklepath in pickle_files:
        with open(picklepath, 'rb') as f:
            bad_files.append(pickle.load(f))
    bad_files = [item for sublist in bad_files for item in sublist]
    print(bad_files)


def combine_batch_csv(in_dir):
    """Combines Safegraph dataframes into one
    """
    csv_files = glob.glob(f'{in_dir}/*.csv')

    df_list = []

    for filename in tqdm(csv_files):
        df = pd.read_csv(filename, header=0, index_col=0)
        df = df[~df['raw_visitor_counts'].isnull()]
        df = df[~df['node.safegraph_core.naics_code'].isnull()]
        df_list.append(df)

    df = pd.concat(df_list, axis=0)

    return df


def clean_df(df, density_cutoff=150, dwell_cutoff=500, avg_by_naics=True):
    """ Returns dataframe with density and median dwell time info
    """
    # drop nans
    # df = df[~df['raw_visitor_counts'].isnull()]
    # df = df[~df['node.safegraph_core.naics_code'].isnull()]

    # set types
    df['node.safegraph_core.naics_code'] = df['node.safegraph_core.naics_code'].astype(int).astype(str)
    naics_code_lengths = df['node.safegraph_core.naics_code'].apply(len).tolist()
    print(Counter(naics_code_lengths))

    # create features
    df['density'] = df['raw_visit_counts'] / df['node.safegraph_geometry.wkt_area_sq_meters']

    # remove outliers
    df = df.loc[(df['median_dwell'] < dwell_cutoff) & (df['density'] < density_cutoff),:]

    # filter to 2019
    df['start'] = pd.to_datetime(df.start, format='%Y-%m-%d')
    df['end'] = pd.to_datetime(df.end, format='%Y-%m-%d')
    df = df.loc[(df.start > '2019-01-01') & (df.start < '2019-12-31'), :]

    # aggregate
    if avg_by_naics:
        df_mean = df.groupby('node.safegraph_core.naics_code')['density', 'median_dwell'].mean()
    else:
        df_mean = df.groupby('node.placekey')['density', 'median_dwell'].mean()
    
    # log transform
    df_mean['log_density'] = np.log(df_mean['density'])
    df_mean['log_median_dwell'] = np.log(df_mean['median_dwell'])
   
    df_mean = df_mean.reset_index()

    return df_mean

In [5]:
df = combine_batch_csv(in_dir)

100%|██████████| 9/9 [02:20<00:00, 15.59s/it]


In [6]:
df['node.safegraph_core.naics_code'] = df['node.safegraph_core.naics_code'].astype(int).astype(str)
naics_map = df.loc[:,['node.safegraph_core.naics_code','node.safegraph_core.top_category','node.safegraph_core.sub_category']].drop_duplicates()

In [7]:
naics_df = clean_df(df)

Counter({6: 4977967, 4: 325067, 5: 43005, 3: 14083})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['start'] = pd.to_datetime(df.start, format='%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['end'] = pd.to_datetime(df.end, format='%Y-%m-%d')
  df_mean = df.groupby('node.safegraph_core.naics_code')['density', 'median_dwell'].mean()


In [8]:
naics_df.shape

(261, 5)

In [None]:
naics_df.to_csv(f"{out_dir}/merge_naics.csv", index=False)

# Clustering

In [None]:
X = naics_df.loc[:,['log_density','log_median_dwell']]

In [None]:
from sklearn.mixture import BayesianGaussianMixture

bay_gmm = BayesianGaussianMixture(n_components=4, n_init=10, max_iter=1000)

bay_gmm.fit(X)

BayesianGaussianMixture(max_iter=1000, n_components=4, n_init=10)

In [None]:
bay_gmm.weights_ * 100

array([43.20723662, 32.62858208, 17.71798524,  6.44619606])

In [None]:
bay_gmm.weight_concentration_prior_

0.25

In [None]:
bay_gmm.mean_precision_prior_

1.0

In [None]:
bay_gmm.mean_prior_

array([-3.45683499,  4.47148407])

In [None]:
y_pred = bay_gmm.predict(X)

In [None]:
naics_df['cluster'] = y_pred

In [None]:
merge_df = naics_df[['node.safegraph_core.naics_code','cluster']].merge(naics_map)

In [None]:
merge_df.to_csv(f"{out_dir}/cluster.csv", index=False)

In [None]:
for cluster in range(4):
  print(np.unique(merge_df.loc[merge_df['cluster'] == cluster,'node.safegraph_core.top_category']))

['Activities Related to Real Estate' 'Administration of Economic Programs'
 'Amusement Parks and Arcades' 'Automobile Dealers'
 'Automotive Parts, Accessories, and Tire Stores'
 'Bakeries and Tortilla Manufacturing' 'Beer, Wine, and Liquor Stores'
 'Beverage Manufacturing' 'Book Stores and News Dealers'
 'Building Finishing Contractors' 'Building Material and Supplies Dealers'
 'Civic and Social Organizations' 'Clothing Stores'
 'Consumer Goods Rental'
 'Continuing Care Retirement Communities and Assisted Living Facilities for the Elderly'
 'Death Care Services' 'Department Stores'
 'Drinking Places (Alcoholic Beverages)'
 'Electronics and Appliance Stores' 'Florists' 'Furniture Stores'
 'Gambling Industries' 'Gasoline Stations'
 'General Merchandise Stores, including Warehouse Clubs and Supercenters'
 'Glass and Glass Product Manufacturing' 'Grocery Stores'
 'Grocery and Related Product Merchant Wholesalers'
 'Health and Personal Care Stores' 'Home Furnishings Stores'
 'Individual and