In [1]:
! pip install miceforest

Collecting miceforest
  Downloading miceforest-5.6.3-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting blosc (from miceforest)
  Downloading blosc-1.11.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Installing collected packages: blosc, miceforest
Successfully installed blosc-1.11.1 miceforest-5.6.3


# Creating Distance Matrix and processed_zipcode_data parquet files.

In [2]:
import pandas as pd
import time
import os
import logging
logging.basicConfig(level=logging.INFO, format="%(message)s")
import numpy as np
from miceforest import ImputationKernel
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from scipy.spatial import distance_matrix

from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler(feature_range=(0.1, .95))

In [3]:
class ZipCodeProcessor():
    def __init__(self):
        self.zipcode_data = os.path.join(os.getcwd(),'data','uszips.csv')
        self.drop_columns=[
            'state_id', 'parent_zcta', 'imprecise', 'metdiv_fips', 
            'metdiv_name', 'county_fips', 'county_weights',
            'county_names_all', 'county_fips_all', 'cbsa_fips',
            'cbsa_name', 'cbsa_metro', 'csa_fips', 'csa_name'
        ]
        self.random_state = 1
        self.variance_coverage = .90
        self.clusters = 15
        
    def clean_zipcode_data(self, zipcode_df):
        """
        This function drops un-necessary columns from the dataframe.
        It also removes row with null lat and lng and military zipcodes.
        """
        zipcode_df.drop(columns=self.drop_columns, inplace=True)
        zipcode_df.dropna(subset=['lat','lng'], how='all', inplace=True)
        zipcode_df = zipcode_df[zipcode_df['military']==False]
        logging.info(
            f"Function clean_zipcode_data ran successfully. Output Dataframe Shape - {zipcode_df.shape}",
        )
        return zipcode_df
    
    def imputing_missing_data(self, numerical_data):   
        """
        This function uses LGBM model to fill up empty column values 
        one at a time. It uses n-1 columns to create a model which predicts
        empty column values of a single column.
        This above algorithm is iterated until every null value is replaced.
        """ 
        logging.info(
            f"Starting data imputation for missing numerical columns. This will take around 12 minutes to complete.",
        )
        start = time.time()
        mice_kernel = ImputationKernel(
        data = numerical_data,
        save_all_iterations = False,
        random_state = self.random_state
        )
        
        mice_kernel.mice(10)
        imputed_zipcode_df = mice_kernel.complete_data()
        end = time.time()
        logging.info(
            f"Completed numerical data imputation. Time taken - ({str(round(end-start,2))}) seconds",
        )
        assert imputed_zipcode_df.isnull().sum().sum() == 0
        return imputed_zipcode_df
    
    def featurizing_categorical_data(self,categorical_data):
        """
        This function replaces nan values with 'Missing' in categorical columns.
        """
        logging.info(
            f"Featurizing categorical data.",
        )
        for col in categorical_data.columns:
            categorical_data[col].fillna('Missing', inplace=True)
        return categorical_data
    
    def processing_zipcode_data(self, zipcode_data_df):
        """"
        Removing categorical data 
        """
        logging.info(
            f"processing zipcode data !!",
        )
        zipcode_data_df.drop([col for col in zipcode_data_df.columns if zipcode_data_df[col].dtypes=='object'], axis=1,inplace=True)
        return zipcode_data_df
    
    def scaling_and_featurizing_data(self, zipcode_data_df):
        """
        This function separates the zipcode with it's demographics data.
        This function also scales the data to a standard scale and creates new features 
        with a 95% variance coverage using PCA.  
        """
        zip_X = zipcode_data_df['zip']
        X_input = zipcode_data_df.drop('zip',axis=1)
        
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(X_input)
        
        pca = PCA(n_components = self.variance_coverage, random_state = self.random_state)
        scaled_pca_data = pca.fit_transform(scaled_data)
        scaled_pca_data = pd.DataFrame(scaled_pca_data, columns=['scaped_pca_'+str(i) for i in range(scaled_pca_data.shape[1])])
        zip_X.reset_index(drop=True, inplace=True)
        scaled_pca_data.reset_index(drop=True, inplace=True)
        logging.info(
            f"Scaled the data and created PCA features. zip_X shape - {zip_X.shape}, scaled_pca_data shape - {scaled_pca_data.shape}"
        )
        return zip_X, scaled_pca_data
    
    def creating_clusters(self, scaled_pca_data, zip_X):
        """"
        This function assigns each row with a cluster value ranging from 0 to self.clusters-1.
        The function also combines, zipcode, pca features data and cluster labels.
        It also stores the row index per cluster.
        """
        logging.info(
            f"Running KMeans algorithm and creating clusters.",
        )
        kmeans = KMeans(n_clusters=self.clusters, max_iter=500, random_state = self.random_state)
        kmeans.fit(scaled_pca_data)
       
        combined_zipcode_data = pd.concat([zip_X, scaled_pca_data], axis=1)
        combined_zipcode_data['cluster_label'] = kmeans.labels_
        
        return combined_zipcode_data
    
        # return cluster_info, combined_zipcode_data

    def creating_distance_matrix(self, final_zipcode_data):
        """
        For a dataframe scaled_pca_data with shape n x m.
        This function calculates the normalized distance between rows and 
        returns a n x n dataframe.
        """
        logging.info(
            f"Creating distance matrix. It would take approx 20 minutes to complete.",
        )
        start = time.time()
        final_zipcode_data.set_index('zip',inplace=True)
        distance_matrix_data = np.triu(distance_matrix(final_zipcode_data.values, final_zipcode_data.values))
        distance_matrix_data = pd.DataFrame(distance_matrix_data, 
            index=final_zipcode_data.index, 
            columns=final_zipcode_data.index)
        end = time.time()
        logging.info(
            f"Creating distance matrix function ran successfully. Time taken - {round(end-start,2)}",
        )
        return distance_matrix_data
        
    def execute(self):
        """
        This is the main function, calls all other functions and return the final output.
        
        """
        zipcode = pd.read_csv(self.zipcode_data)
        zipcode = self.clean_zipcode_data(zipcode)
        
        # considering only the zipcodes where populations is greater than 1000
        zipcode = zipcode[zipcode['population']>1000]
        print("zipcode shape --> ", zipcode.shape)
        
        cat_columns = [col for col in zipcode.columns if zipcode[col].dtypes=='object']
        cat_data = zipcode[cat_columns]
        cat_data = self.featurizing_categorical_data(cat_data)
        
        numerical_data = zipcode.drop(cat_columns, axis=1)
        numerical_data = self.imputing_missing_data(numerical_data)
        
        zipcode_data = self.processing_zipcode_data(numerical_data)
        zip_X, scaled_pca_data = self.scaling_and_featurizing_data(zipcode_data)

        final_zipcode_data = self.creating_clusters(scaled_pca_data, zip_X)
        distance_matrix_data = self.creating_distance_matrix(final_zipcode_data)
        cat_data.reset_index(drop=True, inplace=True)
        final_zipcode_data.reset_index(inplace=True)
        final_zipcode_data = pd.concat([cat_data, final_zipcode_data], axis=1)
        final_zipcode_data.reset_index(inplace=True)
        distance_matrix_data.reset_index(inplace=True)
        final_zipcode_data.to_parquet(os.path.join(os.getcwd(),'data','reduced_23k_processed_zipcode_data.parquet'),index=False)
        distance_matrix_data.columns = distance_matrix_data.columns.astype(str)
        # Identify float32 columns
        float64_columns = distance_matrix_data.select_dtypes(include='float64').columns

        # Convert float64 columns to float16
        distance_matrix_data[float64_columns] = distance_matrix_data[float64_columns].astype('float16')
        
        distance_matrix_data.to_pickle("reduced_f16_23k_distance_matrix_data.pkl.gz", compression = "gzip")
#         distance_matrix_data.to_parquet(os.path.join(os.getcwd(),'data','distance_matrix_data.parquet'),index=False)
        logging.info(
            f"Processed zipcode data. Created distance matrix data. Exiting now !!",
        )
        
if __name__ == "__main__":
    zipcodeobj = ZipCodeProcessor()
    zipcodeobj.execute()  

Function clean_zipcode_data ran successfully. Output Dataframe Shape - (41044, 74)
Featurizing categorical data.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
Starting data imputation for missing numerical columns. This will take around 12 minutes to complete.
Completed numerical data imputation. Time taken - (685.82) seconds
processing zipcode data !!
Scaled the data and created PCA features. zip_X shape - (41044,), scaled_pca_data shape - (41044, 39)
Running KMeans algorithm and creating clusters.
Creating distance matrix. It would take approx 20 minutes to complete.
Creating distance matrix function ran successfully. Time taken - 355.1
Processed zipcode data. Created distance matrix data. Exiting now !!


 # creating clusters and zipcode dict

In [17]:
def creating_clusters_info(cluster_zipcode_mapping_df):
    """"
    This function assigns each row with a cluster value ranging from 0 to clusters-1.
    It also stores the zipcodes per cluster.
    """       
    cluster_info = {}
    for cluster in cluster_zipcode_mapping_df.cluster_label.unique():
        cluster_info[str(cluster)] = cluster_zipcode_mapping_df[cluster_zipcode_mapping_df['cluster_label']==cluster].zip.tolist()

    return cluster_info

In [18]:
import pandas as pd
import json

In [19]:
df = pd.read_parquet("reduced_23k_processed_zipcode_data.parquet")
df.head()

Unnamed: 0,index,city,state_name,county_name,timezone,zip,scaped_pca_0,scaped_pca_1,scaped_pca_2,scaped_pca_3,...,scaped_pca_30,scaped_pca_31,scaped_pca_32,scaped_pca_33,scaped_pca_34,scaped_pca_35,scaped_pca_36,scaped_pca_37,scaped_pca_38,cluster_label
0,0,Holtsville,New York,Suffolk,America/New_York,501,3.00497,6.080473,3.230265,-4.841,...,0.556804,3.012783,-0.19268,0.564522,0.870385,-0.314172,0.519797,-0.021815,-0.376686,2
1,1,Holtsville,New York,Suffolk,America/New_York,544,1.425837,4.875224,2.051417,-3.297107,...,1.464187,1.44381,0.082843,-0.34839,0.145306,-1.459391,0.090508,0.185816,-0.561256,2
2,2,Adjuntas,Puerto Rico,Adjuntas,America/Puerto_Rico,601,-5.780947,7.075884,4.196387,-3.592018,...,0.560892,0.113311,1.176035,0.509109,1.635752,-2.163695,-0.257004,-0.248353,-3.822356,2
3,3,Aguada,Puerto Rico,Aguada,America/Puerto_Rico,602,-4.262345,6.9471,4.525468,-5.106659,...,0.310821,0.217988,-0.16817,-3.129755,1.188845,-2.575237,-1.203695,-0.092163,-1.414716,2
4,4,Aguadilla,Puerto Rico,Aguadilla,America/Puerto_Rico,603,-4.087306,6.937841,4.75934,-3.926593,...,0.292455,-0.316128,0.835838,0.094107,0.980672,-1.914424,-0.061645,0.461391,-2.893598,2


In [20]:
cluster_zipcode_mapping = creating_clusters_info(df)
cluster_zipcode_mapping

{'2': [501,
  544,
  601,
  602,
  603,
  604,
  605,
  606,
  610,
  611,
  612,
  613,
  614,
  616,
  617,
  622,
  623,
  624,
  627,
  631,
  636,
  637,
  638,
  641,
  646,
  647,
  650,
  652,
  653,
  656,
  659,
  660,
  662,
  664,
  667,
  669,
  670,
  674,
  676,
  677,
  678,
  680,
  682,
  683,
  685,
  687,
  688,
  690,
  692,
  693,
  694,
  698,
  703,
  704,
  705,
  707,
  714,
  715,
  716,
  717,
  718,
  719,
  720,
  721,
  723,
  725,
  726,
  727,
  728,
  729,
  730,
  731,
  732,
  733,
  734,
  735,
  736,
  737,
  738,
  739,
  740,
  741,
  742,
  744,
  745,
  751,
  754,
  757,
  765,
  766,
  767,
  769,
  771,
  772,
  773,
  775,
  777,
  778,
  780,
  782,
  783,
  784,
  785,
  791,
  792,
  794,
  795,
  801,
  802,
  803,
  805,
  820,
  821,
  822,
  823,
  824,
  830,
  831,
  840,
  841,
  851,
  901,
  902,
  907,
  908,
  909,
  910,
  911,
  912,
  913,
  914,
  915,
  916,
  917,
  918,
  919,
  920,
  921,
  922,
  923,
  924,
  925,
 

In [21]:
with open("reduced_23k_cluster_zipcode_mapping.json", "w") as outfile:
    json.dump(cluster_zipcode_mapping, outfile, indent=4)