In [1]:
import json
import os
import re

import pandas as pd

In [2]:
data_dir = "../data/NAICS/"
naics_datafiles = os.listdir(data_dir)

# Split the directory listing by the year that appears in each file name.
# sorted() is applied because os.listdir() gives no ordering guarantee.
naics_2022_files = sorted(name for name in naics_datafiles if "2022" in name)
naics_2017_files = sorted(name for name in naics_datafiles if "2017" in name)
naics_2022_files

['2-6_digit_2022_Codes.xlsx',
 '2022_NAICS_Cross_References.xlsx',
 '2022_NAICS_Descriptions.xlsx',
 '2022_NAICS_Index_File.xlsx',
 '2022_NAICS_Manual.pdf',
 '2022_NAICS_Structure.xlsx',
 '2022_NAICS_Structure_Summary_Table.xlsx',
 '6-digit_2022_Codes.xlsx']

In [3]:
# Select the two workbooks by name rather than by position in the sorted
# listing: positions shift whenever another file containing "2022" shows up in
# the directory (for example an Office "~$..." lock file, which sorts last).
six_digit_codes_basename = '6-digit_2022_Codes.xlsx'
two_six_digit_codes_basename = '2-6_digit_2022_Codes.xlsx'

# Fail early with a readable message if either workbook is absent.
missing_files = {six_digit_codes_basename, two_six_digit_codes_basename} - set(naics_2022_files)
if missing_files:
    raise FileNotFoundError(
        f'Expected NAICS file(s) not found in {data_dir}: {sorted(missing_files)}'
    )

six_digit_codes_filename = data_dir + six_digit_codes_basename
two_six_digit_codes_filename = data_dir + two_six_digit_codes_basename

In [4]:
def clean_2022_columns(x):
    """Normalize a raw NAICS spreadsheet column header.

    Steps, in order:
      1. collapse every run of whitespace into a single space,
      2. remove the literal text '2022',
      3. strip leading and trailing whitespace,
      4. replace the remaining spaces with underscores,
      5. remove periods.

    Example: '2022 NAICS US   Code' -> 'NAICS_US_Code'.

    Parameters
    ----------
    x : str
        Original column header.

    Returns
    -------
    str
        The cleaned column name.
    """
    collapsed = re.sub(r'\s+', ' ', x)
    return collapsed.replace('2022', '').strip().replace(' ', '_').replace('.', '')

## 6 Digit NAICS Codes

### Load data, drop NaN rows/columns, and clean column names

In [5]:
# Read both columns as strings so the codes stay text (the length check
# further down relies on the .str accessor).
dtype = {'2022 NAICS Code': 'str', '2022 NAICS Title': 'str'}
df_6_digit_2022 = (
    pd.read_excel(six_digit_codes_filename, dtype=dtype)
    .dropna(how="all")                    # discard rows in which every cell is empty
    .drop(columns=['Unnamed: 2'])         # unnamed extra column in the sheet
    .rename(columns=clean_2022_columns)   # e.g. '2022 NAICS Code' -> 'NAICS_Code'
)
df_6_digit_2022.head()

Unnamed: 0,NAICS_Code,NAICS_Title
1,111110,Soybean Farming
2,111120,Oilseed (except Soybean) Farming
3,111130,Dry Pea and Bean Farming
4,111140,Wheat Farming
5,111150,Corn Farming


### Confirm that all NAICS Codes in this file are 6 digits

In [6]:
# Reduce the check to a single boolean. Comparing `.unique()` (an ndarray) with
# `[6]` yields an array, so mixed lengths or missing codes would raise an
# unrelated ValueError instead of a clear assertion failure.
code_lengths = df_6_digit_2022['NAICS_Code'].str.len()
assert not code_lengths.empty, "No NAICS codes were loaded"
assert code_lengths.eq(6).all(), (
    "Expected every NAICS code to be 6 characters long; "
    f"observed length counts: {code_lengths.value_counts(dropna=False).to_dict()}"
)

## 2-6 Digit NAICS Codes

In [7]:
# Note: the code column header in the workbook contains three consecutive
# spaces, so the dtype key must match it exactly.
dtype = {'2022 NAICS US   Code': 'str', '2022 NAICS US Title': 'str'}
df_2_6_digit_2022 = pd.read_excel(two_six_digit_codes_filename, dtype=dtype)
df_2_6_digit_2022 = df_2_6_digit_2022.dropna(how="all")

# Show the raw headers before dropping and renaming anything.
print(df_2_6_digit_2022.columns)

jank_colunms = ['Unnamed: 3', 'Unnamed: 4']
df_2_6_digit_2022 = (
    df_2_6_digit_2022
    .drop(columns=jank_colunms)
    .rename(columns=clean_2022_columns)
)
df_2_6_digit_2022.head()

Index(['Seq. No.', '2022 NAICS US   Code', '2022 NAICS US Title', 'Unnamed: 3',
       'Unnamed: 4'],
      dtype='object')


Unnamed: 0,Seq_No,NAICS_US_Code,NAICS_US_Title
1,1.0,11,"Agriculture, Forestry, Fishing and Hunting"
2,2.0,111,Crop Production
3,3.0,1111,Oilseed and Grain Farming
4,4.0,11111,Soybean Farming
5,5.0,111110,Soybean Farming


Build a lookup dictionary that maps each NAICS code to its title, so that the titles of a code and of its parent levels can be found by looking up the code's prefixes:

In [8]:
# Map every NAICS code (2 to 6 digits) to its title.
naics_lookup = dict(
    zip(df_2_6_digit_2022['NAICS_US_Code'], df_2_6_digit_2022['NAICS_US_Title'])
)

In [9]:
sample_code = '111110'

# Walk through every leading prefix of the code and print the ones that have
# an entry in the lookup table.
for end in range(1, len(sample_code) + 1):
    prefix = sample_code[:end]
    title = naics_lookup.get(prefix)
    if title is None:
        continue
    print(f'{prefix}: {title}')


11: Agriculture, Forestry, Fishing and Hunting
111: Crop Production
1111: Oilseed and Grain Farming
11111: Soybean Farming
111110: Soybean Farming


In [10]:
# Persist the code -> title mapping as JSON alongside the source workbooks.
lookup_json_path = f'{data_dir}naics_lookup.json'
with open(lookup_json_path, 'w') as json_file:
    json.dump(naics_lookup, json_file)