## Part 1: Load and Process Dataset For Recommender System

In [68]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import pandas as pd
import re

In [69]:
# Load the NAICS index file: one row per index entry, so a naics code can appear on several rows.
df = pd.read_csv(r'assets/2017_NAICS_Index_File.csv')

In [70]:
# Preview the index entries (columns: naics, description).
df.head()

Unnamed: 0,naics,description
0,111110,"Soybean farming, field and seed production"
1,111120,"Canola farming, field and seed production"
2,111120,"Flaxseed farming, field and seed production"
3,111120,"Mustard seed farming, field and seed production"
4,111120,"Oilseed farming (except soybean), field and se..."


In [71]:
# Load the NAICS descriptions file: Code, Title and Description for every level of the hierarchy.
des_df = pd.read_csv(r'assets/2017_NAICS_Descriptions.csv')

In [72]:
# Preview the descriptions. Note the shorter (aggregate) codes, whose titles end in a 'T' marker,
# and the rows whose description only points at another code ("See industry description for ...").
des_df.head()

Unnamed: 0,Code,Title,Description
0,11,"Agriculture, Forestry, Fishing and HuntingT","The Sector as a Whole\n\nThe Agriculture, Fore..."
1,111,Crop ProductionT,Industries in the Crop Production subsector gr...
2,1111,Oilseed and Grain FarmingT,This industry group comprises establishments p...
3,11111,Soybean FarmingT,See industry description for 111110.
4,111110,Soybean Farming,This industry comprises establishments primari...


In [73]:
def _naics_prefixes(code):
    """
    Return the code prefixes a higher-level NAICS code stands for.
        code (str): a higher-level code, e.g. '111' or a sector range such as '31-33'
    return (list of str): ['31', '32', '33'] for '31-33'; [code] for anything else
    """
    span = re.fullmatch(r'(\d+)-(\d+)', code)
    if span is None or len(span.group(1)) != len(span.group(2)):
        return [code]
    width = len(span.group(1))
    first, last = int(span.group(1)), int(span.group(2))
    # A reversed range expands to nothing; fall back to the literal code in that case.
    return [str(number).zfill(width) for number in range(first, last + 1)] or [code]


def create_description_df(des_df=None):
    """
    Build one text document per 6-character NAICS code from the descriptions table.

    Each document is the row's Title plus its cleaned Description, followed by the
    documents of every shorter (higher-level) code it falls under, appended in the
    row order of the table.
        des_df (DataFrame): descriptions table with 'Code', 'Title' and 'Description'
            columns. It is not modified. When None, the table is read from
            assets/2017_NAICS_Descriptions.csv.
    return (DataFrame): columns 'Code' and 'text', restricted to 6-character codes,
        keeping the row labels of the input table
    """
    if des_df is None:
        des_df = pd.read_csv(r'assets/2017_NAICS_Descriptions.csv')

    # Rows with any missing value are unusable; dropping them yields a new frame, so the
    # caller's table is never written to.
    table = des_df.dropna()

    # Rows that only point at another code carry no text of their own.
    table = table[~table['Description'].str.contains('See industry description for', regex=False)]

    # Codes are compared by prefix below, so make sure they are strings.
    codes = table['Code'].astype(str)

    # clean space characters
    description = table['Description'].str.replace('\\n', ' ', regex=True)

    # remove common sentences across NAICS descriptions
    description = description.str.replace('The Sector as a Whole', '', regex=False)
    description = description.str.replace(
        'Cross-References. Establishments primarily engaged in--', '', regex=False)

    # Delete everything from "Excluded" onwards: those words describe activities that are
    # specifically excluded from the class.
    description = description.str.replace('Excluded(.*?)$', '', regex=True)

    # Titles are used as-is; aggregate-level titles keep their trailing 'T' marker
    # (e.g. 'Crop ProductionT').
    docs = pd.DataFrame({'Code': codes, 'text': table['Title'] + ' ' + description})

    code_length = docs['Code'].str.len()
    is_leaf = code_length == 6
    leaf_codes = docs.loc[is_leaf, 'Code']
    parents = docs.loc[code_length < 6]

    # Add the text of each higher-level class to the 6-character classes beneath it,
    # i.e. the text of class "11" is appended to "111110", "111120", etc.
    for parent_code, parent_text in zip(parents['Code'], parents['text']):
        prefixes = _naics_prefixes(parent_code)
        # All prefixes of one parent have the same length, so one slice covers them.
        matches = leaf_codes.str.slice(0, len(prefixes[0])).isin(prefixes)
        hits = leaf_codes.index[matches.to_numpy()]
        if len(hits):
            docs.loc[hits, 'text'] = docs.loc[hits, 'text'] + ' ' + parent_text

    return docs[is_leaf]

def load_data(df, des_df):
    """
    Load and process NAICS documents: combine the index entries and the class
    descriptions into one text document per NAICS code.

    The class text is attached to every index row of a code before the rows are
    grouped, so it appears once per index entry in the combined document.
        df (DataFrame): index entries with 'naics' and 'description' columns
        des_df (DataFrame): descriptions table, forwarded to create_description_df
    return (DataFrame): columns 'naics' and 'description', one row per code, with
        punctuation replaced by spaces
    """
    # description dataframe, keyed like the index file
    class_text = create_description_df(des_df).rename(columns={'Code': 'naics'})

    # Outer merge keeps codes that occur in only one of the two sources; their missing
    # side becomes an empty string so the join below always sees strings.
    merged = pd.merge(df, class_text[['naics', 'text']], how='outer', on='naics').fillna('')
    merged['description'] = merged[['description', 'text']].agg(' '.join, axis=1)

    # Remove the wildcard NAICS code. The copy makes the next assignment write to an
    # independent frame instead of a slice of `merged`.
    merged = merged[merged['naics'] != '******'].copy()

    # Remove punctuation (raw string: '\w' and '\s' are regex escapes, not string escapes).
    merged['description'] = merged['description'].str.replace(r'[^\w\s]', ' ', regex=True)

    return merged.groupby(['naics'])['description'].apply(' '.join).reset_index()

In [74]:
# Combine the index entries and class descriptions into one text document per NAICS code.
processed_df = load_data(df, des_df)

In [75]:
# View the combined, punctuation-free description built for each NAICS code.
processed_df.head()

Unnamed: 0,naics,description
0,111110,Soybean farming field and seed production Soy...
1,111120,Canola farming field and seed production Oils...
2,111130,Bean farming dry field and seed production D...
3,111140,Wheat farming field and seed production Wheat...
4,111150,Corn farming except sweet corn field and se...


## Part 2: Tokenize/Stem/Lemmatize Dataset

In [76]:
# The code below takes a couple of minutes to run. To avoid the wait, skip to RecommenderModels.ipynb, which loads the resulting files directly.

In [77]:
# Two separate sets built from the NLTK English stop-word list.
# `stop_words` is the one process_description passes to tokenizer; STOPWORDS is not referenced in the cells shown here.
stop_words = set(stopwords.words("english"))
STOPWORDS = set(stopwords.words('english'))
# Presumably word-count limits; neither is referenced in the cells shown here — TODO confirm they are still needed.
MIN_WORDS = 4
MAX_WORDS = 200

PATTERN_S = re.compile("\'s")  # matches the two characters `'s`
PATTERN_RN = re.compile("\\r\\n\\b") # matches a `\r\n` pair only when a word boundary follows; a lone `\r` or `\n` does not match
PATTERN_PUNC = re.compile(r"[^\w\s]") # matches any single character that is neither a word character (letter, digit, underscore) nor whitespace

def clean_text(text):
    """
    Normalise a raw description before it is tokenized.
        text (str): input text
    return (str): lower-cased text in which every match of PATTERN_S, PATTERN_RN and
        PATTERN_PUNC (applied in that order) has been replaced by a single space;
        digits and underscores are kept
    """
    cleaned = text.lower()
    # Order matters: `'s` has to be removed before the apostrophe is treated as punctuation.
    for pattern in (PATTERN_S, PATTERN_RN, PATTERN_PUNC):
        cleaned = pattern.sub(' ', cleaned)
    return cleaned

def tokenizer(description, stop_words, normalization):
    """
    Tokenize a description and normalize each token by lemmatizing or stemming.
        description (str): text to tokenize
        stop_words (collection of str): lower-case words to discard
        normalization (str): 'lemmatize' (WordNetLemmatizer) or 'stem' (PorterStemmer)
    return (list of str): lower-cased normalized tokens that are alphabetic, longer
        than two characters, and not stop words either before or after normalization
    raises ValueError: if normalization is not one of the two supported values
    """
    # Validate first: an unknown mode used to fall through to an undefined `tokens`.
    if normalization not in ('lemmatize', 'stem'):
        raise ValueError(
            "normalization must be 'lemmatize' or 'stem', got {!r}".format(normalization))

    if normalization == 'lemmatize':
        normalize = WordNetLemmatizer().lemmatize
    else:
        normalize = PorterStemmer().stem

    tokens = []
    for raw in word_tokenize(description):
        # Check the untouched token too: stemming/lemmatizing can turn a stop word into a
        # string that is no longer in the stop-word list (e.g. "this" -> "thi").
        if raw.lower() in stop_words:
            continue
        normalized = normalize(raw)
        lowered = normalized.lower()
        # keep alphabetic tokens longer than two characters that are not stop words
        if lowered not in stop_words and len(normalized) > 2 and normalized.isalpha():
            tokens.append(lowered)

    return tokens

def process_description(df):
    """
    Add cleaned, lemmatized and stemmed versions of the description to a dataframe.

    Stop words come from the module-level `stop_words` set.
        df (DataFrame): must have a 'description' column of strings; it is modified
            in place
    return (DataFrame): the same `df`, with 'clean_description' (str), 'lemmatized'
        (list of str) and 'stemmed' (list of str) columns added
    """
    cleaned = df['description'].apply(clean_text)
    df['clean_description'] = cleaned

    # Write to the frame that was passed in, not to the notebook-level `processed_df`,
    # so the function works for any dataframe and on a fresh kernel.
    df['lemmatized'] = cleaned.apply(tokenizer, args=(stop_words, 'lemmatize'))
    df['stemmed'] = cleaned.apply(tokenizer, args=(stop_words, 'stem'))

    return df


# Add the clean_description, lemmatized and stemmed columns (this is the slow step).
processed_df = process_description(processed_df)

In [78]:
# Compare the raw description with its cleaned, lemmatized and stemmed forms.
processed_df.head()

Unnamed: 0,naics,description,clean_description,lemmatized,stemmed
0,111110,Soybean farming field and seed production Soy...,soybean farming field and seed production soy...,"[soybean, farming, field, seed, production, so...","[soybean, farm, field, seed, product, soybean,..."
1,111120,Canola farming field and seed production Oils...,canola farming field and seed production oils...,"[canola, farming, field, seed, production, oil...","[canola, farm, field, seed, product, oilse, ex..."
2,111130,Bean farming dry field and seed production D...,bean farming dry field and seed production d...,"[bean, farming, dry, field, seed, production, ...","[bean, farm, dri, field, seed, product, dri, p..."
3,111140,Wheat farming field and seed production Wheat...,wheat farming field and seed production wheat...,"[wheat, farming, field, seed, production, whea...","[wheat, farm, field, seed, product, wheat, far..."
4,111150,Corn farming except sweet corn field and se...,corn farming except sweet corn field and se...,"[corn, farming, except, sweet, corn, field, se...","[corn, farm, except, sweet, corn, field, seed,..."


In [79]:
# Persist the processed frame so later notebooks can load it instead of recomputing it.
processed_df.to_pickle(r'assets/processed_df.pkl')