In [7]:
import pandas as pd
import string
from nltk.stem import WordNetLemmatizer
from colour import Color

In [2]:
# Read the product-name CSV. header=0 combined with names= replaces the
# file's own header row with the label 'raw'.
data = pd.read_csv(
    '../data/ecommerce_product_names.csv',
    header=0,
    names=['raw'],
)

In [3]:
# Preview the first rows to confirm the load produced the 'raw' column
data.head()

Unnamed: 0,raw
0,Alisha Solid Women's Cycling Shorts
1,FabHomeDecor Fabric Double Sofa Bed
2,AW Bellies
3,Sicons All Purpose Arnica Dog Shampoo
4,Eternal Gandhi Super Series Crystal Paper Weig...


In [4]:
# Text-processing helpers

# Lookup table of audience terms that get_gender searches for. Entries are
# lowercase singular words because they are compared against whole tokens
# of the cleaned, lemmatized text.
GENDERS = [
    'boy', 'girl', 'kid', 'baby', 'infant', 'child',
    'dog', 'cat', 'man', 'woman', 'pet',
]


def clean_raw_text(raw_text):
    """
    Clean Raw Text

    Deletes every character listed in string.punctuation and then
    lowercases what is left. Whitespace is not modified.


    args:
        raw_text: string of one/multiple tokens to be cleaned

    returns:
        clean_text: raw_text without punctuation, in lowercase

    """

    # Mapping each punctuation character to None makes translate() drop it
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    without_punctuation = raw_text.translate(drop_punctuation)
    clean_text = without_punctuation.lower()

    return clean_text


def get_lemmas(raw_text):
    """
    Get Lemmas

    Lemmatizes a string word by word with nltk's WordNetLemmatizer.
    The lemmatizer handles a single word per call, so the text is split
    on whitespace, each piece is lemmatized, and the results are joined
    back together with single spaces.

    args:
        raw_text: string of one/multiple tokens to be lemmatized

    returns:
        lemmas: string holding the lemmatized words of raw_text

    """

    lemmatizer = WordNetLemmatizer()

    lemmatized_words = []
    for word in raw_text.split():
        lemmatized_words.append(lemmatizer.lemmatize(word))

    lemmas = ' '.join(lemmatized_words)

    return lemmas


def check_if_color(token):
    """
    Check if Color

    Helper function to detect color for a given word. A token counts as
    a color when the colour library's Color() constructor accepts it.

    args:
        token: an unknown word (string)

    returns:
        True if token is a color, else False (including for an empty
        or None token)

    """

    # NOTE(review): Color() appears to fall back to a default color when
    # given a falsy value, which would report '' as a color - confirm
    # against the installed colour version. Reject such tokens up front.
    if not token:
        return False

    try:
        Color(token)
    except (ValueError, AttributeError):
        # ValueError: unrecognized color name.
        # AttributeError: presumably raised by colour for malformed
        # '#'-prefixed hex strings (and for non-string tokens) - verify.
        return False

    return True
    
    
def get_colors(raw_text):
    """
    Get Colors

    Calls check_if_color for every distinct token in the string passed
    to it. Duplicates (white white) from the same description are
    reported once, and colors keep the order of their first appearance
    so that repeated runs produce identical output.

    args:
        raw_text: string to check for colors

    returns:
        colors: list of the color tokens found in raw_text
                (empty list when there are none)

    """

    # dict.fromkeys de-duplicates while preserving insertion order; a set
    # would yield an arbitrary order that can differ from run to run.
    unique_words = dict.fromkeys(raw_text.split())
    colors = [word for word in unique_words if check_if_color(word)]

    return colors


def get_gender(raw_text, genders_list):
    """
    Get Gender

    Uses pre-defined lookup table to find gender. Only whole
    whitespace-separated tokens are matched. Duplicates (woman woman)
    from the same description are reported once, and matches keep the
    order of their first appearance so that repeated runs produce
    identical output.

    args:
        raw_text: string to check for gender
        genders_list: list of genders to search for

    returns:
        genders: list of genders found in raw_text
                 (empty list when there are none)

    """

    # dict.fromkeys de-duplicates while preserving insertion order; a set
    # would yield an arbitrary order that can differ from run to run.
    unique_words = dict.fromkeys(raw_text.split())
    genders = [word for word in unique_words if word in genders_list]

    return genders
    
    
    

In [5]:
# Build each column from the previous stage (raw -> clean -> lemmas), then
# tag colors and genders on the lemmatized text.
data['clean'] = data['raw'].apply(clean_raw_text)
data['lemmas'] = data['clean'].apply(get_lemmas)
data['color'] = data['lemmas'].apply(get_colors)
data['gender'] = data['lemmas'].apply(get_gender, args=(GENDERS,))


In [6]:
# Persist the enriched frame. The 'color' and 'gender' cells hold Python
# lists, which end up in the CSV as their text representation.
data.to_csv('../data/processed-data.csv')

# Keyword Extraction

In [10]:
import yake

In [1]:
def get_keywords(raw_text, max_ngram_size=2, numOfKeywords=3):
    """
    Get Keywords

    Uses yake's keyword extractor to return keywords from raw text

    args:
        raw_text: string of text to extract keywords from
        max_ngram_size: int for max ngram wanted (default 2)
        numOfKeywords: int for how many keywords/ngrams you want to
                       return (default 3)

    returns:
        kws: string of the keywords found in the text, each one followed
             by a single space (so a non-empty result ends with a
             space), listed in the reverse of the order in which yake
             returned them; '' when no keywords are found

    """

    language = "en"
    deduplication_threshold = 0.9
    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        top=numOfKeywords,
        features=None,
    )
    keywords = custom_kw_extractor.extract_keywords(raw_text)

    # Element 0 of each result is the keyword text. Walking the results
    # back to front keeps the ordering callers have always received.
    kws = ''.join(keyword[0] + ' ' for keyword in reversed(keywords))

    return kws

In [12]:
# Extract keywords from each lemmatized name: n-grams of at most 2 words,
# at most 3 keywords per row.
data['keywords'] = data['lemmas'].apply(get_keywords, args=(2, 3))

In [14]:
# Save the frame again, now including the 'keywords' column, to a separate file
data.to_csv('../data/processed-data-with-keywords.csv')