In [8]:
import pandas as pd
import string
from nltk.stem import WordNetLemmatizer
from gensim import models, corpora
import pyLDAvis.gensim
import gensim
from nltk.corpus import stopwords

In [9]:
# Load the processed dataset from a path relative to the notebook's working directory.
# NOTE(review): the displayed frame shows 'Unnamed: 0' and 'Unnamed: 0.1' columns, which look
# like index columns written out by earlier to_csv calls - confirm whether they can be dropped.
data = pd.read_csv('../data/processed-data.csv')

In [10]:
# Bare last expression: renders the loaded frame as the cell output.
data

Unnamed: 0.1,Unnamed: 0,raw,clean,lemmas,color,gender
0,0,Alisha Solid Women's Cycling Shorts,alisha solid womens cycling shorts,alisha solid woman cycling short,[],['woman']
1,1,FabHomeDecor Fabric Double Sofa Bed,fabhomedecor fabric double sofa bed,fabhomedecor fabric double sofa bed,[],[]
2,2,AW Bellies,aw bellies,aw belly,[],[]
3,3,Sicons All Purpose Arnica Dog Shampoo,sicons all purpose arnica dog shampoo,sicons all purpose arnica dog shampoo,[],['dog']
4,4,Eternal Gandhi Super Series Crystal Paper Weig...,eternal gandhi super series crystal paper weig...,eternal gandhi super series crystal paper weig...,['silver'],[]
...,...,...,...,...,...,...
12618,12618,Purple Women Heels,purple women heels,purple woman heel,['purple'],['woman']
12619,12619,Uberlyfe Large Vinyl Sticker,uberlyfe large vinyl sticker,uberlyfe large vinyl sticker,[],[]
12620,12620,We Witches Comfy Hues Women Wedges,we witches comfy hues women wedges,we witch comfy hue woman wedge,[],['woman']
12621,12621,Stylistry Women Heels,stylistry women heels,stylistry woman heel,[],['woman']


In [11]:

def remove_stopwords(raw_text, stop_words):
    """
    Remove Stopwords

    Splits ``raw_text`` on whitespace, drops every token that appears in
    ``stop_words`` and re-joins the remaining tokens with single spaces.

    args:
        raw_text: string to be checked for stop words; any non-string value
            (e.g. None, or the float NaN that stands in for a missing cell)
            is treated as empty text
        stop_words: collection of stop words to be removed

    returns:
        clean_text: string with stop words removed ('' when nothing is left)

    """
    # Missing values are not strings and have no .split(); map them to empty
    # text instead of raising AttributeError in the middle of an .apply().
    if not isinstance(raw_text, str):
        return ''

    # A list/tuple is scanned linearly for every token; convert it once so each
    # membership test is a hash lookup. Other containers are used as given.
    if isinstance(stop_words, (list, tuple)):
        stop_words = set(stop_words)

    clean_text = ' '.join(word for word in raw_text.split() if word not in stop_words)

    return clean_text




def prep_data_for_tm(data, column):
    """
    Prep Data for Topic Model

    Creates the needed items for an LDA topic model

    args:
        data: dataframe with the raw text
        column: column name indicating which column has the text to prep

    returns:
        corpus: the corpus for LDA (one bag-of-words per row of data, in row order)
        dictionary: AKA id2word, the dictionary for LDA
        docs: the documents for LDA (one list of whitespace-split tokens per row)

    """
    # Walk the column's values directly instead of looking up labels 0..n-1 with
    # data.loc[i, column]: those labels only exist on a default index, so a
    # filtered or re-indexed frame would raise KeyError or pick the wrong rows.
    # Non-string cells (e.g. NaN) become empty documents so that docs and corpus
    # stay aligned one-to-one with the rows of data.
    docs = [text.split() if isinstance(text, str) else [] for text in data[column]]

    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return corpus, dictionary, docs



In [12]:
# NLTK's English stop-word list followed by a few extra words to strip from the text.
STOP_WORDS = [*stopwords.words('english'), 'from', 'set', 'use']

In [13]:
# New 'tm_text' column: each row's lemmas with the stop words removed.
data['tm_text'] = data['lemmas'].apply(remove_stopwords, args=(STOP_WORDS,))

In [14]:
# Build the corpus, dictionary and tokenised documents from the stop-word-free text.
corpus, dictionary, docs = prep_data_for_tm(data=data, column='tm_text')

In [15]:
# Fit a 10-topic LDA model on the bag-of-words corpus built above.
lda_model = models.LdaModel(
    # Input data and model size
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    # Training schedule
    chunksize=200,
    passes=5,
    update_every=1,
    iterations=50,
    eval_every=10,
    distributed=False,
    ns_conf=None,
    callbacks=None,
    # Priors and update weighting
    alpha='auto',
    eta=None,
    decay=0.5,
    offset=1.0,
    # Thresholds and per-word output
    gamma_threshold=0.001,
    minimum_probability=0.01,
    minimum_phi_value=0.01,
    per_word_topics=False,
    # Fixed seed
    random_state=2,
)

In [16]:
# Prepare the pyLDAvis data for the fitted model, then render it as the cell output.
lda_visualization = pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    dictionary,
    sort_topics=False,
    n_jobs=1,
    mds='mmds',
)
pyLDAvis.display(lda_visualization)