# gensim - for Topic Modelling 

by Jenny Gong

Official tutorial: 
https://radimrehurek.com/gensim/tutorial.html


#### if you don't have gensim and/or pandas installed on your computer, run the follow line: 

In [1]:
#!pip install gensim 

In [2]:
#!pip install pandas

In [42]:
import pandas as pd #deal with excel 
import gensim
from gensim import corpora,models

## 1. Preprocessing 

In [43]:
# Read dataD
df = pd.read_csv('fashion.csv')
df

Unnamed: 0,year,season,brand,author of review,location,time,review text
0,2016,Spring,A Dtacher,Kristin Anderson,NEW YORK,"September 13, 2015",Detachment was the word of the day at A Dtache...
1,2016,Spring,A.F. Vandevorst,Luke Leitch,PARIS,"October 1, 2015",You heard this collection coming long before y...
2,2016,Spring,A.L.C.,Kristin Anderson,NEW YORK,"September 21, 2015",August saw the announcement of big news for A....
3,2016,Spring,A.P.C.,Nicole Phelps,PARIS,"October 3, 2015","They call me the king of basics, Jean Touitou ..."
4,2016,Spring,A.W.A.K.E.,Maya Singer,NEW YORK,"October 21, 2015",Natalia Alaverdian is a designer with a lot of...
5,2016,Spring,Ace & Jig,Kristin Anderson,NEW YORK,"October 13, 2015",Process has always been paramount to Ace & Jig...
6,2016,Spring,Acne Studios,Chioma Nnadi,PARIS,"October 3, 2015",A bohemian circle of muses have been in heavy ...
7,2016,Spring,Adam Lippes,Nicole Phelps,NEW YORK,"September 12, 2015",Walking into Adam Lippess Washington Square ap...
8,2016,Spring,Adam Selman,Lee Carter,NEW YORK,"September 10, 2015","Ever the adventurer, Adam Selman will gleefull..."
9,2016,Spring,ADEAM,Kristin Anderson,NEW YORK,"September 14, 2015",Hanako Maeda has been busy rediscovering her J...


In [44]:
#convert all review text into list format
reviews = df['review text'].tolist()
reviews

['Detachment was the word of the day at A Dtacher (yes, like the labels name, bien sr). Designer Mona Kowalska loves the high concept, and one imagines that today detachment included being unconcerned with the gaze of others. Kowalskas woman, both as she appears on the runway and the real world, dresses for herself. Her intensely arty bend, and taste for clothes that match it, make A Dtacher a cultishly beloved brand among certain shoppers. This season, Kowalska presented them with a lineup of relatively playful offerings.\rThe collection opened with a pair of midi dresses in an Indonesian-inspired floral print, which reemerged later imagined with allover Pop white polka dots. Elsewhere came cardigans in an uncanny kind of amoxicillin pink that you imagined the A Dtacher woman wearing with tongue firmly in cheek (they had Kawakubo-esque allover holes, to boot). The popcorn knits were pretty fun, too.\rThe choice to use hardier materials lent dresses eccentric volumes, but also led to a

In [45]:
#remove '\r' and punctuations 
import string

new_reviews = []
for review in reviews: 
    for ch in review:
        if ch in string.punctuation:
            review = review.replace(ch,'') #replace punctuation with nothing
        if ch == '\r':
            review = review.replace(ch,' ') #replace \r with space
    new_reviews.append(review) 

new_reviews

['Detachment was the word of the day at A Dtacher yes like the labels name bien sr Designer Mona Kowalska loves the high concept and one imagines that today detachment included being unconcerned with the gaze of others Kowalskas woman both as she appears on the runway and the real world dresses for herself Her intensely arty bend and taste for clothes that match it make A Dtacher a cultishly beloved brand among certain shoppers This season Kowalska presented them with a lineup of relatively playful offerings The collection opened with a pair of midi dresses in an Indonesianinspired floral print which reemerged later imagined with allover Pop white polka dots Elsewhere came cardigans in an uncanny kind of amoxicillin pink that you imagined the A Dtacher woman wearing with tongue firmly in cheek they had Kawakuboesque allover holes to boot The popcorn knits were pretty fun too The choice to use hardier materials lent dresses eccentric volumes but also led to a lineup that often felt frum

In [46]:
#remove stop words, to lowercase and tokenize
from nltk.corpus import stopwords
mystopwords = stopwords.words('english')

tokens_list = [[word for word in review.lower().split(' ') if word not in mystopwords and word.isalpha()]
         for review in new_reviews]

#remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)

for tokens in tokens_list:
    for token in tokens:
        frequency[token] += 1
        
tokens_list = [[token for token in tokens if frequency[token]>1]
              for tokens in tokens_list]

print(tokens_list)

[['detachment', 'word', 'day', 'dtacher', 'yes', 'like', 'labels', 'name', 'bien', 'sr', 'designer', 'mona', 'kowalska', 'loves', 'high', 'concept', 'one', 'imagines', 'today', 'detachment', 'included', 'gaze', 'others', 'kowalskas', 'woman', 'appears', 'runway', 'real', 'world', 'dresses', 'intensely', 'arty', 'taste', 'clothes', 'match', 'make', 'dtacher', 'cultishly', 'beloved', 'brand', 'among', 'certain', 'shoppers', 'season', 'kowalska', 'presented', 'lineup', 'relatively', 'playful', 'offerings', 'collection', 'opened', 'pair', 'midi', 'dresses', 'floral', 'print', 'later', 'imagined', 'allover', 'pop', 'white', 'polka', 'dots', 'elsewhere', 'came', 'cardigans', 'uncanny', 'kind', 'pink', 'imagined', 'dtacher', 'woman', 'wearing', 'tongue', 'firmly', 'cheek', 'allover', 'holes', 'boot', 'popcorn', 'knits', 'pretty', 'fun', 'choice', 'use', 'materials', 'lent', 'dresses', 'eccentric', 'volumes', 'also', 'led', 'lineup', 'often', 'felt', 'albeit', 'times', 'clothes', 'lacked', 'ex

## 2. Generate Term Document Matrix

In [47]:
# generate token dictionary class
dictionary = corpora.Dictionary(tokens_list) 
print(dictionary)

Dictionary(7493 unique tokens: [u'raining', u'pointytoed', u'yellow', u'four', u'hanging']...)


In [48]:
# generate a unique token list 
sort_token = sorted(dictionary.items(),key=lambda k:k[0], reverse = False)
unique_token = [token.encode('utf8') for (ID,token) in sort_token]

In [49]:
# build a corpus
corpus = [dictionary.doc2bow(tokens) for tokens in tokens_list]

print(corpus) 

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 2), (15, 1), (16, 1), (17, 2), (18, 1), (19, 1), (20, 1), (21, 2), (22, 1), (23, 3), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 2), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 2), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 4), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 2), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 3), (94, 1), (95, 1), (96, 1), (97, 2), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110, 2)

In [50]:
import numpy as np
matrix = gensim.matutils.corpus2dense(corpus,num_terms=len(dictionary),dtype = 'int')
matrix = matrix.T #transpose the matrix 

#convert the numpy matrix into pandas data frame
matrix_df = pd.DataFrame(matrix, columns=unique_token)

In [51]:
#write matrix dataframe into csv
matrix_df.to_csv('Term_Document_matrix.csv')

## 3. LDA, LSI model

#### a. LDA model 

In [52]:
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10) #fit lda model

lda.print_topics(10) #V matrix, topic matrix

[(0,
  u'0.007*"new" + 0.005*"collection" + 0.005*"like" + 0.005*"one" + 0.004*"dresses" + 0.004*"season" + 0.004*"show" + 0.004*"clothes" + 0.004*"dress" + 0.004*"silk"'),
 (1,
  u'0.008*"collection" + 0.005*"new" + 0.005*"dresses" + 0.004*"one" + 0.004*"fashion" + 0.004*"show" + 0.004*"dress" + 0.003*"also" + 0.003*"said" + 0.003*"like"'),
 (2,
  u'0.007*"collection" + 0.006*"one" + 0.006*"dresses" + 0.005*"new" + 0.005*"said" + 0.004*"season" + 0.004*"spring" + 0.004*"like" + 0.004*"also" + 0.004*"came"'),
 (3,
  u'0.009*"collection" + 0.006*"like" + 0.006*"new" + 0.005*"designer" + 0.005*"show" + 0.005*"dresses" + 0.004*"one" + 0.004*"way" + 0.004*"spring" + 0.003*"fashion"'),
 (4,
  u'0.008*"collection" + 0.007*"one" + 0.006*"new" + 0.005*"dresses" + 0.005*"show" + 0.005*"like" + 0.004*"clothes" + 0.004*"spring" + 0.004*"dress" + 0.004*"designers"'),
 (5,
  u'0.006*"collection" + 0.004*"one" + 0.004*"silk" + 0.004*"dress" + 0.004*"spring" + 0.004*"new" + 0.004*"clothes" + 0.004*"s

In [55]:
# Generate U Matrix for LDA model
corpus_lda = lda[corpus] #transform lda model

#convert corpus_lda to numpy matrix
U_matrix_lda = gensim.matutils.corpus2dense(corpus_lda,num_terms=10).T

#write U_matrix into pandas dataframe and output
U_matrix_lda_df = pd.DataFrame(U_matrix_lda)
U_matrix_lda_df.to_csv('U_matrix_lda.csv')

In [56]:
print matrix_df.shape
print U_matrix_lda_df.shape

(434, 7493)
(434, 10)


We decrease features from 7493 to 10.

#### b.LSI model 

In [53]:
# Tfidf Transformation 
tfidf = models.TfidfModel(corpus) #fit tfidf model
corpus_tfidf = tfidf[corpus]      #transform tfidf model

In [54]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)

lsi.print_topics()

[(0,
  u'0.073*"show" + 0.072*"new" + 0.069*"white" + 0.068*"silk" + 0.067*"dresses" + 0.067*"dress" + 0.067*"one" + 0.066*"season" + 0.065*"black" + 0.065*"like"'),
 (1,
  u'-0.098*"show" + -0.097*"models" + -0.091*"fashion" + -0.087*"wang" + -0.078*"versace" + 0.072*"particularly" + -0.071*"west" + -0.071*"shows" + -0.069*"armani" + -0.061*"things"'),
 (2,
  u'-0.102*"brand" + -0.088*"denim" + 0.088*"blue" + 0.084*"yellow" + -0.084*"jeans" + 0.083*"fabric" + 0.081*"white" + -0.077*"seasons" + 0.076*"red" + 0.074*"black"'),
 (3,
  u'0.138*"lee" + 0.094*"chow" + 0.092*"dkny" + 0.092*"osborne" + -0.074*"johnson" + 0.072*"collections" + 0.071*"looks" + 0.068*"tone" + 0.066*"fabric" + 0.065*"biker"'),
 (4,
  u'0.103*"leather" + -0.099*"johnson" + 0.071*"biker" + -0.067*"wang" + -0.064*"gown" + -0.061*"looks" + 0.061*"black" + 0.060*"shirts" + 0.058*"chanel" + 0.058*"handbags"'),
 (5,
  u'0.078*"armani" + -0.075*"wearable" + -0.073*"anderson" + -0.073*"want" + -0.067*"really" + -0.061*"thi

In [57]:
# Generate U Matrix for LSI model
corpus_lsi = lsi[corpus_tfidf] #transform lda model

#convert corpus_lda to numpy matrix
U_matrix_lsi = gensim.matutils.corpus2dense(corpus_lsi,num_terms=10).T

#write U_matrix into pandas dataframe and output
pd.DataFrame(U_matrix_lsi).to_csv('U_matrix_lsi.csv')