In [1]:
import pandas as pd #deal with excel 
import gensim
from gensim import corpora,models



In [2]:
# Load the review data set from a CSV file; the path is relative to the
# notebook's working directory.
df = pd.read_csv('win_1k.csv')

In [3]:
# One review text per row, in the same order as `df`.
# pandas reads an empty cell as NaN (a float); iterating over that character
# by character in the cleaning step would raise a TypeError. Map missing
# values to '' and coerce to str so every entry is a string, without dropping
# rows (row i of every downstream matrix still corresponds to row i of `df`).
reviews = df['description'].fillna('').astype(str).tolist()

In [4]:
import string

# Translation table built once: every ASCII punctuation character is deleted
# (mapped to None) and each carriage return becomes a single space.
_CLEANUP_TABLE = str.maketrans(
    {**dict.fromkeys(string.punctuation), '\r': ' '}
)

# One pass per review with str.translate. This yields the same strings as
# calling str.replace once for every character of the review, without the
# repeated full-string scans.
new_reviews = [review.translate(_CLEANUP_TABLE) for review in reviews]


In [5]:
from collections import defaultdict

from nltk.corpus import stopwords

mystopwords = stopwords.words('english')
# Set copy used only for membership tests, so each lookup is O(1) instead of
# a scan over the whole stopword list; `mystopwords` itself stays a list.
stopword_set = set(mystopwords)

# Lowercase each cleaned review, split on single spaces, and keep tokens that
# are purely alphabetic and not stopwords. split(' ') produces '' for
# consecutive spaces; ''.isalpha() is False, so those are discarded here.
tokens_list = [
    [word for word in review.lower().split(' ')
     if word not in stopword_set and word.isalpha()]
    for review in new_reviews
]

# remove words that appear only once (counted across all reviews)
frequency = defaultdict(int)
for tokens in tokens_list:
    for token in tokens:
        frequency[token] += 1

tokens_list = [[token for token in tokens if frequency[token] > 1]
               for tokens in tokens_list]


In [6]:
# Build the gensim Dictionary (token <-> integer id mapping) from the
# filtered token lists, then print its summary line.
dictionary = corpora.Dictionary(tokens_list) 
print(dictionary)

Dictionary(1970 unique tokens: ['acidity', 'alongside', 'apple', 'aromas', 'brisk']...)


In [7]:
# (id, token) pairs in ascending id order. Ids are unique, so plain tuple
# ordering gives the same result as sorting on the id alone.
sort_token = sorted(dictionary.items())
# Keep only the token text, preserving that id order.
unique_token = [pair[1] for pair in sort_token]

In [8]:
# Bag-of-words corpus: one doc2bow result per tokenised review, in input order.
corpus = list(map(dictionary.doc2bow, tokens_list))

In [9]:
import numpy as np

# Dense integer matrix built from the bag-of-words corpus, one slot per
# dictionary entry; transposed so that each row is one document.
term_doc = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary), dtype='int')
matrix = term_doc.T

# Wrap the numpy matrix in a DataFrame whose columns are labelled by token.
matrix_df = pd.DataFrame(data=matrix, columns=unique_token)

In [10]:
# Write the document-term DataFrame to a CSV file in the working directory.
# NOTE(review): to_csv is called with default options, so the row index is
# presumably written as a leading column — confirm that is wanted.
matrix_df.to_csv('Wine_Term_Document_matrix.csv')

#### a. LDA model 

In [11]:
# Fit a 10-topic LDA model on the raw bag-of-words counts.
# LDA training is randomised; fixing random_state makes the topics shown
# below reproducible after a kernel restart instead of changing on every run.
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10, random_state=42)

# Show the top words of every fitted topic (V matrix, topic matrix). The
# count is read from the model so it always matches the num_topics used above.
lda.print_topics(lda.num_topics)

[(0,
  '0.020*"flavors" + 0.016*"palate" + 0.013*"wine" + 0.012*"notes" + 0.012*"finish" + 0.012*"aromas" + 0.012*"fruit" + 0.008*"white" + 0.008*"acidity" + 0.008*"rich"'),
 (1,
  '0.020*"wine" + 0.018*"flavors" + 0.016*"aromas" + 0.016*"fruit" + 0.015*"finish" + 0.012*"acidity" + 0.012*"palate" + 0.010*"drink" + 0.010*"sweet" + 0.008*"apple"'),
 (2,
  '0.024*"wine" + 0.023*"fruit" + 0.022*"flavors" + 0.018*"palate" + 0.015*"aromas" + 0.012*"acidity" + 0.011*"finish" + 0.009*"cherry" + 0.008*"white" + 0.008*"black"'),
 (3,
  '0.040*"wine" + 0.017*"flavors" + 0.016*"aromas" + 0.014*"acidity" + 0.014*"drink" + 0.014*"fruit" + 0.013*"tannins" + 0.013*"ripe" + 0.011*"finish" + 0.008*"palate"'),
 (4,
  '0.017*"wine" + 0.016*"finish" + 0.015*"fruit" + 0.014*"notes" + 0.014*"fresh" + 0.014*"palate" + 0.013*"flavors" + 0.012*"aromas" + 0.011*"acidity" + 0.011*"nose"'),
 (5,
  '0.024*"wine" + 0.019*"drink" + 0.016*"acidity" + 0.015*"palate" + 0.015*"ripe" + 0.014*"flavors" + 0.011*"fruit" + 0.

In [12]:
# Generate U Matrix for LDA model (per-document topic weights)
corpus_lda = lda[corpus]  # apply the fitted LDA model to every document

# Convert corpus_lda to a dense numpy matrix with one row per document.
# The number of columns is read from the model instead of being hard-coded,
# so it cannot drift from the num_topics the model was fitted with.
# NOTE(review): topics gensim omits for a document (low probability under its
# default threshold) presumably end up as 0 here — confirm that is intended.
U_matrix_lda = gensim.matutils.corpus2dense(corpus_lda, num_terms=lda.num_topics).T

# write U_matrix into pandas dataframe and output
U_matrix_lda_df = pd.DataFrame(U_matrix_lda)
U_matrix_lda_df.to_csv('U_matrix_lda.csv')

In [13]:
# Print the shape of the document-term matrix, then of the LDA U matrix.
for frame in (matrix_df, U_matrix_lda_df):
    print(frame.shape)

(1001, 1970)
(1001, 10)


#### b. LSI model

In [14]:
# Tfidf Transformation
# The model is fitted on the bag-of-words corpus and then applied to that
# same corpus; the weighted result is the input to the LSI model.
tfidf = models.TfidfModel(corpus) #fit tfidf model
corpus_tfidf = tfidf[corpus]      #transform tfidf model

In [15]:
# Fit a 10-topic LSI model on the TF-IDF-weighted corpus, using the same
# dictionary for the id -> word mapping.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)

# Display the fitted topics as signed, weighted word lists.
lsi.print_topics()

[(0,
  '0.164*"wine" + 0.148*"ripe" + 0.146*"drink" + 0.146*"acidity" + 0.142*"fruit" + 0.140*"flavors" + 0.136*"tannins" + 0.135*"black" + 0.132*"aromas" + 0.129*"palate"'),
 (1,
  '0.220*"black" + -0.194*"citrus" + -0.183*"apple" + -0.180*"crisp" + 0.179*"cabernet" + 0.178*"cherry" + 0.176*"tannins" + -0.167*"white" + -0.146*"fresh" + -0.143*"pear"'),
 (2,
  '-0.233*"fruits" + -0.207*"ready" + -0.199*"drink" + -0.198*"wine" + -0.174*"character" + -0.168*"rich" + -0.164*"ripe" + 0.161*"white" + 0.150*"palate" + 0.147*"aromas"'),
 (3,
  '-0.218*"white" + -0.162*"offers" + -0.151*"alongside" + -0.140*"bright" + 0.139*"flavors" + -0.137*"tannins" + -0.136*"flower" + -0.131*"note" + -0.123*"fruits" + 0.120*"oak"'),
 (4,
  '0.462*"cabernet" + 0.355*"sauvignon" + 0.270*"blend" + 0.258*"merlot" + 0.202*"franc" + -0.159*"black" + -0.127*"dark" + 0.109*"blanc" + 0.102*"lightly" + 0.100*"sangiovese"'),
 (5,
  '-0.206*"light" + -0.201*"red" + -0.182*"fruity" + -0.161*"raspberry" + -0.154*"berry"

In [16]:
# Generate U Matrix for LSI model (per-document topic weights)
corpus_lsi = lsi[corpus_tfidf]  # apply the fitted LSI model to the TF-IDF corpus

# Convert corpus_lsi to a dense numpy matrix with one row per document.
# The number of columns is read from the model instead of being hard-coded,
# so it cannot drift from the num_topics the model was fitted with.
U_matrix_lsi = gensim.matutils.corpus2dense(corpus_lsi, num_terms=lsi.num_topics).T

# write U_matrix into pandas dataframe and output
pd.DataFrame(U_matrix_lsi).to_csv('U_matrix_lsi.csv')