In [0]:
import numpy as np
import os.path
import pandas as pd 
import nltk
# nltk.download('all')

In [84]:
#loading the data 
df = pd.read_csv('amazon_reviews.csv')
df[:10]

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
5,6,B006K2ZZ7K,ADT0SRK1MGOEU,Twoapennything,0,0,4,1342051200,Nice Taffy,I got a wild hair for taffy and ordered this f...
6,7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,5,1340150400,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...
7,8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,5,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...
8,9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,5,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my...
9,10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,5,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for thei...


In [85]:
#Lets reduce the dataset: only use text 
df_text = df['Text']
df_text.shape 
# type(df_text)#pd series 
# type(df_text [0]) #str


#too large dataset reduce dataset to 20000
df_text = df_text[:20000]
df_text.shape

(20000,)

In [0]:
#preprocess the data 

from nltk import word_tokenize
from nltk.corpus import stopwords
import re 

def preprocess_data(df):
  """
  input:: df pd series 
  output:: nested list 
  """
  #1 tockenize the string words 
  #2 loop through stop words -delete stop words
  #3 append to a new list

  tokenlist =[]
  stop = set(stopwords.words('english'))
  for row,line in enumerate(df): 
    line = line.lower()
    #delete non-words 
    line = re.sub(pattern = '[\W]',repl=' ',string = line )
    # print(line)
    #delete one letter length such as c, w ... 
    line  = re.sub(r"\b[a-zA-Z]\b", "", string = line )
    # print(line)
    line = re.sub(r"\bamazon\b", "", string = line )
    # print(line)
    line = re.sub(r"\bproduct\b", "", string = line )

    #tokenize the string words
    tokens = nltk.word_tokenize(line) 
    #loop through stop words
    wordlist=[] 
    for word in tokens:
      if word not in stop:
        #not allows duplicates 
        if word not in wordlist:
          wordlist.append(word)
      
    tokenlist.append(wordlist)

  
      
  return tokenlist
  
# df_simple = df_text[:10]

# t = preprocess_data(df_simple)


In [0]:
#vectorizing the dataset 

from sklearn.feature_extraction.text import TfidfVectorizer

#since we have tokenized the data above 
def tok(text):
  return text

def vectorize(df):
  """
  input:: nested list of words (preprocessed data)
  output:: dic of 1. vectorized words type of numpy array, 2. features of the vectorizer 
  """
  vectorizer = TfidfVectorizer(lowercase=False, tokenizer=tok,max_features= 1000) #cut to top 1000 words 
  vector_data = vectorizer.fit_transform(df)
  array_vector = vector_data.toarray()
  #array_vector.shape  
  #(num_doc , num words)
  array_vector = array_vector #transpose as the SVD ppt 

  features = vectorizer.get_feature_names()
  vectorized ={}
  vectorized['vector'] = array_vector

  vectorized['feature'] = features
  
  return vectorized 

# vec = vectorize(t)['vector']
# feat = vectorize(t)['feature']

# v = vectorize(t)
# vec.shape #shape(10:documents,171:term(words))  

In [0]:
#Topic Modeling with SVD 
from sklearn.decomposition import TruncatedSVD


def modeling(np,num):
  """
  input:: numpy array of (doc,word) , num(int): n_componenets dictionary 
  output:: dictionary 1) np array (topic,doc) 2) features 

  """
  #don't know how many topics are inside the dataset so randomly choosed 20
  vect = np['vector'] #assign matrix 
  
  tsvd_model = TruncatedSVD(n_components=num, algorithm='randomized', n_iter=100, random_state=122)
  tsvd_model.fit(vect) #fit the vectorized tfidf array
  vt = tsvd_model.components_
  # print(vt.shape) 
  # print(vt)

  modeled ={}
  modeled['tsvd'] = vt
  modeled['feature'] = np['feature']

  return modeled

# a = modeling(v,5)['tsvd'] 
# a.shape #shape is (5,171) since we set the topic num to 5 , words = 171
# new =modeling(v,5)

In [0]:
#infer the topic by the top rated words 
len(feat) #list of words length of 171 

def infer(svd_dic):
  """
  input:: svd_dic (1) tsvd : vectorized matrix , (2) feature : features list 
  output::  NONE print out form 
  """
  tsvd = svd_dic['tsvd']
  feature =svd_dic['feature']

  final_dic ={}
  for top_num, row in enumerate(tsvd): #for each row of tsvd 
    
    sort = sorted(row, reverse=True) #sort row of tsvd 
    
    sort_index = np.argsort(row) #get index number of the sorted row 
    top_ten_value = sort[:10] #cut the top 10 of each value 
    top_ten_index = sort_index[:10]
    
    
    #get the word for the index from the feature list 
    wordlist=[]
    for index in top_ten_index : 
      wordlist.append(feature[index])

    
    #top_ten_value, wordlist for row 

    for i in range(10):

      print('Topic: {}, Word: {} , Value: {}'.format(top_num+1, wordlist[i],top_ten_value[i].round(6)))

    

# infer(new)



In [90]:
#USE DATASET FOR TOPIC MODELING 

df_clean = preprocess_data(df_text)
clean_vect_dic = vectorize(df_clean)

model = modeling(clean_vect_dic,5) #5 topics 
infer(model)

Topic: 1, Word: nutritional , Value: 0.174041
Topic: 1, Word: main , Value: 0.168937
Topic: 1, Word: seconds , Value: 0.160701
Topic: 1, Word: fridge , Value: 0.148957
Topic: 1, Word: outside , Value: 0.143802
Topic: 1, Word: form , Value: 0.140243
Topic: 1, Word: bits , Value: 0.133185
Topic: 1, Word: total , Value: 0.132884
Topic: 1, Word: level , Value: 0.123829
Topic: 1, Word: air , Value: 0.121314
Topic: 2, Word: dog , Value: 0.485529
Topic: 2, Word: food , Value: 0.274918
Topic: 2, Word: treats , Value: 0.168444
Topic: 2, Word: eat , Value: 0.168167
Topic: 2, Word: dogs , Value: 0.160797
Topic: 2, Word: loves , Value: 0.141553
Topic: 2, Word: healthy , Value: 0.135855
Topic: 2, Word: old , Value: 0.131267
Topic: 2, Word: treat , Value: 0.130914
Topic: 2, Word: free , Value: 0.128599
Topic: 3, Word: coffee , Value: 0.221891
Topic: 3, Word: dog , Value: 0.185084
Topic: 3, Word: loves , Value: 0.181509
Topic: 3, Word: price , Value: 0.163735
Topic: 3, Word: treats , Value: 0.159846




1. Topic1: nutritional product?
2. Topic2: a treat that a dog likes
3. Topic3: coffee related, good reviews 
4. Topic4: drink? 
5. Topic5: grocery related?


In [91]:
model2 = modeling(clean_vect_dic,10) #try 10 topics 
infer(model2)

Topic: 1, Word: nutritional , Value: 0.174041
Topic: 1, Word: main , Value: 0.168937
Topic: 1, Word: seconds , Value: 0.160701
Topic: 1, Word: fridge , Value: 0.148957
Topic: 1, Word: outside , Value: 0.143802
Topic: 1, Word: form , Value: 0.140243
Topic: 1, Word: bits , Value: 0.133185
Topic: 1, Word: total , Value: 0.132884
Topic: 1, Word: level , Value: 0.123829
Topic: 1, Word: air , Value: 0.121314
Topic: 2, Word: dog , Value: 0.485529
Topic: 2, Word: food , Value: 0.274918
Topic: 2, Word: treats , Value: 0.168444
Topic: 2, Word: eat , Value: 0.168167
Topic: 2, Word: dogs , Value: 0.160797
Topic: 2, Word: loves , Value: 0.141553
Topic: 2, Word: healthy , Value: 0.135855
Topic: 2, Word: old , Value: 0.131267
Topic: 2, Word: treat , Value: 0.130914
Topic: 2, Word: free , Value: 0.128599
Topic: 3, Word: coffee , Value: 0.221891
Topic: 3, Word: dog , Value: 0.185084
Topic: 3, Word: loves , Value: 0.181509
Topic: 3, Word: price , Value: 0.163735
Topic: 3, Word: treats , Value: 0.159846


Topic10: looks like chips 
 