### 1. Data Collection

In [1]:
import json
import pandas as pd

In [2]:
# Digital Music data (64,706) downloaded from: http://jmcauley.ucsd.edu/data/amazon/ 
json_objs = []
with open("Digital_Music_5.json", "r") as f:
    for json_obj in f:
        json_objs.append(json.loads(json_obj))

In [3]:
df = pd.DataFrame(json_objs)
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A3EBHHCZO6V2A4,5555991584,"Amaranth ""music fan""","[3, 3]","It's hard to believe ""Memory of Trees"" came ou...",5.0,Enya's last great album,1158019200,"09 12, 2006"
1,AZPWAXJG9OJXV,5555991584,bethtexas,"[0, 0]","A clasically-styled and introverted album, Mem...",5.0,Enya at her most elegant,991526400,"06 3, 2001"
2,A38IRL0X2T4DPF,5555991584,bob turnley,"[2, 2]",I never thought Enya would reach the sublime h...,5.0,The best so far,1058140800,"07 14, 2003"
3,A22IK3I6U76GX0,5555991584,Calle,"[1, 1]",This is the third review of an irish album I w...,5.0,Ireland produces good music.,957312000,"05 3, 2000"
4,A1AISPOIIHTHXX,5555991584,"Cloud ""...""","[1, 1]","Enya, despite being a successful recording art...",4.0,4.5; music to dream to,1200528000,"01 17, 2008"


### 2. Data Preprocessing

In [4]:
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

In [6]:
stoplist = stopwords.words("english")
lemmatizer = WordNetLemmatizer()
# noun and adjectives
chosen_tags = ["NN", "NNS", "JJ"]

In [7]:
def clean(text):
    # remove punctuation
    nopunc_text = re.sub(r"[!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]", " ", text)
    # remove everything except alphabetic characters
    alpha_text = re.sub(r"[^A-Za-z]", " ", nopunc_text)
    # eliminate multiple spaces
    nomspace_text = re.sub(r"\s+", " ", alpha_text)
    return nomspace_text

In [8]:
def get_pos_tag(tag):
    if tag.startswith("N") or tag.startswith("J"):
        return wordnet.NOUN
    elif tag.startswith("V"):
        return wordnet.VERB
    elif tag.startswith("R"):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [9]:
def preprocess(text):
    # clean text
    cleaned_text = clean(text)
    lowered_text = cleaned_text.lower()
    
    # tokenization
    tokens = word_tokenize(lowered_text)
    
    # remove stopwords and single character words
    words = [token for token in tokens if token not in stoplist or len(token) > 1]
    
    # lemmatize words
    tagged_words = nltk.pos_tag(words)
    lemmatized_words = [lemmatizer.lemmatize(word, get_pos_tag(tag)) for word, tag in tagged_words if tag in chosen_tags]
    
    return " ".join(lemmatized_words)

In [10]:
df_clean = df["reviewText"].apply(preprocess)

In [11]:
df_clean.head()

0    hard memory tree year passage time last great ...
1    album memory tree subtlety many song shyness s...
2    sublime height evacuee marble hall shepherd mo...
3    third review irish album write today others cr...
4    successful artist doesn broad appeal other art...
Name: reviewText, dtype: object

### 3. Topic modeling using LSA

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
import pandas as pd

In [None]:
vectorizer = CountVectorizer()
bag_of_words = vectorizer.fit_transform(df_clean)

In [None]:
svd = TruncatedSVD(n_components = 10)
lsa = svd.fit_transform(bag_of_words)

In [None]:
lsa

In [None]:
topic_encoded_df = pd.DataFrame(lsa, columns = ["topic1", "topic2", "topic3", "topic4", "topic5", "topic6", "topic7", "topic8", "topic9", "topic10"])
topic_encoded_df["body"] = df_clean
display(topic_encoded_df[["body", "topic1", "topic2", "topic3", "topic4", "topic5", "topic6", "topic7", "topic8", "topic9", "topic10"]])

In [None]:
dictionary = vectorizer.get_feature_names()
dictionary

In [None]:
encoding_matrix = pd.DataFrame(svd.components_, index = ["topic1", "topic2", "topic3", "topic4", "topic5", "topic6", "topic7", "topic8", "topic9", "topic10"], columns = dictionary).T
encoding_matrix.head()

### 6. Visualization