# Topic Modeling

In [108]:
import pandas as pd
import numpy as np

In [2]:
# 
df = pd.read_csv('final_df.csv')

In [3]:
df.isna().any()
df.dropna(inplace=True)

In [4]:
df['reviews'] = df['reviews'].replace('\d+?', ' ', regex=True)

In [6]:
cleaned_reviews =  df.copy(deep=True)['reviews']

In [7]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer

# LSA

In [111]:
# vectorize the data and get check what the top words
tf_idf_vectorize = TfidfVectorizer(max_df=0.5, max_features=10000,
                                  min_df=2, stop_words='english',
                                  use_idf=True)

tf_idf = tf_idf_vectorize.fit_transform(cleaned_reviews)

TypeError: __init__() got an unexpected keyword argument 'n_grams'

In [10]:
# recommended components for LSA
SVD = TruncatedSVD(n_components=100, random_state=1234)

SVD_matrix = SVD.fit_transform(tf_idf)

LSA = Normalizer(copy=False).fit_transform(SVD_matrix)

In [13]:
# 100 rows - these are the features created after applying SVD which reduces the dimensionality of our feature space
# but maintains
LSA_df = pd.DataFrame(SVD.components_, columns=tf_idf_vectorize.get_feature_names())

In [None]:
for compNum in range(0, 10):

    comp = SVD.components_[compNum]
    
    # Sort the weights in the first component, and get the indeces
    indeces = numpy.argsort(comp).tolist()
    
    # Reverse the indeces, so we have the largest weights first.
    indeces.reverse()
    
    # Grab the top 10 terms which have the highest weight in this component.        
    terms = [features[weightIndex] for weightIndex in indeces[0:10]]    
    weights = [comp[weightIndex] for weightIndex in indeces[0:10]]    
   
    # Display these terms and their weights as a horizontal bar graph.    
    # The horizontal bar graph displays the first item on the bottom; reverse
    # the order of the terms so the biggest one is on top.
    terms.reverse()
    weights.reverse()
    positions = arange(10) + .5    # the bar centers on the y axis
    
    figure(compNum)
    barh(positions, weights, align='center')
    yticks(positions, terms)
    xlabel('Weight')
    title('Strongest terms for component %d' % (compNum))
    grid(True)
    show()

# LDA

In [None]:
def print_topics(model, features, n):
    ''' This model prints and saves the 'N' most important words from every topic '''
    # make sure the features is in a numpy array to use .argsort
    if type(features) == list:
        features = np.array(features)
    
    # save the n most important words for each topic
    components = model.components_ 
    top_n = [features[component.argsort()][-n-1:] for component in components]
    
    # print the top words for every each topic
    for i in range(len(top_n)):
        print(f"Topic {i+1} most important words: {top_n[i]}")
    return top_n

In [109]:
# extract feature name and run LDA model
features = tf_idf_vectorize.get_feature_names()
LDA = LatentDirichletAllocation(n_components=10, random_state=1234,
                                n_jobs=-1, verbose=1).fit(tf_idf)

LDA_topics = print_topics(LDA, features , 10)

Topic 1 most important words: ['internet' 'signal' 'connection' 'support' 'time' 'unit' 'network'
 'device' 'work' 'wireless' 'router']
Topic 2 most important words: ['money' 'worked' 'bought' 'time' 'buy' 'item' 'month' 'amazon' 'work'
 'battery' 'product']
Topic 3 most important words: ['board' 'ram' 'motherboard' 'cpu' 'drive' 'power' 'laptop' 'cooler'
 'card' 'case' 'fan']
Topic 4 most important words: ['use' 'tablet' 'key' 'like' 'fit' 'screen' 'ipad' 'cover' 'mouse'
 'keyboard' 'case']
Topic 5 most important words: ['use' 'power' 'mount' 'usb' 'cord' 'plug' 'charger' 'work' 'charge'
 'battery' 'cable']
Topic 6 most important words: ['noise' 'pair' 'volume' 'headset' 'good' 'quality' 'bass' 'speaker' 'ear'
 'headphone' 'sound']
Topic 7 most important words: ['device' 'unit' 'play' 'mp' 'use' 'work' 'sound' 'music' 'player' 'ipod'
 'radio']
Topic 8 most important words: ['picture' 'channel' 'player' 'dvd' 'antenna' 'work' 'monitor' 'remote'
 'hdmi' 'cable' 'tv']
Topic 9 most import

In [None]:
# tfidf bigrams
tf_idf_n2_vec = TfidfVectorizer(max_df=0.5, max_features=8000,
                                min_df=2, stop_words='english',
                                use_idf=True, ngram_range=(1, 2))
# extract features name 
features_n2 = tf_idf_n2_vec.get_feature_names()

# fit transform
tf_idf_n2 = tf_idf_n2_vec.fit_transform(cleaned_reviews)

In [116]:
LDA_n2 = LatentDirichletAllocation(n_components=10, random_state=1234,
                                   n_jobs=-1, verbose=1).fit(tf_idf_n2)

LDA_n2_topics = print_topics(LDA_n2, features_n2 , 10)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
Topic 1 most important words: ['good' 'nice' 'small' 'little' 'strap' 'like' 'laptop' 'fan' 'fit' 'bag'
 'case']
Topic 2 most important words: ['internet' 'support' 'wifi' 'modem' 'device' 'signal' 'connection' 'work'
 'network' 'wireless' 'router']
Topic 3 most important words: ['software' 'memory' 'file' 'hard drive' 'work' 'window' 'gb' 'computer'
 'usb' 'card' 'drive']
Topic 4 most important words: ['use' 'like' 'quality' 'headset' 'key' 'good' 'keyboard' 'ear' 'sound'
 'headphone' 'mouse']
Topic 5 most important words: ['audio' 'player' 'cable' 'hdmi' 'good' 'monitor' 'quality' 'great' 'tv'
 'sound' 'speaker']
Topic 6 most important words: ['gps' 'channel' 'time' 'antenna' 'use' 'button' 'work' 

[array(['good', 'nice', 'small', 'little', 'strap', 'like', 'laptop',
        'fan', 'fit', 'bag', 'case'], dtype='<U25'),
 array(['internet', 'support', 'wifi', 'modem', 'device', 'signal',
        'connection', 'work', 'network', 'wireless', 'router'],
       dtype='<U25'),
 array(['software', 'memory', 'file', 'hard drive', 'work', 'window', 'gb',
        'computer', 'usb', 'card', 'drive'], dtype='<U25'),
 array(['use', 'like', 'quality', 'headset', 'key', 'good', 'keyboard',
        'ear', 'sound', 'headphone', 'mouse'], dtype='<U25'),
 array(['audio', 'player', 'cable', 'hdmi', 'good', 'monitor', 'quality',
        'great', 'tv', 'sound', 'speaker'], dtype='<U25'),
 array(['gps', 'channel', 'time', 'antenna', 'use', 'button', 'work',
        'device', 'unit', 'tv', 'remote'], dtype='<U25'),
 array(['quality', 'flash', 'light', 'use', 'good', 'protector', 'picture',
        'canon', 'screen', 'lens', 'camera'], dtype='<U25'),
 array(['fit', 'great', 'work', 'kindle', 'charger', 't

# NMF

In [131]:
# NMF no grams model
nmf = NMF(n_components=10, random_state=123, alpha=.1, l1_ratio=.5).fit(tf_idf)
nmf_topics = print_topics(nmf, features , 10)

Topic 1 most important words: ['set' 'use' 'player' 'problem' 'remote' 'screen' 'time' 'router' 'device'
 'unit' 'tv']
Topic 2 most important words: ['zoom' 'bag' 'video' 'photo' 'image' 'quality' 'mm' 'canon' 'picture'
 'lens' 'camera']
Topic 3 most important words: ['better' 'price' 'volume' 'music' 'bass' 'good' 'quality' 'ear' 'speaker'
 'headphone' 'sound']
Topic 4 most important words: ['stand' 'hold' 'nice' 'like' 'screen' 'tablet' 'kindle' 'fit' 'cover'
 'ipad' 'case']
Topic 5 most important words: ['time' 'phone' 'power' 'original' 'hour' 'charged' 'charging' 'life'
 'charger' 'charge' 'battery']
Topic 6 most important words: ['connection' 'audio' 'end' 'port' 'plug' 'quality' 'tv' 'connector' 'usb'
 'hdmi' 'cable']
Topic 7 most important words: ['port' 'seagate' 'backup' 'data' 'gb' 'external' 'file' 'computer' 'hard'
 'usb' 'drive']
Topic 8 most important words: ['wireless' 'laptop' 'like' 'feel' 'usb' 'use' 'button' 'logitech' 'key'
 'keyboard' 'mouse']
Topic 9 most importa

In [132]:
# NMF bigrams model
nmf_n2 = NMF(n_components=10, random_state=123, alpha=.1, l1_ratio=.5).fit(tf_idf_n2)
nmf2_topics = print_topics(nmf_n2, features_n2 , 10)

Topic 1 most important words: ['remote' 'great' 'problem' 'use' 'router' 'time' 'device' 'unit' 'tv'
 'product' 'work']
Topic 2 most important words: ['shoot' 'zoom' 'shot' 'image' 'quality' 'flash' 'photo' 'canon' 'video'
 'picture' 'camera']
Topic 3 most important words: ['price' 'volume' 'music' 'sound quality' 'bass' 'good' 'quality' 'ear'
 'speaker' 'headphone' 'sound']
Topic 4 most important words: ['hold' 'stand' 'protector' 'like' 'tablet' 'screen' 'kindle' 'fit'
 'cover' 'ipad' 'case']
Topic 5 most important words: ['seagate' 'backup' 'data' 'external' 'computer' 'gb' 'file' 'hard' 'usb'
 'hard drive' 'drive']
Topic 6 most important words: ['quality' 'port' 'plug' 'connector' 'work' 'tv' 'cable work' 'hdmi cable'
 'usb' 'hdmi' 'cable']
Topic 7 most important words: ['phone' 'original' 'power' 'hour' 'charged' 'battery life' 'charging'
 'life' 'charger' 'charge' 'battery']
Topic 8 most important words: ['hand' 'laptop' 'like' 'feel' 'usb' 'use' 'button' 'logitech' 'key'
 'keybo

# Classify the reviews into different topics

In [143]:
import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.tokenize import word_tokenize 

tokened_reviews = [word_tokenize(i) for i in cleaned_reviews]

In [144]:
dictionary = gensim.corpora.Dictionary(tokened_reviews)

In [145]:
dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=10000)

In [148]:
bow_corpus = [dictionary.doc2bow(doc) for doc in tokened_reviews]

In [149]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [150]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.022*"camera" + 0.012*"lens" + 0.009*"battery" + 0.007*"canon" + 0.006*"picture" + 0.006*"d" + 0.005*"nikon" + 0.005*"video" + 0.005*"photo" + 0.005*"image"
Topic: 1 Word: 0.021*"drive" + 0.013*"card" + 0.010*"usb" + 0.008*"gb" + 0.006*"window" + 0.006*"file" + 0.005*"speed" + 0.005*"computer" + 0.005*"ssd" + 0.005*"hard"
Topic: 2 Word: 0.018*"sound" + 0.013*"headphone" + 0.013*"speaker" + 0.011*"ear" + 0.007*"bass" + 0.006*"quality" + 0.006*"headset" + 0.006*"volume" + 0.005*"good" + 0.005*"music"
Topic: 3 Word: 0.017*"protector" + 0.015*"screen" + 0.011*"bubble" + 0.008*"fan" + 0.005*"dust" + 0.004*"case" + 0.004*"air" + 0.004*"sata" + 0.004*"one" + 0.004*"product"
Topic: 4 Word: 0.020*"keyboard" + 0.013*"mouse" + 0.009*"key" + 0.006*"tablet" + 0.006*"button" + 0.004*"use" + 0.004*"screen" + 0.004*"typing" + 0.004*"bluetooth" + 0.004*"touch"
Topic: 5 Word: 0.009*"charge" + 0.007*"phone" + 0.006*"battery" + 0.006*"charger" + 0.006*"iphone" + 0.006*"radio" + 0.005*"char

In [None]:
doc_topics = pd.DataFrame(LDA.transform(td_idf))
topic_column_names = ["topic_{}".format(c) for c in doc_topics.columns]
doc_topics.columns = topic_column_names
sample_with_topics = pd.concat([df, doc_topics], axis=1)