# Capstone: Topic Modelling on AMD vs Nvidia GPU

## Contents
- Data Extraction
- Data Cleaning
- [EDA](#EDA)
- [Prepare data for LDA Analysis](#Prepare-data-for-LDA-Analysis)
- [LDA Model Training](#LDA-Model-Training)
- Model creation
- Model Evaluation

In [1]:
# Common imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pprint import pprint
import os

import re
# NLTK Library
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Import PRAW package
import praw
from praw.models import MoreComments

# Gensim library
import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora

import pyLDAvis.gensim
import pickle 
import pyLDAvis

# Detect non-english words
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0

# Detect non-english words using spacy
import spacy
from spacy_langdetect import LanguageDetector
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)


# Import the wordcloud library
from wordcloud import WordCloud

%matplotlib inline

In [2]:
# Set the max rows and columns for Pandas
pd.options.display.max_columns = 100
pd.options.display.max_rows = 200

In [3]:
# Set the style use
plt.style.use('ggplot')

# Tutorial: Data Extraction from reddit using PRAW

In [None]:
 # Read-only Reddit instance.
 # Credentials come from the environment so that secrets never live in the notebook.
 # NOTE: the client id/secret previously committed here are exposed and should be rotated.
 reddit = praw.Reddit(
     client_id=os.environ["REDDIT_CLIENT_ID"],
     client_secret=os.environ["REDDIT_CLIENT_SECRET"],
     user_agent="gpu_scrapper"
 )

In [None]:
print(reddit.read_only)  # Output: True

## Obtain from learnpython

In [None]:
# Continues from the `reddit` instance created in the cell above.

# Print the title of every submission yielded by r/learnpython's "hot" listing.
for submission in reddit.subreddit("learnpython").hot(limit=30):
    print(submission.title)

# Output: one title per line, at most 30 (the listing is requested with limit=30)

## Authorized Reddit instances

In [None]:
 # Authorized (non read-only) Reddit instance.
 # Credentials come from the environment so that secrets never live in the notebook.
 # NOTE: the secret and password previously committed here are exposed and should be rotated.
 reddit = praw.Reddit(
     client_id=os.environ["REDDIT_CLIENT_ID"],
     client_secret=os.environ["REDDIT_CLIENT_SECRET"],
     user_agent="gpu_scrapper",
     username=os.environ["REDDIT_USERNAME"],
     password=os.environ["REDDIT_PASSWORD"]
 )

In [None]:
print(reddit.read_only)  # Output: False

## Obtain a subreddit

In [None]:
# Get a handle on the r/cryptocurrency subreddit through the `reddit` instance.
crypto_sub = reddit.subreddit("cryptocurrency")

print(crypto_sub.display_name)  # prints the subreddit's display name
print(crypto_sub.title)         # prints the subreddit's title
print(crypto_sub.description)   # prints the subreddit's description text

## Obtain `Submission` Instances from a subreddit

In [None]:
for submission in crypto_sub.hot(limit=10):
    print(submission.title)
    print(submission.score)
    print(submission.id)
    print(submission.url)

In [None]:
# assume you have a Reddit instance bound to variable `reddit`
submission = reddit.submission(id="ktzv3a")
print(submission.title)  # Output: reddit will soon only be available ...

# or
# submission = reddit.submission(url='https://www.reddit.com/...')

## Obtain `Comment` Instances

In [None]:
# assume you have a Reddit instance bound to variable `reddit`
top_level_comments = list(submission.comments)
all_comments = submission.comments.list()

In [None]:
all_comments

In [None]:
# assume you have a Reddit instance bound to variable `reddit`
submission = reddit.submission(id="ktzv3a")
submission.comment_sort = "new"
top_level_comments = list(submission.comments)

In [None]:
top_level_comments

In [None]:
# `pprint` (the function) is already imported at the top of the notebook with
# `from pprint import pprint`. Re-importing the module here (`import pprint`)
# would rebind the name to the module and break every later `pprint(...)` call.
# assume you have a Reddit instance bound to variable `reddit`
submission = reddit.submission(id="39zje0")
# print(submission.title) # to make it non-lazy
pprint(vars(submission))

## Extracting comments with PRAW

In [None]:
submission = reddit.submission(id="3g1jfi")

In [None]:
for top_level_comment in submission.comments:
    print(top_level_comment.body)

In [None]:
for top_level_comment in submission.comments:
    if isinstance(top_level_comment, MoreComments):
        continue
    print(top_level_comment.body)

In [None]:
submission.comments.replace_more(limit=0)
for top_level_comment in submission.comments:
    print(top_level_comment.body)

In [None]:
submission.comments.replace_more(limit=None)
lst = []
for top_level_comment in submission.comments:
    lst.append(top_level_comment.body)

In [None]:
len(lst)

## Obtaining the replies of the top comments

In [None]:
submission.comments.replace_more(limit=None)
for top_level_comment in submission.comments:
    for second_level_comment in top_level_comment.replies:
        print(second_level_comment.body)

#### Obtain the comments at every level (breadth-first traversal)

In [None]:
submission.comments.replace_more(limit=None)
comment_queue = submission.comments[:]  # Seed with top-level
while comment_queue:
    comment = comment_queue.pop(0)
    print(comment.body)
    comment_queue.extend(comment.replies)

In [None]:
submission.comments.replace_more(limit=None)
for comment in submission.comments.list():
    print(comment.body)

# Obtain the comments from RTX 3080 

## Setting up the reddit instance

In [7]:
 # Authorized Reddit instance used to scrape the RTX 3080 megathread.
 # Credentials come from the environment so that secrets never live in the notebook.
 # NOTE: the secret and password previously committed here are exposed and should be rotated.
 reddit = praw.Reddit(
     client_id=os.environ["REDDIT_CLIENT_ID"],
     client_secret=os.environ["REDDIT_CLIENT_SECRET"],
     user_agent="gpu_scrapper_32",
     username=os.environ["REDDIT_USERNAME"],
     password=os.environ["REDDIT_PASSWORD"]
 )

In [5]:
print(reddit.read_only)  # Output: False

False


In [6]:
# Creating the submission object for rtx 3080 megathread
submission = reddit.submission(id="itw87x")

### Extraction of all comments (every level, flattened)

In [None]:
rtx_3080_lst = []

In [None]:
submission.comments.replace_more(limit=None)
for comment in submission.comments.list():
    rtx_3080_lst.append(comment.body)

In [None]:
# Converting it to a Series
rtx_3080_df = pd.Series(rtx_3080_lst)
rtx_3080_df

In [None]:
rtx_3080_df.loc[1500]

In [None]:
# Save rtx 3080 comments to csv file
rtx_3080_df.to_csv('./reddit dataset/rtx_3080.csv', index=False)

### First and Second level comments

In [None]:
submission.comments.replace_more(limit=None)
for top_level_comment in submission.comments:
    for second_level_comment in top_level_comment.replies:
        print(second_level_comment.body)

## Create a helper function to scrape the dataset from Reddit

In [4]:
gpu_sub_dict = {'rtx_3060ti': 'k4mctp', 'rtx_3070':'jj8k0l', 'rtx_3080': 'itw87x', 'rtx_3090': 'iyy5sx', 'rtx_3000': 'iko4ir'}
gpu_sub_dict_test = {'rtx_3060ti': 'k4mctp'} 

In [8]:
 # Authorized Reddit instance used by the scraping helper below.
 # Credentials come from the environment so that secrets never live in the notebook.
 # NOTE: the secret and password previously committed here are exposed and should be rotated.
 reddit = praw.Reddit(
     client_id=os.environ["REDDIT_CLIENT_ID"],
     client_secret=os.environ["REDDIT_CLIENT_SECRET"],
     user_agent="gpu_scrapper_32",
     username=os.environ["REDDIT_USERNAME"],
     password=os.environ["REDDIT_PASSWORD"]
 )

In [20]:
def scrapeGPUComment(gpu_dict, output_dir='./reddit dataset/', client=None):
    """
    Scrape every comment of each Reddit submission in `gpu_dict` and save one CSV per entry.

    Parameters
    ----------
    gpu_dict : dict
        Maps a tag (used as the CSV file name and as the value of the `tag`
        column) to a Reddit submission id.
    output_dir : str, optional
        Directory the CSV files are written to. It is created when missing.
    client : optional
        Object exposing `submission(id=...)`. Defaults to the notebook-level
        `reddit` instance.

    Returns
    -------
    None
        The result is the set of `<output_dir>/<tag>.csv` files, each with the
        columns `Reddit comments` and `tag`.
    """
    # Fall back to the notebook-level PRAW instance when no client is passed in.
    if client is None:
        client = reddit

    os.makedirs(output_dir, exist_ok=True)

    # Iterate over the dictionary that was passed in (the previous version ignored
    # the argument and always read the global `gpu_sub_dict`).
    for key, value in gpu_dict.items():
        # Creating the submission object for the megathread
        submission = client.submission(id=value)

        # Resolve every "load more comments" placeholder before flattening the tree
        submission.comments.replace_more(limit=None)
        gpu_lst = [comment.body for comment in submission.comments.list()]

        # Stored as a DataFrame: one comment per row, tagged with its key
        rtx_df = pd.DataFrame({'Reddit comments': gpu_lst})
        rtx_df['tag'] = key
        rtx_df.to_csv(os.path.join(output_dir, key + '.csv'), index=False)

In [21]:
scrapeGPUComment(gpu_sub_dict)

In [22]:
gpu_df = pd.read_csv('./reddit dataset/rtx_3060ti.csv')
gpu_df.head()

Unnamed: 0,Reddit comments,tag
0,"Having just weathered the 6800/XT launch, it f...",rtx_3060ti
1,EVGA queue\n\nhttps://www.evga.com/products/pr...,rtx_3060ti
2,Wow guru3d actually commented on the pricing s...,rtx_3060ti
3,"Like a lot of people here, I have a 500W PSU a...",rtx_3060ti
4,Seems like a great card and it seems like the ...,rtx_3060ti


In [15]:
gpu_df.

SyntaxError: invalid syntax (<ipython-input-15-ab01192883ad>, line 1)

# Data cleaning

## Removing the null values

In [None]:
# Read the existing csv file
GPU_df = pd.read_csv('./amazon dataset/gpu_df_1.csv')

In [None]:
# Check the dimensions of the data
GPU_df.shape

In [None]:
# Check for null values
GPU_df.isnull().sum()

## Cleaning the customer review title and reviews

In [None]:
# Full function to clean the title and the post
def clean_post(df):
    """
    Clean an iterable of raw posts so they can be used for topic modelling.

    Each post is lowercased, stripped of URLs, of the `&amp;#x200b;` entity,
    of punctuation and of underscores, then split on whitespace. English stop
    words are dropped and the remaining words are lemmatized. Lemmatization is
    used (rather than stemming) to preserve the meaning of the words, as every
    word is compared against a dictionary.

    Parameters
    ----------
    df : iterable
        Raw posts, e.g. a pandas Series of strings. Entries that are not
        strings (such as NaN for a missing post) are treated as empty posts.

    Returns
    -------
    list of str
        One cleaned, space-joined string per input entry, in input order.
    """
    # Stop words
    stops = set(stopwords.words('english'))

    # Lemmatizer
    lemmatizer = WordNetLemmatizer()

    # The patterns are compiled once, outside the per-post loop.
    # Only the URL itself is removed (up to the next whitespace, quote or closing
    # bracket), so text that follows a link on the same line is kept.
    url_pattern = re.compile(r'https?://[^\s")\]]+')
    # The entity is matched after lowercasing, wherever it appears.
    entity_pattern = re.compile(r'&amp;#x200b;')
    # `\w` also matches underscores, so they are listed explicitly here.
    non_word_pattern = re.compile(r'[\W_]+')

    new_lst = []

    for post in df:
        # Keep the output aligned with the input even for missing values
        if not isinstance(post, str):
            new_lst.append('')
            continue

        # Lowercase the text
        post = post.lower()

        # Every removal substitutes a space so neighbouring words are never glued
        # together; split() below collapses the extra whitespace.
        post = url_pattern.sub(' ', post)
        post = entity_pattern.sub(' ', post)
        post = non_word_pattern.sub(' ', post)

        # Drop the stop words and lemmatize what is left
        words = [lemmatizer.lemmatize(word) for word in post.split() if word not in stops]

        new_lst.append(" ".join(words))

    return new_lst

In [None]:
GPU_df = pd.read_csv('./reddit dataset/rtx_3080.csv')
GPU_df

In [None]:
# Cleans the Customer Review column
GPU_df['Customer Review'] = clean_post(GPU_df['0'])
GPU_df['Customer Review']

In [None]:
GPU_df.drop('0', axis=1, inplace=True)

In [None]:
GPU_df

In [None]:
# Randomly going through the rows to check if it's cleaned properly.
# sample() draws from the rows that actually exist, so this keeps working when the
# number of rows changes (the previous version used a hard-coded row count as label).
GPU_df['Customer Review'].sample(n=1).iloc[0]

In [None]:
# Cleans the Customer Review Title column
GPU_df['Customer Review Title'] = clean_post(GPU_df['Customer Review Title'])
GPU_df['Customer Review Title']

In [None]:
# Randomly going through the rows to check if it's cleaned properly.
# sample() draws from the rows that actually exist, so this keeps working when the
# number of rows changes (the previous version used a hard-coded row count as label).
GPU_df['Customer Review Title'].sample(n=1).iloc[0]

### Removing non-english words in the reviews

In [None]:
# Removing non english by creating a helper function
from langdetect import detect
def isenglish(text):
    """
    Flag whether `text` is detected as English by the spaCy pipeline `nlp`.

    Parameters
    ----------
    text : str
        The text to classify.

    Returns
    -------
    int
        1 when the language reported for the document is 'en', otherwise 0.
        Non-string or blank input (e.g. NaN for a missing review) returns 0
        without running the pipeline, and a detection failure also returns 0.
    """
    # Nothing to detect on missing or empty values
    if not isinstance(text, str) or not text.strip():
        return 0

    try:
        language = nlp(text)._.language.get('language')
    except Exception:
        # Best effort: a failed detection counts as "not English". Catching
        # Exception (not a bare except) keeps Ctrl-C working during long applies.
        return 0

    return 1 if language == 'en' else 0

In [None]:
GPU_df['isenglish'] = GPU_df['Customer Review'].apply(isenglish)

In [None]:
GPU_df[GPU_df.loc[:,'isenglish'] == 0][['Customer Review']].head(10)

In [None]:
# Count number of rows with the string deleted in them
GPU_df['Customer Review'].map(lambda x: x.count('deleted')).sum()

In [None]:
GPU_df[GPU_df.loc[:,'isenglish'] == 0][['Customer Review']].count()

A total of 130 rows were non-English reviews. These have to be removed.

In [None]:
GPU_df.shape

In [None]:
GPU_df.drop(GPU_df[GPU_df['isenglish'] == 0].index, inplace=True)

## Checking for duplicates

In [None]:
GPU_df[['Customer Review Title', 'Customer Review']].loc[GPU_df['Customer Review'].duplicated()]

In [None]:
GPU_df[['Customer Review Title', 'Customer Review']].loc[GPU_df[['Customer Review Title']].duplicated()]

In [None]:
GPU_df[GPU_df['Customer Review Title'] == 'far good']

The duplicate values don't seem to be actual duplicates, just reviews consisting of a few words that were written by the customers.

In [None]:
# Save to csv file
GPU_df.to_csv('./amazon dataset/cleaned_gpu_df_1.csv',index=False)

# EDA

In [None]:
# Read the existing csv file
GPU_df = pd.read_csv('./amazon dataset/cleaned_gpu_df_1.csv')

In [None]:
GPU_df.shape

In [None]:
# Check for null values
GPU_df.isnull().sum()

In [None]:
# Drop the rows with null values
GPU_df.dropna(inplace=True)

In [None]:
# Remove the Review title and reviews
GPU_df_no_reviews = GPU_df.drop(['Customer Review Title', 'Customer Review'], axis=1)
GPU_df_no_reviews.head()

In [None]:
# Check if the ids match
list(GPU_df_no_reviews.drop_duplicates(['id'])['id'].unique()) == list(GPU_df['id'].unique())

In [None]:
# Remove the duplicate values in the GPU_df_no_reviews
GPU_df_no_reviews = GPU_df_no_reviews.drop_duplicates(['id'])
GPU_df_no_reviews.head()

In [None]:
# Set the id as the index and reset the index
GPU_df_no_reviews = GPU_df_no_reviews.set_index('id').reset_index(drop=True)
GPU_df_no_reviews

In [None]:
GPU_df_no_reviews.shape

### Price distribution

In [None]:
# plt.figure(figsize=(99,99))
sns.displot(GPU_df_no_reviews['Price'], bins=12, aspect=1.5, height=6, color='green')
plt.axvline(GPU_df_no_reviews['Price'].mean(),color='red')
plt.axvline(GPU_df_no_reviews['Price'].median(),color='yellow')

plt.title('Distribution of sale price of GPUs', size=13)
plt.legend(['Mean sale price','Median sale price']);

The distribution is right-skewed, with most of the GPUs falling below the 100-dollar range. The mean and the median prices are far apart, showing that there are some outliers in the price distribution, as seen in the 800 to 1000 dollar price range.

### Distribution of AMD and Nvidia Chipsets

In [None]:
GPU_df_no_reviews['Chipset Brand'].value_counts(normalize=True)

It seems that most of the GPUs use Nvidia chipsets, with a proportion of 70%, while AMD has a proportion of 30%.

### Most popular brands by their rating

In [None]:
GPU_df_no_reviews['Manufacturer'].value_counts()

As NVIDIA, NVIDIA Corporation and Althon Micro Inc. each have only 1 GPU, I'll not include them in the popular brand investigation.

In [None]:
manufacturer_list = ['AMD','ASRock','Aiposen','SAPPHIRE', 'Althon Micro Inc.', 'NVIDIA']

# Select the rating column before aggregating: averaging the whole frame fails on
# the non-numeric columns in recent pandas versions and is wasted work anyway.
rating_by_brand = (
    GPU_df_no_reviews.groupby('Manufacturer')['Overall Customer Rating']
    .mean()
    .drop(manufacturer_list)
    .sort_index(ascending=False)
)
rating_by_brand.plot(kind='barh',
                     title='Most popular brand by rating',
                     figsize=(11,7),
                     color='green')
plt.xlabel('Rating')
plt.ylabel('Brand', rotation=360);

In [None]:
# Mean rating per manufacturer (excluding `manufacturer_list`), highest first.
# The column is selected before mean() so non-numeric columns are never averaged.
GPU_df_no_reviews.groupby('Manufacturer')['Overall Customer Rating'].mean().drop(manufacturer_list).sort_values(ascending=False)

Without including Nvidia and Althon Micro Inc as they have only 1 type of GPU, Asus, EVGA and SAPPHIRE are the most popular brands given their high ratings.

The reason behind this is that consumers usually prefer GPUs fitted with 3rd-party coolers over those with Nvidia's own coolers, as they're much more effective in controlling the airflow and decreasing the GPU temperature.

### Which Chipset Brand has a higher customer rating?

In [None]:
# Mean overall customer rating per chipset brand.
# The column is selected before mean() so non-numeric columns are never averaged.
GPU_df_no_reviews.groupby('Chipset Brand')['Overall Customer Rating'].mean()

Nvidia is slightly ahead of AMD in terms of the Overall Customer rating.

### Which Manufacturer produces GPUs with higher Memory Speed and Size?

In [None]:
GPU_df_no_reviews.info()

In [None]:
# Mean memory speed per manufacturer, plotted in ascending order.
# The column is selected before mean() so non-numeric columns are never averaged.
GPU_df_no_reviews.groupby('Manufacturer')['Memory Speed(MHz)'].mean().sort_values().plot(kind='barh', figsize=(11,7))

plt.title('Memory speed of the GPUs produced by individual manufacturers')
plt.xlabel('Memory speed (MHz)')
plt.ylabel('Manufacturer',rotation=360);

In [None]:
# Mean memory speed per manufacturer, highest first.
# The column is selected before mean() so non-numeric columns are never averaged.
GPU_df_no_reviews.groupby('Manufacturer')['Memory Speed(MHz)'].mean().sort_values(ascending=False)

ASRock, Gigabyte, XFX and EVGA produce GPUs with high memory speeds, which suggests that they're premium brands that produce 'Enthusiast Grade' GPUs.

### EDA on Customer Review Title

In [None]:
customer_review_title = " ".join(GPU_df['Customer Review Title'])

In [None]:
# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=5000, 
                      contour_width=5, contour_color='steelblue', width=700, height=500)
wordcloud.generate(customer_review_title)
# Visualize the word cloud
wordcloud.to_image()

Based on the word cloud, it seems that consumers are mostly satisfied with their GPU purchase with 'good', 'great' and 'best' words coming out at the top. The consumers are mostly gamers and most of them play in 1080p resolution and they seem to be price sensitive with the words such as 'bang buck' and 'great value' having a bigger size. 

In [None]:
from collections import Counter

# Count every word in a single pass. Counter keeps the words in first-seen order,
# like the previous dict, without re-scanning the whole list for each new word.
customer_review_title_list = customer_review_title.split()
customer_review_title_dict = dict(Counter(customer_review_title_list))

customer_review_title_dict

In [None]:
df = {'words': customer_review_title_dict.keys(), 'freq': customer_review_title_dict.values()}
customer_review_title_df = pd.DataFrame(df)
customer_review_title_df.sort_values('freq', ascending=False).set_index('words').head(10).plot(kind='barh', figsize=(11,7),
                                                                                              title='Frequency of words in customer review title')
plt.xticks(fontsize=12)
plt.legend([]);

The graph shows consistency with the word cloud on the frequency of the words appearing in the customer review title.

In [None]:
# customer_review_title_df['freq'].hist(bins=150)
# plt.xlim(0,50)

### EDA on Customer Review

In [None]:
customer_review = " ".join(GPU_df['Customer Review'])

In [None]:
# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=5000, 
                      contour_width=5, contour_color='steelblue', width=700, height=500)
wordcloud.generate(customer_review)
# Visualize the word cloud
wordcloud.to_image()

Similar to the customer review title word cloud, consumers who purchase GPUs tend to be gamers who play at 1080p resolution. GPU fans are an important factor when making a GPU purchase, as the word 'fan' is rather large. The words 'issue' and 'problem' also show up prominently, which suggests that consumers may have encountered issues with the GPUs they purchased. The two brands 'amd' and 'nvidia' show that these are the 2 major players in the GPU market. GPU drivers seem to play an important role in making sure that the GPU is functioning.

In [None]:
from collections import Counter

# Count every word in a single pass. Counter keeps the words in first-seen order,
# like the previous dict, without re-scanning the whole list for each new word.
customer_review_list = customer_review.split()
customer_review_dict = dict(Counter(customer_review_list))

customer_review_dict

In [None]:
# Ten most frequent words in the customer reviews, as a horizontal bar chart.
review_df = {'words': customer_review_dict.keys(), 'freq': customer_review_dict.values()}
customer_review_df = pd.DataFrame(review_df)
# The title now names the reviews (it was copy-pasted from the review-title plot).
customer_review_df.sort_values('freq', ascending=False).set_index('words').head(10).plot(kind='barh', figsize=(11,7),
                                                                                              title='Frequency of words in customer review')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.legend([]);

The graph shows consistency with the word cloud on the frequency of the words appearing in the customer reviews.

## Prepare data for LDA Analysis

I'll be using only Customer Review to conduct the LDA Analysis as it makes up the bulk of the words.

In [None]:
# Converting to the customer reviews from series to a list.
data = GPU_df['Customer Review'].values.tolist()
data[600]

In [None]:
def sent_to_words(sentences):
    """Lazily yield one token list per sentence, produced by gensim's `simple_preprocess`."""
    # simple_preprocess is called with its default settings (no `deacc` argument is passed).
    yield from (gensim.utils.simple_preprocess(text) for text in sentences)

In [None]:
texts = list(sent_to_words(data))

In [None]:
# Prints the first document with up to 30 words in them
print(texts[:1][0][:30])

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(texts)

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View
print(corpus[:1][0][:30])

## LDA Model Training

In [None]:
# number of topics
num_topics = 10

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                      passes=20, random_state=42)

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()

# Build the paths with os.path.join so they work on every OS, not only on Windows.
visualization_dir = os.path.join(os.getcwd(), 'visualization')
LDAvis_data_filepath = os.path.join(visualization_dir, 'ldavis_prepared_' + str(num_topics))

# Preparing the visualization is a bit time consuming, so the result is cached on
# disk and only rebuilt when no cached copy exists for this number of topics.
# Delete the cached file to force a rebuild after retraining the model.
if not os.path.exists(LDAvis_data_filepath):
    os.makedirs(visualization_dir, exist_ok=True)
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')
LDAvis_prepared

# Others