# Capstone: Topic Modelling on AMD vs Nvidia GPU

## Contents
- Data Extraction
- Data Cleaning
- [EDA](#EDA)
- [Prepare data for LDA Analysis](#Prepare-data-for-LDA-Analysis)
- [LDA Model Training](#LDA-Model-Training)
- Model creation
- Model Evaluation

In [1]:
# Common imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pprint import pprint
import os

import re
# NLTK Library
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Import PRAW package
import praw
from praw.models import MoreComments

# Gensim library
import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora

import pyLDAvis.gensim
import pickle 
import pyLDAvis

# Detect non-english words
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0

# Detect non-english words using spacy
import spacy
from spacy_langdetect import LanguageDetector
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)


# Import the wordcloud library
from wordcloud import WordCloud

%matplotlib inline

In [2]:
# Set the max rows and columns for Pandas
pd.options.display.max_columns = 100
pd.options.display.max_rows = 200

In [3]:
# Set the style use
plt.style.use('ggplot')

## Create a helper function to scrap the dataset in reddit

In [4]:
nvidia_gpu_sub_dict = {'rtx_3060ti': 'k4mctp', 'rtx_3070':'jj8k0l', 'rtx_3080': 'itw87x', 'rtx_3090': 'iyy5sx', 'rtx_3000': 'iko4ir'}
amd_gpu_sub_dict = {'amd_gpu': 'iknr7g', 'rx_6000_rdna2':'jjq6v1', 'rx_6000_nov_18':'jvxm8z', 'radeon_rx_6000':'jwesyt'} 

In [5]:
 reddit = praw.Reddit(
     client_id="IR7Y4cUBrVAbGg",
     client_secret="podr43kzztn_CoVgtNQiNpDfjI5mjg",
     user_agent="gpu_scrapper_32",
     username="leader2345",
     password="rPLHgrS8"
 )

In [6]:
def scrapeGPUComment(gpu_dict):
    for key, value in gpu_dict.items():
        # Creates the GPU list to hold the reddit comments
        gpu_lst = []
        # Creating the submission object for rtx megathreads
        submission = reddit.submission(id=value)
        
        # Extract all the commments
        submission.comments.replace_more(limit=None)
        for comment in submission.comments.list():
            gpu_lst.append(comment.body)
        # Converted to Dataframe format
        rtx_df = pd.DataFrame({'Reddit comments':gpu_lst})
        rtx_df['tag'] = key
        rtx_df.to_csv('./reddit dataset/' + key + '.csv', index=False)

In [None]:
# Scrapping of nvidia commments
# scrapeGPUComment(nvidia_gpu_sub_dict)

In [None]:
# Scrapping of amd commments
# scrapeGPUComment(amd_gpu_sub_dict)

In [7]:
# Read in the dataframes

# Nvidia's comments
rtx_3000 = pd.read_csv('./reddit dataset/rtx_3000.csv')
rtx_3060ti = pd.read_csv('./reddit dataset/rtx_3060ti.csv')
rtx_3070 = pd.read_csv('./reddit dataset/rtx_3070.csv')
rtx_3080 = pd.read_csv('./reddit dataset/rtx_3080.csv')
rtx_3090 = pd.read_csv('./reddit dataset/rtx_3090.csv')

# Amd's comments
amd_gpu = pd.read_csv('./reddit dataset/amd_gpu.csv')
rx_6000 = pd.read_csv('./reddit dataset/radeon_rx_6000.csv')
rx_6000_nov_18 = pd.read_csv('./reddit dataset/rx_6000_nov_18.csv')
rx_6000_rdna2 = pd.read_csv('./reddit dataset/rx_6000_rdna2.csv')

In [8]:
# Concat the dataframes by their rows
combined_df = pd.concat([rtx_3000, rtx_3060ti, rtx_3070, rtx_3080,
                        rtx_3090, amd_gpu, rx_6000, rx_6000_nov_18, rx_6000_rdna2],
                       axis=0, ignore_index = False)

In [9]:
combined_df.shape

(25093, 2)

# Data cleaning

## Removing the null values

In [10]:
# Check the dimensions of the data
combined_df.shape

(25093, 2)

In [11]:
# Checking for null values
combined_df.isnull().sum()

Reddit comments    3
tag                0
dtype: int64

In [12]:
# Check for null values
combined_df.dropna(inplace = True)

In [13]:
# Checking for null values
combined_df.isnull().sum()

Reddit comments    0
tag                0
dtype: int64

## Dropping comments with `[deleted]` and `[removed]` in them

In [14]:
combined_df.shape

(25090, 2)

In [15]:
removed_deleted_comments_idx = combined_df[(combined_df['Reddit comments'] == '[removed]') | (combined_df['Reddit comments'] == '[deleted]')].index
removed_deleted_comments_idx

Int64Index([   46,    56,    57,    82,    83,   118,   150,   157,   160,
              244,
            ...
            10580, 10582, 10584, 10586, 10587, 10588, 10590, 10591, 10592,
            10594],
           dtype='int64', length=530)

A total of 530 comments will be dropped from the dataframe.

In [16]:
# Drop the deleted and removed comments
combined_df.drop(removed_deleted_comments_idx, axis=0, inplace = True)

In [17]:
combined_df.shape

(22770, 2)

## Cleaning the Reddit comments

In [18]:
# Full function to clean the title and the post
def clean_post(df):
    """
    This function removes the unnecessary characters, punctuations, removes stop words and lemmantizes the words
    from the posts and titles. Lemmantization is used as I want to preserve the meaning of the words in which it'll compare the words against a dictionary.
    """
    new_lst = []
    
    # Stop words
    stops = stopwords.words('english')
    # Addin additional stop words
    stops.extend(['nvidia', 'amd', 'card', 'gpu', 'http', 'www'])
    stops = set(stops)
    
    # Lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    for post in df:
        # Lowercase the text
        post = post.lower()

        # Find the https websites and removes them
        post = re.sub(r'\(https:.*?\)','',post)

        # Removes youtube links
        post = re.sub('https:.*?\\n','',post)

        # Removes uncaptured url links at the bottom of the text
        post = re.sub('https.*?[\\n|"]','',post)

        # Removes characters: \n\n&amp;#x200B;
        post = re.sub('\\n\\n&amp;#x200b;\\n\\n','',post)

        # Removing the special characters, like punctuation marks, periods
        post = re.sub(r'[^\w]',' ',post)
        
        # Removes digits and keeps the letters
        # post = re.sub(r'[^a-zA-Z]', ' ', post)

        # Removes underscores
        post = re.sub(' _', ' ',post)

        # Removes addtional white spaces
        post = re.sub(' +', ' ',post)
        
        # Stores the words in a list 
        lst = [] 
        
        # If the word is not in the stop words then, lemmantize the words
        #for word in post.split():
        #    if not word in stops:
        #        lst.append(lemmatizer.lemmatize(word))
        lst = [lemmatizer.lemmatize(word) for word in post.split() if word not in stops]
            
        new_lst.append(" ".join(lst))
        
    return new_lst

In [19]:
# Cleans the Reddit comments column
combined_df['Reddit comments'] = clean_post(combined_df['Reddit comments'])
combined_df['Reddit comments']

0                      pre order time releasing 17th seems
1        price revelation insane expect magic rdna 2 wa...
2                                going hard grab 3080 17th
3        3080 seems like best high end right 1440 144hz...
4        uk price 3090 1399 3080 649 3070 469 scan aib ...
                               ...                        
10583    probably score higher english proficiency test...
10585    prove know concept cost opportunity efficiency...
10589    cost opportunity perfectly ok way say many nam...
10593                                       take care babe
10595                                       take care babe
Name: Reddit comments, Length: 22770, dtype: object

## Dropping comments that are empty

In [20]:
combined_df.describe()

Unnamed: 0,Reddit comments,tag
count,22770.0,22770
unique,22057.0,9
top,,rx_6000_rdna2
freq,125.0,10081


In [21]:
(combined_df['Reddit comments'] == '').sum()

125

There are 125 empty comments that have to be dropped.

In [22]:
empty_comments_idx = combined_df[combined_df['Reddit comments'] == ''].index
combined_df.drop(empty_comments_idx, axis=0, inplace=True)

In [23]:
(combined_df['Reddit comments'] == '').sum()

0

The empty comments have been removed.

## Removing duplicates in the dataframe

In [24]:
combined_df.duplicated(subset='Reddit comments', keep = False).sum()

822

There are 822 comments that are duplicated and have to be dropped.

In [25]:
combined_df[combined_df.duplicated(subset='Reddit comments', keep = False)]

Unnamed: 0,Reddit comments,tag
26,preorder,rtx_3000
58,pre order,rtx_3000
130,pre order,rtx_3000
159,http evga com article 01434 evga geforce rtx 3...,rtx_3000
408,scared new mining performance card new memory ...,rtx_3000
...,...,...
10506,see,rx_6000_rdna2
10554,thanks,rx_6000_rdna2
10556,problem,rx_6000_rdna2
10593,take care babe,rx_6000_rdna2


In [26]:
combined_df.shape

(22353, 2)

In [27]:
combined_df.drop_duplicates(subset='Reddit comments', keep = False, ignore_index= True, inplace = True)

In [28]:
combined_df.shape

(21531, 2)

In [29]:
combined_df.duplicated(subset='Reddit comments', keep = False).sum()

0

The duplicated comments have been removed.

## Setting the tags with either `Nvidia` or `Amd`

In [30]:
combined_df['tag'].unique()

array(['rtx_3000', 'rtx_3060ti', 'rtx_3070', 'rtx_3080', 'rtx_3090',
       'amd_gpu', 'radeon_rx_6000', 'rx_6000_nov_18', 'rx_6000_rdna2'],
      dtype=object)

These tags need to be renamed to either `Nvidia` or `Amd`

In [31]:
nvidia_tag_lst = list(nvidia_gpu_sub_dict)
amd_tag_lst = list(amd_gpu_sub_dict)

In [32]:
combined_df['tag'].replace(nvidia_tag_lst, ['nvidia' for _ in range(len(nvidia_tag_lst))], inplace = True)
combined_df['tag'].replace(amd_tag_lst, ['amd' for _ in range(len(amd_tag_lst))], inplace = True)

In [33]:
combined_df['tag'].value_counts()

amd       16016
nvidia     5515
Name: tag, dtype: int64

In [43]:
# Randomly going through the rows to check if it's cleaned properly 
combined_df['Reddit comments'].loc[np.random.randint(1707)]

'confused successor 2080 ti 3090 insanely powerful seems like titan people equipment take advantage full capability also much expensive 2080 ti launch jensen called 3080 flagship typically best best flagship'

### Removing non-english words in the reviews (Possible to ignore)

In [None]:
# Removing non english by creating a helper function
from langdetect import detect
def isenglish(text):
    try:
        if nlp(text)._.language.get('language') == 'en':
            return 1
        else:
            return 0
    except:
        return 0

In [None]:
GPU_df['isenglish'] = GPU_df['Customer Review'].apply(isenglish)

In [None]:
GPU_df[GPU_df.loc[:,'isenglish'] == 0][['Customer Review']].head(10)

In [None]:
# Count number of rows with the string deleted in them
GPU_df['Customer Review'].map(lambda x: x.count('deleted')).sum()

In [None]:
GPU_df[GPU_df.loc[:,'isenglish'] == 0][['Customer Review']].count()

A total of 130 rows were non-english reviews. These have to be removed.

## Exporting the cleaned csv file 

In [44]:
# Save to csv file
combined_df.to_csv('./reddit dataset/cleaned_combined_df.csv', index=False)

# EDA

In [46]:
# Read the existing csv file
GPU_df = pd.read_csv('./reddit dataset/cleaned_combined_df.csv')
GPU_df

Unnamed: 0,Reddit comments,tag
0,pre order time releasing 17th seems,nvidia
1,price revelation insane expect magic rdna 2 wa...,nvidia
2,going hard grab 3080 17th,nvidia
3,3080 seems like best high end right 1440 144hz...,nvidia
4,uk price 3090 1399 3080 649 3070 469 scan aib ...,nvidia
...,...,...
21526,dude really understand example confused get sa...,amd
21527,still know example like still thought looked,amd
21528,probably score higher english proficiency test...,amd
21529,prove know concept cost opportunity efficiency...,amd


In [47]:
GPU_df.describe()

Unnamed: 0,Reddit comments,tag
count,21531,21531
unique,21531,2
top,okay finally able get store phone 0 fulfilled ...,amd
freq,1,16016


In [48]:
GPU_df.shape

(21531, 2)

In [49]:
# Check for null values
GPU_df.isnull().sum()

Reddit comments    0
tag                0
dtype: int64

In [55]:
# Checking for duplicated comments
GPU_df.duplicated(subset='Reddit comments', keep=False).sum()

0

### Price distribution

In [None]:
# plt.figure(figsize=(99,99))
sns.displot(GPU_df_no_reviews['Price'], bins=12, aspect=1.5, height=6, color='green')
plt.axvline(GPU_df_no_reviews['Price'].mean(),color='red')
plt.axvline(GPU_df_no_reviews['Price'].median(),color='yellow')

plt.title('Distribution of sale price of GPUs', size=13)
plt.legend(['Mean sale price','Median sale price']);

The distribution shows a right skewed graph with most of the GPUs falling below the 100 dollars range. The mean and the median prices are far part showing that they are some outliers in the price distribution as seen in the price range of 800 and 1000 dollars range. 

### Distribution of AMD and Nvidia Chipsets

In [None]:
GPU_df_no_reviews['Chipset Brand'].value_counts(normalize=True)

It seems that most of the GPUs are under Nvidia with a proportion of 70% while Amd has a proportion of 30%.

### Most popular brands by their rating

In [None]:
GPU_df_no_reviews['Manufacturer'].value_counts()

As NVIDIA, NVIDIA Corporation and Althon Micro Inc. have only 1 GPUs, I'll not include them in the popular brand investigation

In [None]:
manufacturer_list = ['AMD','ASRock','Aiposen','SAPPHIRE', 'Althon Micro Inc.', 'NVIDIA']
GPU_df_no_reviews.groupby('Manufacturer').mean().drop(manufacturer_list)['Overall Customer Rating'].sort_index(ascending=False).plot(kind='barh', 
                                                                                                                                            title='Most popular brand by rating', 
                                                                                                                                            figsize=(11,7), 
                                                                                                                                            color='green')
plt.xlabel('Rating')
plt.ylabel('Brand', rotation=360);

In [None]:
GPU_df_no_reviews.groupby('Manufacturer').mean().drop(manufacturer_list)['Overall Customer Rating'].sort_values(ascending=False)

Without including Nvidia and Althon Micro Inc as they have only 1 type of GPU, Asus, EVGA and SAPPHIRE are the most popular brands given their high ratings.

The reason behind this is that consumers usually prefer 3rd party coolers fitted into the GPUs compared to the Nvidia's coolers as they're much more effective in controlling the airflow and decreasing the GPU temperature. 

### Which Chipset Brand has a higher customer rating?

In [None]:
GPU_df_no_reviews.groupby('Chipset Brand').mean()['Overall Customer Rating']

Nvidia is slightly ahead of AMD in terms of the Overall Customer rating.

### Which Manufacturer produces GPUs with higher Memory Speed and Size?

In [None]:
GPU_df_no_reviews.info()

In [None]:
GPU_df_no_reviews.groupby('Manufacturer').mean()['Memory Speed(MHz)'].sort_values().plot(kind='barh', figsize=(11,7))

plt.title('Memory speed of the GPUs produced by individual manufacturers')
plt.xlabel('Memory speed (MHz)')
plt.ylabel('Manufacturer',rotation=360);

In [None]:
GPU_df_no_reviews.groupby('Manufacturer').mean()['Memory Speed(MHz)'].sort_values(ascending=False)

ASRock, Gigabyte, XFX and EVGA manufacturers produces GPUs with high amount of memory speed which shows that they're premium brands that produce 'Enthusiast Grade' types of GPUs.

### EDA on Customer Review Title

In [None]:
customer_review_title = " ".join(GPU_df['Customer Review Title'])

In [None]:
# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=5000, 
                      contour_width=5, contour_color='steelblue', width=700, height=500)
wordcloud.generate(customer_review_title)
# Visualize the word cloud
wordcloud.to_image()

Based on the word cloud, it seems that consumers are mostly satisfied with their GPU purchase with 'good', 'great' and 'best' words coming out at the top. The consumers are mostly gamers and most of them play in 1080p resolution and they seem to be price sensitive with the words such as 'bang buck' and 'great value' having a bigger size. 

In [None]:
customer_review_title_list = customer_review_title.split()
customer_review_title_dict = {}

for word in customer_review_title_list:
    if word not in customer_review_title_dict.keys():
        customer_review_title_dict[word] = customer_review_title_list.count(word)
    else:
        continue
        
customer_review_title_dict

In [None]:
df = {'words': customer_review_title_dict.keys(), 'freq': customer_review_title_dict.values()}
customer_review_title_df = pd.DataFrame(df)
customer_review_title_df.sort_values('freq', ascending=False).set_index('words').head(10).plot(kind='barh', figsize=(11,7),
                                                                                              title='Frequency of words in customer review title')
plt.xticks(fontsize=12)
plt.legend([]);

The graph shows consistency with the word cloud on the frequency of the words appearing in the customer review title.

In [None]:
# customer_review_title_df['freq'].hist(bins=150)
# plt.xlim(0,50)

### EDA on Customer Review

In [None]:
customer_review = " ".join(GPU_df['Customer Review'])

In [None]:
# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=5000, 
                      contour_width=5, contour_color='steelblue', width=700, height=500)
wordcloud.generate(customer_review)
# Visualize the word cloud
wordcloud.to_image()

Similar to the customer review title word cloud, consumers who purchase GPUs tend to be gamers and they play on 1080p resolution. GPU fans are an important factor when making a GPU purchase as the word 'fan' size is rather big. The word 'issue' and 'problem' shows up big which suggests that consumers may have encountered issues with the GPUs they have purchased. The two brands 'amd' and 'nvidia' shows that these 2 are the major players in the GPU market. GPU drivers seem to play an important role in making sure that the GPU is functioning.

In [None]:
customer_review_list = customer_review.split()
customer_review_dict = {}

for word in customer_review_list:
    if word not in customer_review_dict.keys():
        customer_review_dict[word] = customer_review_list.count(word)
    else:
        continue
        
customer_review_dict

In [None]:
review_df = {'words': customer_review_dict.keys(), 'freq': customer_review_dict.values()}
customer_review_df = pd.DataFrame(review_df)
customer_review_df.sort_values('freq', ascending=False).set_index('words').head(10).plot(kind='barh', figsize=(11,7),
                                                                                              title='Frequency of words in customer review title')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.legend([]);

The graph shows consistency with the word cloud on the frequency of the words appearing in the customer review title.

## Prepare data for LDA Analysis

I'll be using only Customer Review to conduct the LDA Analysis as it makes up the bulk of the words.

In [None]:
# Converting to the customer reviews from series to a list.
data = GPU_df['Customer Review'].values.tolist()
data[600]

In [None]:
def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(sentence))

In [None]:
texts = list(sent_to_words(data))

In [None]:
# Prints the first document with up to 30 words in them
print(texts[:1][0][:30])

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(texts)

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View
print(corpus[:1][0][:30])

## LDA Model Training

In [None]:
# number of topics
num_topics = 10

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                      passes=20, random_state=42)

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join(os.getcwd()+'\\visualization\\'+'ldavis_prepared_'+str(num_topics))
# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if False:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, os.getcwd()+ '\\visualization\\' + 'ldavis_prepared_'+ str(num_topics) +'.html')
LDAvis_prepared

# Others