# NLP Project - Web Scraping and Text Analysis of Game Reviews on Rock Paper Shotgun
## Part II. Text Analysis & Topic Modelling
### Step 1 - Analyse and Interpret Dataset

In [52]:
import json
import numpy as np
from collections import Counter

In [53]:
# Load the raw JSON file into `data`.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform's locale encoding, which can mis-decode non-ASCII characters.
# NOTE(review): assumes the file was saved as UTF-8 (or plain ASCII) - confirm
# against the notebook that wrote it.
with open('game_reviews_rps_in_recent_years.json', 'r', encoding='utf-8') as file:
    # Parse the JSON content into Python objects
    data = json.load(file)

In [54]:
# Drop the 'Brief' and 'Review' entries from every record so that only the
# remaining fields are carried forward; this is the first step of turning the
# list of dicts into an array.
# Note: this cell rebinds `data`, so the cells must be run in order.

# items() reference code found in: https://www.programiz.com/python-programming/methods/dictionary/items
# ChatGPT used here for debugging
data = [
    {field: value for field, value in record.items() if field not in ('Brief', 'Review')}
    for record in data
]


In [55]:
# Discard the dictionary keys and keep each record's values as a plain list.
# NOTE(review): the resulting column order follows each dict's key order, so
# this assumes every record has the same keys in the same order - confirm.
# values() reference code found in: https://www.programiz.com/python-programming/methods/dictionary/values
data = [[*record.values()] for record in data]


In [57]:
# Convert the list of per-record value lists into a NumPy array so that
# whole columns can be selected by index (data[:, i]) in the cells below.
data = np.array(data)

In [58]:
# Display the array dimensions as (rows, columns).
data.shape

(540, 6)

In [66]:
# Separate the array into one 1-D array per column (columns 0 to 5).
# Slicing code learnt from STEM class.
# NOTE(review): which field sits in which column depends on the key order of
# the JSON records - confirm that it matches the names used here.
titles, urls, game_titles, developers, labels, dates = (
    data[:, column] for column in range(6)
)

In [68]:
# Split the ', '-separated label and developer fields of every review into
# individual entries, then flatten the per-review lists into single lists.
#
# Entries are stripped of surrounding whitespace and empty entries are dropped:
# ''.split(', ') returns [''], so a review with an empty field would otherwise
# add a spurious '' label/developer to the counts computed below.
list_of_labels = [
    [entry.strip() for entry in label.split(', ') if entry.strip()]
    for label in labels
]
list_of_developers = [
    [entry.strip() for entry in developer.split(', ') if entry.strip()]
    for developer in developers
]

# ChatGPT used here for search code - 'how to transfer list of lists into a single list using list comprehension'
# Flatten with nested list comprehensions: the outer loop walks the per-review
# lists, the inner loop walks the entries of each list.
flat_list_label = [entry for entries in list_of_labels for entry in entries]
flat_list_developer = [entry for entries in list_of_developers for entry in entries]

In [69]:
# Count how many articles were published in each year.
# The year is taken as the first four characters of each date string.
dates = data[:, 5]
years = [date_text[:4] for date_text in dates]
Counter(years)

Counter({'2022': 168, '2023': 153, '2021': 148, '2020': 71})

In [70]:
# Total number of label entries across all reviews (duplicates included).
len(flat_list_label)

3269

In [72]:
# Count the distinct labels: np.unique returns the sorted unique entries,
# and the size of that 1-D array is the number of different labels.
unique_label = np.unique(flat_list_label)
unique_label.size

703

In [73]:
# Five most frequent values in `game_titles`, as (title, count) pairs
# ordered from most to least common.
Counter(game_titles).most_common(5)

[('Baldurs Gate 3', 2),
 ('Exoprimal', 2),
 ('Wartales', 2),
 ('Hardspace Shipbreaker', 2),
 ('Dorfromantik', 2)]

In [74]:
# Five most frequent labels across all reviews, as (label, count) pairs
# ordered from most to least common.
Counter(flat_list_label).most_common(5)

[('Wot I Think', 512),
 ('Indie', 228),
 ('Action Adventure', 170),
 ('Strategy', 99),
 ('Reviews', 97)]

In [75]:
# Five most frequent developers across all reviews, as (developer, count)
# pairs ordered from most to least common.
Counter(flat_list_developer).most_common(5)

[('Square Enix', 12),
 ('Capcom', 10),
 ('Sledgehammer Games', 7),
 ('Raven Software', 5),
 ('Treyarch', 5)]

### References

- Python Dictionary items() [online] Programiz. Available at: https://www.programiz.com/python-programming/methods/dictionary/items

- Python Dictionary values() [online] Programiz. Available at: https://www.programiz.com/python-programming/methods/dictionary/values