In [135]:
import json
import nltk
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from matplotlib import pyplot as plt

# Fetch the NLTK resources the cells below rely on: the Punkt sentence
# tokenizer data (used by nltk.sent_tokenize / nltk.word_tokenize) and the
# English stopword list. Newer NLTK releases look the tokenizer up under the
# name 'punkt_tab'; 'punkt' is kept so older releases keep working.
# The trailing semicolon suppresses the cell's return-value output.
nltk.download(['punkt', 'punkt_tab', 'stopwords'], quiet=True);

In [89]:
def tokenize(doc):
    """Split ``doc`` into sentences and each sentence into word tokens.

    Args:
        doc: The text to tokenize.

    Returns:
        A list with one entry per sentence reported by ``nltk.sent_tokenize``;
        each entry is the token list ``nltk.word_tokenize`` yields for it.
    """
    return [nltk.word_tokenize(sentence) for sentence in nltk.sent_tokenize(doc)]

In [2]:
# Read the review records from data/processed_reviews.json.
filename = 'data/processed_reviews.json'
with open(filename, 'rb') as file:
    reviews = json.loads(file.read())

# Read the game records from data/processed_games.json; `filename` and `file`
# are deliberately rebound, exactly as before.
filename = 'data/processed_games.json'
with open(filename, 'rb') as file:
    games = json.loads(file.read())

In [129]:
# Build the vocabulary (`corpus`): the set of distinct stemmed tokens found in
# the reviews of the selected products. A token is kept when, after
# lower-casing, it is alphanumeric and not an English stopword.
corpus = set()
stemmer = nltk.stem.snowball.EnglishStemmer()
stopwords = nltk.corpus.stopwords.words('english')
# Set copy for O(1) membership tests; `stopwords` itself stays a list.
stopword_set = set(stopwords)
# NOTE(review): the ids pass through a set before slicing, and the iteration
# order of a set of strings can differ between kernel sessions (string hash
# randomization), so *which* ten products are picked is not stable across
# restarts -- confirm whether a fixed ordering is wanted.
products = list({g['id'] for g in games})[:10]
selected_ids = set(products)
# Single pass over the reviews instead of rescanning the whole list once per
# product; the result is a set, so visiting order does not matter.
for review in reviews:
    if review['product_id'] not in selected_ids:
        continue
    for sentence in tokenize(review['text']):
        for word in sentence:
            word = word.lower()
            if word not in stopword_set and word.isalnum():
                corpus.add(stemmer.stem(word))

In [131]:
# Build one binary word-presence vector per selected product. Each vector has
# one entry per word in `corpus` (1 if the word occurs in any of the product's
# reviews after the same lower-case / stopword / alphanumeric / stemming
# pipeline, else 0).
product_vectors = []
stemmer = nltk.stem.snowball.EnglishStemmer()
stopwords = nltk.corpus.stopwords.words('english')
# Set copy for O(1) membership tests; `stopwords` itself stays a list.
stopword_set = set(stopwords)
# Materialize the unique ids once instead of rebuilding the set per index.
# NOTE(review): positions 5 and 7 refer to set iteration order, which can
# differ between kernel sessions (string hash randomization) -- confirm
# whether specific games were intended here.
unique_ids = list({g['id'] for g in games})
products = [unique_ids[i] for i in [5, 7]]
for product in products:
    bag_of_words = []
    for review in reviews:
        if review['product_id'] == product:
            for sentence in tokenize(review['text']):
                for word in sentence:
                    word = word.lower()
                    if word not in stopword_set and word.isalnum():
                        bag_of_words.append(stemmer.stem(word))

    # Test membership against a set: checking every vocabulary word against
    # the raw list would be quadratic in the amount of review text.
    seen_words = set(bag_of_words)
    product_vectors.append([int(word in seen_words) for word in corpus])

In [132]:
# Sanity check: number of vectors built, one per entry in `products`.
len(product_vectors)

2

In [133]:
# Pairwise cosine similarity between the binary word-presence vectors.
cosine_similarity(product_vectors)

array([[1.        , 0.26461277],
       [0.26461277, 1.        ]])

In [134]:
# Show the full metadata records of the games whose ids are in `products`.
[g for g in games if g['id'] in products]

[{'publisher': 'Big Blue Bubble',
  'genres': ['Action', 'Indie'],
  'app_name': 'Zombie Bloxx',
  'sentiment': 'Positive',
  'title': 'Zombie Bloxx',
  'url': 'http://store.steampowered.com/app/713090/Zombie_Bloxx/',
  'release_date': '2017-11-22',
  'tags': ['Action', 'Indie', 'Touch-Friendly', 'Zombies', 'Voxel'],
  'reviews_url': 'http://steamcommunity.com/app/713090/reviews/?browsefilter=mostrecent&p=1',
  'specs': ['Single-player',
   'Steam Achievements',
   'Partial Controller Support'],
  'price': 4.99,
  'early_access': False,
  'id': '713090',
  'developer': 'Roosh Interactive'},
 {'publisher': 'Subatomic Studios LLC',
  'genres': ['Strategy', 'Indie', 'Casual'],
  'app_name': 'Fieldrunners',
  'sentiment': 'Mostly Positive',
  'title': 'Fieldrunners',
  'url': 'http://store.steampowered.com/app/209690/Fieldrunners/',
  'release_date': '2012-05-24',
  'tags': ['Tower Defense', 'Strategy', 'Indie', 'Casual', 'Singleplayer'],
  'reviews_url': 'http://steamcommunity.com/app/209