In [1]:
import pandas as pd
import gzip
import json
import re
import os
import pickle
import nltk
import re

from nltk.corpus import stopwords

# fetch the NLTK resources this notebook asks for; the recorded output shows the
# call is a no-op when the packages are already up to date.
# NOTE(review): 'punkt' is presumably what backs the nltk tokenizers used later;
# no use of the 'words' corpus is visible in this part of the notebook - confirm it is needed.
nltk.download('words')
nltk.download('punkt')

[nltk_data] Downloading package words to
[nltk_data]     /Users/kyledecember1/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kyledecember1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
from tqdm import tqdm
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# assign paths for csv data
#
# Every path component is passed to os.path.join separately (instead of embedding
# a literal '/' inside the last component) so the platform's separator is used
# consistently. The data directory sits two levels above the notebook's working directory.
DATA_DIR = os.path.join(os.pardir, os.pardir, 'data')

reviews_path = os.path.join(DATA_DIR, 'reviews.csv')
games_path = os.path.join(DATA_DIR, 'games.csv')

In [3]:
# load the reviews table and the games table from their csv files (in that order)
df_reviews, df_games = (pd.read_csv(csv_path) for csv_path in (reviews_path, games_path))

In [4]:
# remove, in place, the review columns that are not needed for this analysis
unused_columns = [
    'found_funny', 'compensation', 'user_id', 'Unnamed: 0', 'products',
    'page_order', 'date', 'early_access', 'page',
]
df_reviews.drop(columns=unused_columns, inplace=True)

In [5]:
# attach to every row the number of reviews its product_id has ('freq'), then
# order the frame by most-reviewed product first, breaking ties by ascending product_id
reviews_per_product = df_reviews.groupby('product_id')['product_id'].transform('count')
df_reviews['freq'] = reviews_per_product
df_reviews.sort_values(
    by=['freq', 'product_id'],
    ascending=[False, True],
    inplace=True,
)

In [6]:
# discard, in place, every row that has at least one missing value
df_reviews.dropna(axis=0, how='any', inplace=True)

In [7]:
# keep only reviews whose author logged at least 1 hour in the game, and only
# rows whose 'freq' value (reviews for that game) is at least 500
played_enough = df_reviews['hours'] >= 1
reviewed_enough = df_reviews['freq'] >= 500
df_reviews = df_reviews[played_enough & reviewed_enough]

In [8]:
# take a subsample of the data for text manipulation/modeling purposes
#
# random_state pins the draw so that re-running the notebook selects the same
# reviews (42 matches the seed used for the train/test split). The sample size
# is capped at the number of available rows, because sampling without
# replacement raises when n exceeds the population.
sample_size = min(250000, len(df_reviews))
df_sample = (
    df_reviews
    .sample(n=sample_size, random_state=42)
    .sort_values(by=['freq', 'product_id'], ascending=[False, True])
)

In [9]:
# normalise the review text to lowercase
lowercase_text = df_sample['text'].str.lower()
df_sample['text'] = lowercase_text

In [10]:
# replace every newline character with a single space
#
# One pass is enough: once all '\n' characters are gone, a further replace of
# '.\n' has nothing left to match and would be a no-op. regex=False makes the
# literal match explicit, since the default for `regex` differs between pandas versions.
df_sample['text'] = df_sample['text'].str.replace('\n', ' ', regex=False)

In [11]:
# split each review into word-level tokens with NLTK and preview the result
text_tokens = df_sample['text'].map(nltk.word_tokenize)
df_sample['tokens'] = text_tokens
df_sample['tokens']

3965943    [i, am, stunned, by, what, valve, has, pulled,...
3831628    [people, can, be, rude, and, it, may, seem, a,...
3899176    [pros, :, -, class, variety, -, many, differen...
3915021                                 [its, a, good, game]
3600448    [i, liked, this, game, since, i, first, played...
                                 ...                        
6627576                                         [good, game]
6626028    [why, do, u, guys, have, to, hate, every, thin...
6630862                                               [good]
6628347    [the, game, does, have, some, bugs, inb, it, b...
6629891                                                [rip]
Name: tokens, Length: 250000, dtype: object

In [12]:
# collapse each token list back into one string, separating the tokens with ', '
df_sample['clean_text'] = df_sample['tokens'].map(lambda tokens: ', '.join(tokens))

In [13]:
# preview the joined text column
df_sample['clean_text']

3965943    i, am, stunned, by, what, valve, has, pulled, ...
3831628    people, can, be, rude, and, it, may, seem, a, ...
3899176    pros, :, -, class, variety, -, many, different...
3915021                                   its, a, good, game
3600448    i, liked, this, game, since, i, first, played, it
                                 ...                        
6627576                                           good, game
6626028    why, do, u, guys, have, to, hate, every, thing...
6630862                                                 good
6628347    the, game, does, have, some, bugs, inb, it, bu...
6629891                                                  rip
Name: clean_text, Length: 250000, dtype: object

## modeling

*Attempts to improve on the FSM, which simply recommended the most-reviewed games.*

In [15]:
# hold out 30% of the sampled reviews for testing; the fixed seed keeps the split repeatable
train, test = train_test_split(
    df_sample,
    test_size=0.3,
    random_state=42,
)

In [16]:
def tokenize_text(text):
    """Return the lowercased word tokens of ``text`` that have at least two characters.

    The text is first split into sentences with ``nltk.sent_tokenize`` and every
    sentence is then split into words with ``nltk.word_tokenize``. Tokens shorter
    than two characters are discarded; the rest are lowercased and returned in
    their original order.
    """
    return [
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
        if len(word) >= 2
    ]

In [17]:
def _to_tagged_document(row):
    """Wrap one review row as a TaggedDocument tagged with the row's product_id."""
    return TaggedDocument(words=tokenize_text(row['clean_text']), tags=[row['product_id']])


# build one tagged document per review, for the training and the test split alike
train_tagged = train.apply(_to_tagged_document, axis=1)
test_tagged = test.apply(_to_tagged_document, axis=1)

In [19]:
# inspect a single tagged training document (position 5000)
train_tagged.iloc[5000]

TaggedDocument(words=['if', 'you', 'are', 'even', 'remotely', 'fan', 'of', 'star', 'trek', 'heck', 'if', 'you', 'have', 'even', 'ever', 'see', 'an', 'episode', 'you', 'need', 'to', 'try', 'this', 'game', 'did', "n't", 'try', 'it', 'for', 'the', 'longest', 'time', 'because', 'assumed', 'the', 'producation', 'quality', 'was', 'poor', 'and', 'would', "n't", 'be', 'able', 'to', 'get', 'into', 'it', 'was', 'so', 'wrong', 'can', 'not', 'stop', 'playing', 'this', 'game', 'do', 'yourself', 'favor', 'and', 'give', 'it', 'try', 'it', 'is', 'amazing', 'the', 'customization', 'is', 'incredible', 'and', 'the', 'gameplay', 'is', 'very', 'well', 'done', 'want', 'to', 'be', 'ferengi', 'captain', 'and', 'have', 'crew', 'of', 'nothing', 'but', 'ferengis', 'you', 'can', 'do', 'that', 'want', 'to', 'captain', 'the', 'original', 'enterprise', 'and', 'make', 'character', 'that', 'looks', 'like', 'captain', 'kirk', 'you', 'can', 'do', 'that', 'too', 'long', 'story', 'short', 'try', 'this', 'game', 'and', 'pu

In [20]:
import multiprocessing
# CPU count reported for this machine; passed as the `workers` argument when the Doc2Vec model is built
cores = multiprocessing.cpu_count()

In [21]:
# configure the Doc2Vec model (dm=0 selects gensim's distributed bag-of-words training mode)
model1 = Doc2Vec(
    dm=0,
    vector_size=300,
    negative=5,
    hs=0,
    min_count=2,
    sample=0,
    workers=cores,
)
# build the vocabulary from the tagged training documents, showing a progress bar
model1.build_vocab(list(tqdm(train_tagged.values)))

100%|██████████| 175000/175000 [00:00<00:00, 1700400.08it/s]


In [23]:
# number of tagged training documents
train_tagged.shape[0]

175000

In [25]:
# train for 30 passes over the tagged training documents
model1.train(
    train_tagged,
    total_examples=len(train_tagged),
    epochs=30,
)

In [26]:
# persist the trained model to 'model1.d2v' (a relative path, so it lands in the current working directory)
model1.save('model1.d2v')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
def vector_for_learning(model, input_docs, epochs=20):
    """Infer a feature vector for every tagged document.

    Parameters
    ----------
    model
        Trained model exposing ``infer_vector(words, epochs=...)``.
    input_docs
        Iterable of documents that each provide ``.tags`` and ``.words``.
    epochs : int, default 20
        Number of inference passes per document. Forwarded as ``epochs``
        because the older ``steps`` keyword is deprecated in gensim 3.x and
        no longer accepted by gensim 4.

    Returns
    -------
    tuple
        ``(targets, feature_vectors)``: two equally long tuples holding, for
        each document in order, its first tag and its inferred vector. Both
        are empty tuples when ``input_docs`` is empty.
    """
    targets = []
    feature_vectors = []
    for doc in input_docs:
        targets.append(doc.tags[0])
        feature_vectors.append(model.infer_vector(doc.words, epochs=epochs))
    # Built explicitly rather than via zip(*pairs), which cannot be unpacked
    # into two names when there are no documents.
    return tuple(targets), tuple(feature_vectors)