In [1]:
import pandas as pd
import gzip
import json
import re
import os
import pickle
import nltk
import re
import multiprocessing

from nltk.corpus import stopwords

nltk.download('words')
nltk.download('punkt')

[nltk_data] Downloading package words to
[nltk_data]     /Users/kyledecember1/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kyledecember1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from tqdm import tqdm
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
from sklearn.metrics import accuracy_score, f1_score

import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
cores = multiprocessing.cpu_count()

def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens

In [4]:
# assign paths for csv data

reviews_path = os.path.join(os.pardir, os.pardir, 'data/reviews.csv')
games_path = os.path.join(os.pardir, os.pardir, 'data/games.csv')

In [5]:
#import df_games 
#drop null values
# create product_id column in df_games based on str formatted id column

df_games = pd.read_csv(games_path)
df_games['id'].dropna(inplace=True)
df_games['product_id'] = df_games['id'].astype(int).astype(str)

In [4]:
# create dataframes from csv files

df_reviews = pd.read_csv(reviews_path)
df_games = pd.read_csv(games_path)

In [5]:
# drop unnecessary columns

df_reviews.drop(['found_funny', 'compensation', 'user_id', 'Unnamed: 0', 'products', 'page_order',\
                'date', 'early_access', 'page'], axis=1, inplace=True)

In [6]:
# create a frequency column based on product_id, sort by said column

df_reviews['freq'] = df_reviews.groupby('product_id')['product_id'].transform('count')
df_reviews.sort_values(by=['freq', 'product_id'], ascending=[False, True], inplace=True)

In [7]:
# remove null values

df_reviews.dropna(inplace=True)

In [8]:
# remove reviews by users that had under 1 hour played for the game
# remove games that have less than 500 total reviews

df_reviews = df_reviews[df_reviews['hours'] >= 1]
df_reviews = df_reviews[df_reviews['freq'] >= 500]

In [95]:
# convert product_id to strings because Doc2Vec needs strings as Tags

df_reviews['product_id'] = df_reviews['product_id'].astype(str)

KeyboardInterrupt: 

In [11]:
# take subsample of data for text manipulation/modeling purposes

df_sample = df_reviews.sample(axis=0, n=250000)
df_sample.sort_values(by=['freq', 'product_id'], ascending=[False, True], inplace=True)

In [12]:
# make lowercase

df_sample['text'] = df_sample['text'].str.lower()

In [13]:
# remove new line indicators

df_sample['text'] = df_sample['text'].str.replace('\n', ' ')
df_sample['text'] = df_sample['text'].str.replace('.\n', ' ')

In [14]:
# tokenize text

df_sample['tokens'] = df_sample['text'].apply(nltk.word_tokenize)
df_sample['tokens']

3772219                                  [team, fortress, 2]
3930870    [team, fortress, 2, is, everything, you, loved...
3907317                          [hats, and, jungle, update]
3848556                     [this, is, a, blody, good, game]
3882190                                               [:, )]
                                 ...                        
1898007    [where, to, start, ., i, decided, to, write, t...
1902413                               [verry, good, game, !]
1903853                                    [boring, game, (]
1901512    [this, game, sucks, a, lot, ., the, ui, is, la...
1902008                                         [good, game]
Name: tokens, Length: 250000, dtype: object

In [15]:
# join tokens into single string

df_sample['clean_text'] = df_sample['tokens'].apply(', '.join)

In [16]:
df_sample['clean_text']

3772219                                    team, fortress, 2
3930870    team, fortress, 2, is, everything, you, loved,...
3907317                            hats, and, jungle, update
3848556                       this, is, a, blody, good, game
3882190                                                 :, )
                                 ...                        
1898007    where, to, start, ., i, decided, to, write, th...
1902413                                 verry, good, game, !
1903853                                      boring, game, (
1901512    this, game, sucks, a, lot, ., the, ui, is, lag...
1902008                                           good, game
Name: clean_text, Length: 250000, dtype: object

# Model_1

** attempts to improve from FSM that simply recommended the most reviewed games

In [17]:
train, test = train_test_split(df_sample, test_size=0.3, random_state=42)

In [18]:
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens

In [19]:
train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['clean_text']), tags=[r['product_id']]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['clean_text']), tags=[r['product_id']]), axis=1)

In [22]:
import multiprocessing
cores = multiprocessing.cpu_count()

In [23]:
# initiate model

model1 = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model1.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 175000/175000 [00:00<00:00, 1890401.49it/s]


In [24]:
# train model on tagged documents

model1.train(train_tagged, total_examples=len(train_tagged), epochs=30)

In [None]:
# save model

model1.save('model1.d2v')

In [40]:
# create string for testing

sent = 'epic fast paced shooter game, that allows me to customize my character and play with friends'.split(' ')
sent_vec = model1.infer_vector(sent)

In [97]:
# generate recommendations based on most similar vectors

recs = model1.docvecs.most_similar([sent_vec])

In [99]:
# return data related to recommendation

df_sample[df_sample['product_id'] == recs[0][0]]

Unnamed: 0,username,hours,product_id,text,freq,tokens,clean_text
6186197,Doc Bison,1.6,354500,bain really needs a haircut.,690,"[bain, really, needs, a, haircut, .]","bain, really, needs, a, haircut, ."


In [18]:
#import df_games 
#drop null values
# create product_id column in df_games based on str formatted id column

df_games = pd.read_csv(games_path)
df_games['id'].dropna(inplace=True)
df_games['product_id'] = df_games['id'].astype(int).astype(str)

NameError: name 'games_path' is not defined

In [145]:
# start cleaning df_games



In [146]:
# create product_id column in df_games based on str formatted id column



In [142]:
# remove trailing decimal places 

df_games['product_id'] = df_games['product_id'].str.replace('.0', '')

In [152]:
# get game information based on recommendations

df_games[df_games['product_id'] == recs[0][0]]

Unnamed: 0.1,Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,discount_price,reviews_url,specs,price,early_access,id,developer,sentiment,metascore,product_id
27178,27178,,,PAYDAY: The Web Series,PAYDAY: The Web Series,http://store.steampowered.com/app/354500/PAYDA...,2015-03-10,"['Action', 'Heist', 'Violent', 'Free to Play',...",,http://steamcommunity.com/app/354500/reviews/?...,,Free,False,354500.0,,Very Positive,,354500


In [137]:
df_games['product_id']

0        761140.0
1        643980.0
2        670290.0
3        767400.0
4        773570.0
           ...   
32130    773640.0
32131    733530.0
32132    610660.0
32133    658870.0
32134    681550.0
Name: product_id, Length: 32135, dtype: object

(250000, 7)

## Findings - Model_1
    
    using non-aggregated text returns only a singular review, instead of the most similar review among the whole Tag
    
    must return to using aggregated reviews

# Prep for Model 2

In [101]:
testdict = {440:'the most popular game',
           26: 'this is my bday',
           1492: 'columbus sailed the ocean blue',
           87: 'four score and seven years ago'}

In [109]:
list(testdict.keys())

[440, 26, 1492, 87]

In [124]:
list(testdict.values())

['the most popular game',
 'this is my bday',
 'columbus sailed the ocean blue',
 'four score and seven years ago']

In [110]:
testdf = pd.DataFrame()

In [131]:
testdf['product_id'] = list(testdict.keys())
testdf['reviews'] = list(testdict.values())

In [132]:
testdf.dtypes

product_id     int64
reviews       object
dtype: object

In [126]:
testdf['product_id'] = testdf['product_id'].apply(str)

In [127]:
testdf['product_id']

0     440
1      26
2    1492
3      87
Name: product_id, dtype: object

In [151]:
# code loads the aggregated reviews of the full 7 million reviews

# with open('agg_reviews.p', 'rb') as fp:
#     loaded_file = pickle.load(fp)

dict

## Need to reduce aggregate data size
    creating new aggregated data based on the sample dataframe

In [155]:
# products = list(df_sample['product_id'].unique())

In [156]:
# product_dict = {}
# for product_id in products:
#     product_dict[product_id] = ''

In [159]:
# for key in product_dict:
#     for index, row in df_sample[df_sample['product_id'] == key].iterrows():
#         product_dict[key] = product_dict[key] + ' ' + row['text']

In [160]:
# with open('subsample_agg_reviews.p', 'wb') as fp:
#     pickle.dump(product_dict, fp, protocol=pickle.HIGHEST_PROTOCOL)

    reintroduce df from pickled aggregate texts

In [6]:
with open('subsample_agg_reviews.p', 'rb') as fp:
    loaded_file = pickle.load(fp)

In [7]:
agg_df = pd.DataFrame()

In [8]:
agg_df['product_id'] = list(loaded_file.keys())
agg_df['reviews'] = list(loaded_file.values())

# Model_2

In [9]:
tagged_docs = agg_df.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['reviews']), tags=[r['product_id']]), axis=1)

In [10]:
model2 = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model2.build_vocab([x for x in tqdm(tagged_docs.values)])

100%|██████████| 1996/1996 [00:00<00:00, 524550.80it/s]


In [11]:
model2.train(tagged_docs, total_examples=len(tagged_docs), epochs=30)

In [25]:
model2.save('model2.d2v')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [12]:
sent2 = 'epic fast paced shooter game, that allows me to customize my character and play with friends'
tokes2 = sent2.split(' ')
sent_vec2 = model2.infer_vector(tokes2)

In [13]:
recs2 = model2.docvecs.most_similar([sent_vec2])

In [14]:
recs2[:5]

[('354500', 0.6271480917930603),
 ('413850', 0.6216908693313599),
 ('510050', 0.4521377682685852),
 ('546390', 0.44616881012916565),
 ('250600', 0.4367648661136627)]

In [21]:
df_games[df_games['product_id'] == recs2[0][0]]

Unnamed: 0.1,Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,discount_price,reviews_url,specs,price,early_access,id,developer,sentiment,metascore,product_id
4622,4622,,,CS:GO Player Profiles,CS:GO Player Profiles,http://store.steampowered.com/app/413850/CSGO_...,2015-10-26,"['Free to Play', 'FPS', 'Gaming', 'Shooter', '...",,http://steamcommunity.com/app/413850/reviews/?...,,Free,False,413850.0,,Very Positive,,413850


In [22]:
recs2[0][0]

'413850'

In [15]:
def get_recs(model, user_input, n=5):
    description = user_input.split(' ')
    desc_vec = model.infer_vector(description)
    recs = model.docvecs.most_similar([desc_vec])[:n]
    
    return recs

In [16]:
def show_game_desc(recs, df):
    games = {}
    for rec in recs:
        games[rec[0]] = df[df['product_id'] == rec[0]]['title']
        
    return games

In [34]:
test_sent1 = """
an intense and fast paced rpg that allows me to customize my character and defeat my enemies 
using sorcery and weapons"""

test_sent2 = """
includes strategic and action-packed combat, i want to customize both my character and my kingdom
while conquering my enemies through intrigue
"""

test_sent3 = """
dragons and monsters fight to the death, extremely unforgiving combat, difficult as dark souls
"""

test_sent4 = """
similar to age of empires, where i build my kingdom from the ground up, form alliances and engage in intrigue,
research technological advancements and stand the test of time
"""

test_sent5 = """
turn based strategy game with the complexity of Civilization 5, but where I can also control an individual 
unit within combat
"""

In [23]:
recs3 = get_recs(model2, sent3)

In [24]:
show_game_desc(recs3, df_games)

{'413850': 4622    CS:GO Player Profiles
 Name: title, dtype: object, '354500': 27178    PAYDAY: The Web Series
 Name: title, dtype: object, '250600': 1909    The Plan
 Name: title, dtype: object, '283640': 6184    Salt and Sanctuary
 Name: title, dtype: object, '345180': 4059    Victor Vran ARPG
 Name: title, dtype: object}

In [35]:
all_sents = [test_sent1, test_sent2, test_sent3, test_sent4, test_sent5]

In [27]:
show_game_desc(get_recs(model2, test_sent2), df_games)

{'354500': 27178    PAYDAY: The Web Series
 Name: title, dtype: object, '413850': 4622    CS:GO Player Profiles
 Name: title, dtype: object, '250600': 1909    The Plan
 Name: title, dtype: object, '243950': 1510    Divinity: Dragon Commander
 Name: title, dtype: object, '346250': 27536    The Old Tree
 Name: title, dtype: object}

In [28]:
show_game_desc(get_recs(model2, test_sent3), df_games)

{'205190': 1140    Rocksmith™
 Name: title, dtype: object, '47400': 30944    Stronghold 3 Gold
 Name: title, dtype: object, '285310': 29018    RollerCoaster Tycoon®: Deluxe
 Name: title, dtype: object, '49470': 660    Magic: The Gathering - Duels of the Planeswalk...
 Name: title, dtype: object, '55230': 31113    Saints Row: The Third
 Name: title, dtype: object}

In [36]:
for sent in all_sents:
    print(sent, show_game_desc(get_recs(model2, sent), df_games))


an intense and fast paced rpg that allows me to customize my character and defeat my enemies 
using sorcery and weapons {'413850': 4622    CS:GO Player Profiles
Name: title, dtype: object, '354500': 27178    PAYDAY: The Web Series
Name: title, dtype: object, '250600': 1909    The Plan
Name: title, dtype: object, '283640': 6184    Salt and Sanctuary
Name: title, dtype: object, '387860': 4139    the static speaks my name
Name: title, dtype: object}

includes strategic and action-packed combat, i want to customize both my character and my kingdom
while conquering my enemies through intrigue
 {'354500': 27178    PAYDAY: The Web Series
Name: title, dtype: object, '413850': 4622    CS:GO Player Profiles
Name: title, dtype: object, '250600': 1909    The Plan
Name: title, dtype: object, '243950': 1510    Divinity: Dragon Commander
Name: title, dtype: object, '346250': 27536    The Old Tree
Name: title, dtype: object}

dragons and monsters fight to the death, extremely unforgiving combat, diff

In [37]:
recs_test = model2.docvecs.most_similar(test_sent2)

TypeError: '<' not supported between instances of 'str' and 'int'