# Prepare TF-IDF dictionary and corpus for game tags

To engineer a feature that quantifies the similarity between a game's tags and the tags of the games a user currently owns, we have to generate a dictionary and a corpus of words that we can feed to the tf-idf model. The model vectorizes word representations based on term frequency in each document (here, each game's set of tags). We can then use these vectorized representations to compute how similar two different document vectors are.

In [None]:
import ast
import json
import logging
import os
import re

import pandas as pd
import psycopg2
from gensim import corpora, models, similarities
from nltk.tokenize import word_tokenize
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
# Load the game dictionary JSON from disk. The context manager closes the file
# handle deterministically (a bare open(...).read() leaves that to the garbage
# collector). JSON is UTF-8 by specification, so the encoding is stated
# explicitly instead of relying on the locale default.
with open("/home/iain/gameDict.json", encoding="utf-8") as game_dict_file:
    json_data = game_dict_file.read()  # raw JSON text, kept as a notebook-level name
jsonFile = json.loads(json_data)

# Configure root logging at INFO level with timestamped messages; this is what
# surfaces the library progress logs shown in the cell outputs below.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [94]:
# Define SQL database info.
# Every setting can be overridden through the standard PG* environment
# variables so real credentials never need to be written into the notebook;
# the fallbacks are the original placeholder values.
db_name  = os.environ.get('PGDATABASE', 'UserInfo')
username = os.environ.get('PGUSER', 'username')
host     = os.environ.get('PGHOST', 'localhost')
pwd      = os.environ.get('PGPASSWORD', 'pasword')
port     = os.environ.get('PGPORT', '5432')
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(username, pwd, host, port, db_name))
# repr() of a SQLAlchemy URL masks the password; printing the URL object
# directly can write the clear-text password into the saved cell output.
print(repr(engine.url))

# Connect to the database. The port is passed explicitly so the psycopg2
# connection targets the same server as the SQLAlchemy engine above.
con = psycopg2.connect(database=db_name, user=username, password=pwd, host=host, port=port)
cur = con.cursor()  # get a cursor to our current connection

postgresql://iain:***@localhost:5432/UserInfo


## Query game data

In [95]:
# Fetch every row of allGames whose tags column is populated
create_table_sql = """
SELECT * FROM allGames WHERE tags IS NOT NULL;
"""
games = pd.read_sql_query(sql=create_table_sql, con=con)

## Tokenize game tags

In [96]:
# Build one space-joined "document" per game from its stored tag list.
# Each entry of the tags column is the string form of a Python list, so it is
# parsed with ast.literal_eval before the tags are joined.
allGameTags = games['tags'].values.tolist()
documents = [' '.join(ast.literal_eval(tag_string)) for tag_string in allGameTags]
# Tokenize every document and lower-case each resulting token
texts = [list(map(str.lower, word_tokenize(doc))) for doc in documents]

## Get dictionary from tags

In [97]:
# Build the token -> integer id mapping from the tokenized tag lists
dictionary = corpora.Dictionary(documents=texts)
# Persist the mapping to disk so it can be reloaded later
dictionary.save('/home/gameTagDict.dict')
print(dictionary)

2017-09-17 13:29:35,330 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-09-17 13:29:35,521 : INFO : built Dictionary(335 unique tokens: ['action', 'fps', 'multiplayer', 'shooter', 'classic']...) from 4722 documents (total 34135 corpus positions)
2017-09-17 13:29:35,524 : INFO : saving Dictionary object under /home/iain/Documents/InsghtProject/gameTagDict.dict, separately None
2017-09-17 13:29:35,526 : INFO : saved /home/iain/Documents/InsghtProject/gameTagDict.dict


Dictionary(335 unique tokens: ['action', 'fps', 'multiplayer', 'shooter', 'classic']...)


## Get corpus from tags

In [98]:
# Convert each game's token list into a bag-of-words vector via the dictionary
corpus = list(map(dictionary.doc2bow, texts))
# Write the corpus to disk in Matrix Market format
corpora.MmCorpus.serialize(fname='/home/gameTagCorpus.mm', corpus=corpus)

2017-09-17 13:29:39,104 : INFO : storing corpus in Matrix Market format to /home/iain/Documents/InsghtProject/gameTagCorpus.mm
2017-09-17 13:29:39,111 : INFO : saving sparse matrix to /home/iain/Documents/InsghtProject/gameTagCorpus.mm
2017-09-17 13:29:39,114 : INFO : PROGRESS: saving document #0
2017-09-17 13:29:39,162 : INFO : PROGRESS: saving document #1000
2017-09-17 13:29:39,198 : INFO : PROGRESS: saving document #2000
2017-09-17 13:29:39,237 : INFO : PROGRESS: saving document #3000
2017-09-17 13:29:39,258 : INFO : PROGRESS: saving document #4000
2017-09-17 13:29:39,272 : INFO : saved 4722x335 matrix, density=2.156% (34113/1581870)
2017-09-17 13:29:39,273 : INFO : saving MmCorpus index to /home/iain/Documents/InsghtProject/gameTagCorpus.mm.index


## Initialize tf-idf model

In [8]:
# Fit a tf-idf (term frequency / inverse document frequency) model on the
# bag-of-words corpus
tfidf = models.TfidfModel(corpus=corpus)
# Apply the fitted model to the corpus to get its tf-idf weighted view
corpus_tfidf = tfidf[corpus]

2017-09-17 11:21:20,775 : INFO : collecting document frequencies
2017-09-17 11:21:20,777 : INFO : PROGRESS: processing document #0
2017-09-17 11:21:20,794 : INFO : calculating IDF weights for 4722 documents and 334 features (34113 matrix non-zeros)


## Initialize similarity index

In [9]:
# Build a similarity index over the tf-idf weighted corpus. The index is
# started under the '/home/' output prefix, and the feature count is the
# number of tokens in the dictionary.
sims = similarities.Similarity(
    output_prefix='/home/',
    corpus=tfidf[corpus],
    num_features=len(dictionary),
)

2017-09-17 11:21:25,058 : INFO : starting similarity index under /home/iain/Documents/InsghtProject/


## Example use case:

In [12]:
# Try the model on a hypothetical game described by a handful of tags.
# Step 1: tokenize the tag string and lower-case every token
query_doc = list(map(str.lower, word_tokenize("casual space indie FPS early access ")))
print(query_doc)
# Step 2: map the tokens onto the ids of the dictionary built from all games
query_doc_bow = dictionary.doc2bow(query_doc)
print(query_doc_bow)
# Step 3: weight the bag-of-words vector with the fitted tf-idf model
query_doc_tf_idf = tfidf[query_doc_bow]
print(query_doc_tf_idf)
# Step 4: query the similarity index; the slice shows the first ten entries
# of the result, not the ten highest scores
query_similarity = sims[query_doc_tf_idf]
print(query_similarity[:10])

['casual', 'space', 'indie', 'fps', 'early', 'access']
[(1, 1), (29, 1), (30, 1), (40, 1)]
[(1, 0.6525071878230797), (29, 0.24677895019256244), (30, 0.7100944028664118), (40, 0.09539632382260485)]
[ 0.09971548  0.1439738   0.          0.36022088  0.          0.13437054
  0.13437054  0.          0.          0.00299126]
