# Prepare TF-IDF dictionary and corpus for game tags

To engineer a feature that quantifies the similarity between a game's tags and the tags of the games a user currently owns, we have to generate a dictionary and corpus of words that we can feed to the tf-idf model. The model vectorizes word representations based on term frequency in each document (here, each game's set of tags). We can then use these vectorized representations to compute how similar two different document vectors are.

In [None]:
import logging
import json
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
from gensim import corpora, models, similarities
from nltk.tokenize import word_tokenize
import pandas as pd
import ast
import re
# Read the raw game-dictionary JSON text. The context manager guarantees the
# file handle is closed even if the read raises.
with open("/home/iain/gameDict.json") as game_dict_file:
    json_data = game_dict_file.read()
# Parse the raw text into Python objects.
jsonFile = json.loads(json_data)

# Configure the root logger to show timestamped records at INFO level and above.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# Connection settings for the PostgreSQL database (placeholder credentials).
db_name  = 'UserInfo'
username = 'username'
host     = 'localhost'
pwd      = 'password'
port     = '5432'

# SQLAlchemy engine built from the settings above; print its URL as a sanity check.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(username, pwd, host, port, db_name))
print(engine.url)

# Open a direct psycopg2 connection to the same database.
# Reset `con` first so a failed connect cannot leave an earlier handle bound.
con = None
# Pass `port` explicitly so this connection always targets the same server
# as the SQLAlchemy engine, even if the port setting is changed.
con = psycopg2.connect(database=db_name, user=username, password=pwd, host=host, port=port)
cur = con.cursor()  # cursor bound to the connection opened above

## Query game data

In [None]:
# Load every row of allGames whose tags column is populated into a DataFrame.
# Despite its name, this statement only reads data; it creates no table.
create_table_sql = """
SELECT * FROM allGames WHERE tags IS NOT NULL;
"""
games = pd.read_sql_query(sql=create_table_sql, con=con)

## Tokenize game tags

In [None]:
# Build one whitespace-joined "document" per game from its tag list.
# Each entry of the tags column is the string form of a Python list, so it is
# parsed with ast.literal_eval before its items are joined.
allGameTags = games['tags'].values.tolist()
documents = [' '.join(ast.literal_eval(raw_tags)) for raw_tags in allGameTags]

# Tokenize every document and lower-case each token.
texts = [
    [token.lower() for token in word_tokenize(doc)]
    for doc in documents
]

## Get dictionary from tags

In [None]:
# Build the token -> integer-id mapping from the tokenized tag texts.
dictionary = corpora.Dictionary(documents=texts)
# Persist the mapping to disk so it can be reloaded later.
dictionary.save('/home/gameTagDict.dict')
# Show a summary of the dictionary.
print(dictionary)

## Get corpus from tags

In [None]:
# Convert each tokenized text into its bag-of-words form using the dictionary.
corpus = list(map(dictionary.doc2bow, texts))
# Write the bag-of-words corpus to disk in Matrix Market format.
corpora.MmCorpus.serialize('/home/gameTagCorpus.mm', corpus)

## Initialize tf-idf model

In [None]:
# Fit a tf-idf model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus=corpus)
# Apply the fitted model to the corpus to obtain its tf-idf weighted form.
corpus_tfidf = tfidf[corpus]

## Initialize similarity index

In [None]:
# Build a similarity index over the tf-idf weighted corpus.
# '/home/' is the prefix under which the index stores its files; the feature
# count is the number of entries in the dictionary.
sims = similarities.Similarity(
    output_prefix='/home/',
    corpus=tfidf[corpus],
    num_features=len(dictionary),
)

## Example use case:

In [None]:
# Score a hypothetical game, described only by its tags, against the index.

# Step 1: tokenize the tag string and lower-case every token.
query_doc = [token.lower() for token in word_tokenize("casual space indie FPS early access ")]
print(query_doc)

# Step 2: convert the tokens to a bag-of-words using the dictionary built from all games.
query_doc_bow = dictionary.doc2bow(query_doc)
print(query_doc_bow)

# Step 3: apply the tf-idf model to the bag-of-words.
query_doc_tf_idf = tfidf[query_doc_bow]
print(query_doc_tf_idf)

# Step 4: query the similarity index and show the first ten scores
# (the first ten entries of the result, not the ten highest).
query_similarity = sims[query_doc_tf_idf]
print(query_similarity[:10])