# Prepare TF-IDF dictionary and corpus for game descriptions

To engineer a feature that quantifies the similarity between a game's description and the descriptions of the games a user currently owns, we have to generate a dictionary and a corpus of words that we can feed to the tf-idf model. The model vectorizes word representations based on term frequency in each document (or description, in this case). We can then use these vectorized representations to compute how similar two different document vectors are.

In [None]:
import logging
import json
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
from gensim import corpora, models, similarities
from nltk.tokenize import word_tokenize
import pandas as pd
import ast
import re
# Load the game dictionary from disk. The context manager closes the file
# handle as soon as its contents have been read, instead of leaving it open
# until garbage collection.
with open("/home/iain/gameDict.json") as game_dict_file:
    json_data = game_dict_file.read()
jsonFile = json.loads(json_data)

# Configure root logging: "<timestamp> : <level> : <message>", INFO and above.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# Define SQL database info
db_name  = 'UserInfo'
username = 'username'
host     = 'localhost'
pwd      = 'password'
port     = '5432'

# SQLAlchemy engine built from the settings above
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(username, pwd, host, port, db_name))
print(engine.url)

# Open a raw psycopg2 connection to the same database. The port is passed
# explicitly so this connection always targets the same server as `engine`,
# even when `port` is changed from the PostgreSQL default.
con = psycopg2.connect(database=db_name, user=username, password=pwd, host=host, port=port)
cur = con.cursor()  # get a cursor to our current connection

## Query game data

In [None]:
# Fetch every row of allGames whose detailed description is not NULL.
# Note: despite its name, create_table_sql holds a SELECT statement.
create_table_sql = "\nSELECT * FROM allGames WHERE detailedescription IS NOT NULL;\n"
games = pd.read_sql_query(sql=create_table_sql, con=con)

## Define function to preprocess game descriptions

In [None]:
# Words dropped from every description because they do not convey meaning.
# Built once at definition time rather than on every call.
_DESCRIPTION_STOPWORDS = frozenset(
    'for a of the and to in on is or be as where it its at an - with by'.split()
)


def preprocess_descriptions(description):
    """Turn a raw game description into a list of lowercase word tokens.

    Steps: collapse all whitespace runs (so tags that span line breaks can be
    matched), strip ``<...>`` markup, delete commas and periods, lowercase,
    split on whitespace and drop the stop words.

    Args:
        description: The description text. Anything that is not a string
            (for example ``None`` or a NaN float from a DataFrame column)
            is treated as an empty description.

    Returns:
        A list of tokens; ``[]`` when the input is not a string or contains
        no usable words.
    """
    try:
        # remove line breaks and tabs; raises TypeError for non-string input
        description = re.sub(r"\s+", " ", description)
    except TypeError:
        return []
    # Replace tags with a space rather than nothing, so that words separated
    # only by markup ("Fast<br>cars") are not fused into a single token.
    description = re.sub(r"<.*?>", " ", description)
    # remove commas and periods
    description = description.replace(",", "").replace(".", "")
    # Split text into a list of words
    return [
        word
        for word in description.lower().split()
        if word not in _DESCRIPTION_STOPWORDS
    ]

## Make descriptions into lists of words

In [None]:
# Pull the raw description column out of the query result and tokenise each
# entry, giving one word list per game in row order
raw_descriptions = games["detailedescription"].tolist()
gameDescriptions = list(map(preprocess_descriptions, raw_descriptions))

## Get dictionary

In [None]:
# Build the gensim dictionary from the preprocessed game descriptions
dictionary = corpora.Dictionary(documents=gameDescriptions)
# Persist the dictionary so it can be reloaded later without rebuilding
dictionary_path = '/home/gameDescriptionDict.dict'
dictionary.save(dictionary_path)
print(dictionary)

## Get corpus

In [None]:
# Convert every tokenised description into its bag-of-words representation
corpus = list(map(dictionary.doc2bow, gameDescriptions))
# Write the corpus to disk
corpus_path = '/home/gameDescriptionCorpus.mm'
corpora.MmCorpus.serialize(corpus_path, corpus)

## Initialize a tf-idf model

In [None]:
# Fit a tf-idf (term frequency / inverse document frequency) model on the
# bag-of-words corpus
tfidf = models.TfidfModel(corpus=corpus)
# Apply the fitted model to the corpus it was trained on
corpus_tfidf = tfidf[corpus]

## Initialize similarity index

In [None]:
# Build a similarity index over the tf-idf-weighted corpus; '/home/' is the
# output prefix handed to gensim, and the feature count is the dictionary size
num_terms = len(dictionary)
sims = similarities.Similarity(
    output_prefix='/home/',
    corpus=tfidf[corpus],
    num_features=num_terms,
)

## Example use case:

In [None]:
# Now let's test the model on a hypothetical game description
query = "Fly to mars in a space ship and partake in warfare with some terrorist with your teammates and complete strategic missions."
# Tokenise the query with the same routine that produced the corpus, so that
# its tokens line up with the dictionary entries (same lowercasing,
# punctuation stripping and stop-word removal)
query_doc = preprocess_descriptions(query)
print(query_doc)
# Compare to dictionary we computed on all other games
query_doc_bow = dictionary.doc2bow(query_doc)
print(query_doc_bow)
# Vectorize word representation
query_doc_tf_idf = tfidf[query_doc_bow]
print(query_doc_tf_idf)
# Similarity of the query against every indexed description
query_similarity = sims[query_doc_tf_idf]
# NOTE: this prints the scores of the first ten documents in the index,
# not the ten best matches
print(query_similarity[:10])