# Predictive Analytics: Milestone 3
#### Joshua Greenert, Gabriel Avinaz, and Mithil Patel
#### DSC630-T301 Predictive Analytics
#### 12/26/2022

In [2]:
# Import the required libaries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [2]:
# Pull in the data.  Each person can have their own path.  Comment out the one that isn't yours if different.
# Josh G
df_steam_reviews = pd.read_csv('../../../../../Downloads/steam_reviews.csv', low_memory=False)
# Gabe A
# df_steam_reviews = pd.read_csv('steam_reviews.csv', low_memory=False)

In [3]:
# Show the head of the dataframe to confirm the data is present.
df_steam_reviews.head(5)

Unnamed: 0.1,Unnamed: 0,app_id,app_name,review_id,language,review,timestamp_created,timestamp_updated,recommended,votes_helpful,...,steam_purchase,received_for_free,written_during_early_access,author.steamid,author.num_games_owned,author.num_reviews,author.playtime_forever,author.playtime_last_two_weeks,author.playtime_at_review,author.last_played
0,0,292030,The Witcher 3: Wild Hunt,85185598,schinese,不玩此生遗憾，RPG游戏里的天花板，太吸引人了,1611381629,1611381629,True,0,...,True,False,False,76561199095369542,6,2,1909.0,1448.0,1909.0,1611343000.0
1,1,292030,The Witcher 3: Wild Hunt,85185250,schinese,拔DIAO无情打桩机--杰洛特!!!,1611381030,1611381030,True,0,...,True,False,False,76561198949504115,30,10,2764.0,2743.0,2674.0,1611386000.0
2,2,292030,The Witcher 3: Wild Hunt,85185111,schinese,巫师3NB,1611380800,1611380800,True,0,...,True,False,False,76561199090098988,5,1,1061.0,1061.0,1060.0,1611384000.0
3,3,292030,The Witcher 3: Wild Hunt,85184605,english,"One of the best RPG's of all time, worthy of a...",1611379970,1611379970,True,0,...,True,False,False,76561199054755373,5,3,5587.0,3200.0,5524.0,1611384000.0
4,4,292030,The Witcher 3: Wild Hunt,85184287,schinese,大作,1611379427,1611379427,True,0,...,True,False,False,76561199028326951,7,4,217.0,42.0,217.0,1610788000.0


### Data Preparation

In [4]:
# Using the language column, we can remove all other languages besides english.
df_steam_reviews = df_steam_reviews[df_steam_reviews['language'] == 'english']

In [5]:
# Drop the columns that we don't need.
df_steam_reviews = df_steam_reviews.drop(['Unnamed: 0', 'review_id', 'language','author.num_games_owned', 'author.last_played' ], axis = 1)

In [6]:
# Make dummies of the columns that can conform.
df_reviews_dummies = pd.get_dummies(df_steam_reviews, columns=['recommended', 'steam_purchase', 'received_for_free',
                                                              'written_during_early_access'])

In [7]:
df_reviews_dummies.head(5)

Unnamed: 0,app_id,app_name,review,timestamp_created,timestamp_updated,votes_helpful,votes_funny,weighted_vote_score,comment_count,author.steamid,...,author.playtime_last_two_weeks,author.playtime_at_review,recommended_False,recommended_True,steam_purchase_False,steam_purchase_True,received_for_free_False,received_for_free_True,written_during_early_access_False,written_during_early_access_True
3,292030,The Witcher 3: Wild Hunt,"One of the best RPG's of all time, worthy of a...",1611379970,1611379970,0,0,0.0,0,76561199054755373,...,3200.0,5524.0,0,1,0,1,1,0,1,0
5,292030,The Witcher 3: Wild Hunt,"good story, good graphics. lots to do.",1611379264,1611379264,0,0,0.0,0,76561198170193529,...,823.0,823.0,0,1,0,1,1,0,1,0
6,292030,The Witcher 3: Wild Hunt,"dis gud,",1611379091,1611379091,0,0,0.0,0,76561198119302812,...,3398.0,4192.0,0,1,0,1,1,0,1,0
18,292030,The Witcher 3: Wild Hunt,favorite game of all time cant wait for the Ne...,1611373086,1611373086,0,0,0.0,0,76561198065591528,...,177.0,23329.0,0,1,0,1,1,0,1,0
20,292030,The Witcher 3: Wild Hunt,Why wouldn't you get this,1611371978,1611371978,0,0,0.0,0,76561198996835044,...,2004.0,8557.0,0,1,0,1,1,0,1,0


In [8]:
# Create the lemmatizer, porter, and codes.
lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()
VERB_CODES = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}

# Define a function that processes the lemmatizing on each sentence.
def process_sentence_lemmatize(text):
    # changing to lower case, tokenizing, and lemmatizing each of the descriptions
    text = str(text).lower()
    temp_sent =[]
    words = word_tokenize(text)
    tags = nltk.pos_tag(words)
    
    # For loop to remove verb portion of words
    for i, word in enumerate(words):
        if tags[i][1] in VERB_CODES: 
            lemmatized = lemmatizer.lemmatize(word, 'v')
        else:
            lemmatized = lemmatizer.lemmatize(word)
        if lemmatized not in stop_words and lemmatized.isalpha():
            temp_sent.append(lemmatized)
            
    # joining tokenized words and removing contractions
    finalsent = ' '.join(temp_sent)
    finalsent = finalsent.replace("n't", " not")
    finalsent = finalsent.replace("'m", " am")
    finalsent = finalsent.replace("'s", " is")
    finalsent = finalsent.replace("'re", " are")
    finalsent = finalsent.replace("'ll", " will")
    finalsent = finalsent.replace("'ve", " have")
    finalsent = finalsent.replace("'d", " would")
    
    return finalsent

# Define a function that performs additional preparation by replacing contractions with full words.
def process_sentence_stemm(text):
    # changing to lower case, tokenizing, and stemm each of the descriptions
    text = str(text).lower()
    temp_sent =[]
    words = word_tokenize(text)
    tags = nltk.pos_tag(words)
    for word in words:
        words = porter.stem(word) 
    words = ' '.join([word for word in words])

    # joining tokenized words and removing contractions
    finalsent = ' '.join(temp_sent)
    finalsent = finalsent.replace("n't", " not")
    finalsent = finalsent.replace("'m", " am")
    finalsent = finalsent.replace("'s", " is")
    finalsent = finalsent.replace("'re", " are")
    finalsent = finalsent.replace("'ll", " will")
    finalsent = finalsent.replace("'ve", " have")
    finalsent = finalsent.replace("'d", " would")
    return finalsent

In [10]:
# Download the required libraries.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Josh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Josh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Josh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Josh\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Josh\AppData\Roaming\nltk_data...


True

## WARNING: This lemmatization takes up to 6 hours to complete.

In [11]:
# Calling the process_sentence_lemmatize function through the apply method.
df_reviews_dummies['prepped_review_lemm'] = df_reviews_dummies.review.apply(process_sentence_lemmatize)
df_reviews_dummies.sample(10)

Unnamed: 0,app_id,app_name,review,timestamp_created,timestamp_updated,votes_helpful,votes_funny,weighted_vote_score,comment_count,author.steamid,...,author.playtime_at_review,recommended_False,recommended_True,steam_purchase_False,steam_purchase_True,received_for_free_False,received_for_free_True,written_during_early_access_False,written_during_early_access_True,prepped_review_lemm
11662534,239140,Dying Light,good game can i get my badge now plz,1561837608,1561837608,0,1,0.0,0,76561198125978430,...,6162.0,0,1,1,0,1,0,1,0,good game get badge plz
8734990,508440,Totally Accurate Battle Simulator,"It`s a great game with lots of units, wacky ph...",1591409179,1591409179,0,0,0.0,0,76561197980810778,...,6825.0,0,1,0,1,1,0,0,1,great game lot unit wacky physic cool map watc...
1553901,4000,Garry's Mod,don't do it,1358385102,1403953474,0,0,0.477454,0,76561198046475532,...,136497.0,1,0,0,1,1,0,1,0,
11723714,239140,Dying Light,"I slept on this game for a whole year, and i'm...",1461730763,1461730763,0,0,0.0,0,76561198046541591,...,620.0,0,1,1,0,1,0,1,0,sleep game whole year glad come sens buy defin...
5424579,782330,DOOM Eternal,superior to the previous one in every possible...,1586116993,1586116993,3,0,0.508857,0,76561197978654658,...,3694.0,0,1,0,1,1,0,1,0,superior previous one every possible way
196239,292030,The Witcher 3: Wild Hunt,The game works perfect with Proton and creates...,1577227250,1577227250,0,0,0.0,0,76561197993576059,...,2060.0,0,1,0,1,1,0,1,0,game work perfect proton create really nice at...
15589221,582010,Monster Hunter: World,"Amazing game, if you find it on sale do yourse...",1562009601,1562009601,0,0,0.0,0,76561198074453016,...,3651.0,0,1,0,1,1,0,1,0,amazing game find sale favor buy upcoming upda...
17674808,626690,Sword Art Online: Fatal Bullet,Customizations are so realistic I m fappin jus...,1519413690,1519413690,1,1,0.521739,0,76561198263533863,...,366.0,0,1,0,1,0,1,1,0,customizations realistic fappin jus lookin cha...
3216993,285190,"Warhammer 40,000: Dawn of War III",Very fun game! Clearly a lot of care and thoug...,1500864135,1500864135,8,1,0.493968,0,76561198056106362,...,1445.0,0,1,1,0,1,0,1,0,fun game clearly lot care think go game clear ...
7812166,252490,Rust,"Before I begin, Rust is not a ""Zombie"" Surviva...",1391806636,1391806636,1,0,0.521739,0,76561198120541302,...,3103.0,0,1,0,1,1,0,0,1,begin rust zombie survival game survival game ...


## Checkpoint File:  File present to ensure you don't have to rerun the 6 hour code above.

In [None]:
df_reviews_dummies.to_csv('Prepped_test_out.csv', index=False)

----------------------------------------------------------------------

## New Start Point: Model Finalization

In [7]:
# Import the required libaries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [15]:
# Pull in the dataframe.

# Josh G
df_reviews_dummies = pd.read_csv('../../../../../Downloads/Prepped_test_out.csv', low_memory=False)
df_reviews = pd.read_csv('../../../../../Downloads/steam_reviews.csv', low_memory=False)
# Gabe
# df_reviews_dummies = pd.read_csv('Prepped_test_out.csv', low_memory=False)
# df_reviews = pd.read_csv('steam_reviews.csv', low_memory=False)

In [16]:
# merge the two columns into one
df_reviews_dummies["recommended"] = df_reviews_dummies['recommended_False'] + df_reviews_dummies['recommended_True']

# Set the user recommendation dataframes.
user_rec_df_reviews = df_reviews[["app_id","author.steamid", "recommended"]].copy()
user_rec_df_prepped = df_reviews_dummies[["app_id","author.steamid", "recommended"]].copy()

In [17]:
# Set the values that will be used for the matrix dimensions.
n_ratings_reviews = len(user_rec_df_reviews)
n_games_reviews = len(user_rec_df_reviews['app_id'].unique())
n_users_reviews = len(user_rec_df_reviews['author.steamid'].unique())

n_ratings_prepped = len(user_rec_df_prepped)
n_games_prepped = len(user_rec_df_prepped['app_id'].unique())
n_users_prepped = len(user_rec_df_prepped['author.steamid'].unique())

# Print the results to confirm they have values.
print(n_ratings_reviews, n_games_reviews, n_users_reviews)
print(n_ratings_prepped, n_games_prepped, n_users_prepped)

21747371 315 12406560
630693 7 575099


In [18]:
# maps idices to users and game IDs
user_map_reviews = dict(zip(np.unique(user_rec_df_reviews["author.steamid"]), list(range(n_users_reviews))))
game_map_reviews = dict(zip(np.unique(user_rec_df_reviews["app_id"]), list(range(n_games_reviews))))

# Confirm that the total reviews equal the n_games_reviews. Should be true, true.
print(len(user_map_reviews) == n_users_reviews)
print(len(game_map_reviews) == n_games_reviews)

user_i_map_reviews = dict(zip(list(range(n_users_reviews)), np.unique(user_rec_df_reviews["author.steamid"])))
game_i_map_reviews = dict(zip(list(range(n_games_reviews)), np.unique(user_rec_df_reviews["app_id"])))

# Perform the same operations on the prepped items.
user_map_prepped = dict(zip(np.unique(user_rec_df_prepped["author.steamid"]), list(range(n_users_prepped))))
game_map_prepped = dict(zip(np.unique(user_rec_df_prepped["app_id"]), list(range(n_games_prepped))))

# Confirm that the total reviews equal the n_games_reviews. Should be true, true.
print(len(user_map_prepped) == n_users_prepped)
print(len(game_map_prepped) == n_games_prepped)

user_i_map_prepped = dict(zip(list(range(n_users_prepped)), np.unique(user_rec_df_prepped["author.steamid"])))
game_i_map_prepped = dict(zip(list(range(n_games_prepped)), np.unique(user_rec_df_prepped["app_id"])))

True
True
True
True


In [20]:
# creating indices for csr_matrix
user_index_reviews = [user_map_reviews[i] for i in user_rec_df_reviews['author.steamid']]
game_index_reviews = [game_map_reviews[i] for i in user_rec_df_reviews['app_id']]

user_index_prepped = user_rec_df_prepped['author.steamid'].map(user_map_prepped, na_action='ignore').fillna(-1).astype(int)
game_index_prepped = [game_map_prepped[i] for i in user_rec_df_prepped['app_id']]

In [22]:
# creates csr matrixes
matrix_reviews = csr_matrix((user_rec_df_reviews["recommended"], (game_index_reviews, user_index_reviews)), shape=(n_games_reviews, n_users_reviews))
matrix_prepped = csr_matrix((user_rec_df_prepped["recommended"], (game_index_prepped, user_index_prepped)), shape=(n_games_prepped, n_users_prepped))

In [25]:
# creating a dictionary to pull the name of games from based on IDs
game_names_reviews = dict(zip(df_reviews['app_id'], df_reviews['app_name']))
game_names_prepped = dict(zip(df_reviews_dummies['app_id'], df_reviews_dummies['app_name']))

In [26]:
'''
This function uses both datasets to determine the best match for returning a list of games.  It uses KNN
along with the cosine function to find similiaries and store them into a list of neighbor ids.  These ids
are then compared for accuracy and the highest matches are returned.

@param game_id: the id that references the game found with Steam.
@param total_matches: the total number of matches to return
'''
def find_related_games(game_id, total_matches):
    
    # Set variables
    reviews_neighbour_ids_with_distance = {}
    prepped_neighbour_ids_with_distance = {}
    
    # Prepare index/vectorizations.
    game_index_reviews = game_map_reviews[game_id]
    game_vector_reviews = matrix_reviews[game_index_reviews]
    
    game_index_prepped = game_map_prepped[game_id]
    game_vector_prepped = matrix_prepped[game_index_prepped]
    
    # Increment total matches.
    total_matches += 1
    
    # Set the KNN model and fit them.
    kNN_reviews = NearestNeighbors(algorithm = 'brute', metric='cosine')
    kNN_reviews.fit(matrix_reviews)
    
    kNN_prepped = NearestNeighbors(algorithm = 'brute', metric='cosine')
    kNN_prepped.fit(matrix_prepped)
    
    # reshape and determine distances for KNN values.
    game_vec_reshaped_reviews = game_vector_reviews.reshape(1,-1)
    distances_reviews, indices_reviews = kNN_reviews.kneighbors(game_vec_reshaped_reviews, n_neighbors=total_matches)
    
    game_vec_reshaped_prepped = game_vector_prepped.reshape(1,-1)
    distances_prepped, indices_prepped = kNN_prepped.kneighbors(game_vec_reshaped_prepped, n_neighbors=total_matches)
    
    # Loop through and flatten the distances provided
    for i in range(0,len(distances_reviews.flatten())):
        n = indices_reviews.flatten()[i]
        neighbour_id = game_i_map_reviews[n]
        reviews_neighbour_ids_with_distance[game_names_reviews[neighbour_id]] = distances_reviews.flatten()[i]
    reviews_neighbour_ids_with_distance.pop(game_names_reviews[game_id], None) # removes the same game
    
    for i in range(0,len(distances_prepped.flatten())):
        n = indices_prepped.flatten()[i]
        neighbour_id = game_i_map_prepped[n]
        prepped_neighbour_ids_with_distance[game_names_prepped[neighbour_id]] = distances_prepped.flatten()[i]
    prepped_neighbour_ids_with_distance.pop(game_names_prepped[game_id], None) 
    
    # Sort the data by accuracy
    sorted_neighbour_ids_with_distance_reviews = sorted(reviews_neighbour_ids_with_distance.items(), key=lambda x: x[1], reverse=True)
    sorted_neighbour_ids_with_distance_prepped = sorted(prepped_neighbour_ids_with_distance.items(), key=lambda x: x[1], reverse=True)
    
    # finalize the list amongst the two created.
    combined_list = sorted_neighbour_ids_with_distance_reviews + sorted_neighbour_ids_with_distance_prepped
    sorted_combined_list = sorted(combined_list, key=lambda x: x[1], reverse=True)
    
    # Print the games and their related accuracy.
    count = 0
    
    print(f"Games related to: {game_names_reviews[game_id]}\n")
    for game_name, accuracy in sorted_combined_list:
        if count == total_matches:
            break
        else:
            print(f"{game_name}: {accuracy:.2f}")

In [29]:
# Set a test game id and run the function.
game_id = 70
total_matches = 10
 
find_related_games(game_id, total_matches)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [28]:
game_map_prepped

{70: 0, 240: 1, 420: 2, 620: 3, 2870: 4, 4000: 5, 292030: 6}

In [None]:
# Create the td-idf vectorizer. 
#tfidfvec = TfidfVectorizer()

#tfidf_reviews = tfidfvec.fit_transform((df_reviews_dummies['prepped_review_lemm'].astype('U')))

# Perform a test using a fixed term.
#search = "Tom Clany's Rainbow Six Siege"
#search_processed = re.sub("[^a-zA-Z0-9 ]","",search.lower())

#vec= tfidfvec.transform([search_processed])

# Add cosine similarity
#similarity = cosine_similarity(tfidf_reviews)
#indices = np.argpartition(similarity, -10)[-10:]

# Set the results.
#results = df_reviews_dummies.iloc[indices]

# Print the resulting dataframe.
#results[::-1]