In [83]:
import pandas as pd
import numpy as np
import json

import matplotlib.pyplot as plt
import seaborn as sns

%store -r tags_dict
%store -r all_languages

In [84]:
games_df = pd.read_json('../data/interim/1 - Games DF - Wrangled.json')

games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9962 entries, 0 to 9961
Data columns (total 26 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   app_id                      9962 non-null   int64  
 1   title                       9962 non-null   object 
 2   developer                   9962 non-null   object 
 3   publisher                   9962 non-null   object 
 4   description                 9962 non-null   object 
 5   release_date                9956 non-null   float64
 6   price                       9345 non-null   float64
 7   price_category              9345 non-null   object 
 8   number_of_reviews           9958 non-null   float64
 9   positive_review_percent     9958 non-null   float64
 10  relevant_langs_reviews_sum  9962 non-null   int64  
 11  tags                        9962 non-null   object 
 12  interface_languages         9962 non-null   object 
 13  mod_interface_languages     9962 

In [85]:
# Fix the release date

games_df['release_date'] = pd.to_datetime(games_df['release_date'], infer_datetime_format=True, format="%Y-%m-%d", errors='coerce')

print(games_df.loc[0, 'release_date'])

1970-01-01 00:22:25.507200


Step 1: Pare Down the Columns
---

In [77]:
# Now we'll make a list of columns we really don't need and take them out.

columns_to_drop = ['title', 'positive_review_percent', 'game_page_link', 'developer', 'publisher', \
                   'description', "number_of_reviews", "interface_languages", "full_audio_languages", \
                    "subtitles_languages", "relevant_langs_reviews_sum"]

# We don't need the language count columns any more, either.
columns_to_drop += all_languages

for col in columns_to_drop :
    if col in games_df.columns :
        games_df.drop(col, axis=1, inplace=True)

games_df.columns

Index(['app_id', 'release_date', 'price', 'price_category', 'tags',
       'mod_interface_languages', 'mod_full_audio_languages',
       'mod_subtitles_languages', 'language_comment_counts', 'comment_ratios',
       'comment_diff_agnostic', 'comment_diff_any', 'comment_diff_interface',
       'comment_diff_audio', 'comment_diff_subtitles'],
      dtype='object')

Step 2: Remove Unusable Rows
---
Decide which games have too few tags to be useful for the ML process and remove them from our set.

In [78]:
# Identify games with too few tags to be useful.

def refresh_tag_count(df) :
    """
    Looks for a "tags" row in {df} containing lists
    Creates a "tag_count" row in {df} returning the len of that row's list
    Returns {df}
    """

    df['tag_count'] = 0

    for index, row in df.iterrows() :
        df.loc[index, 'tag_count'] = len(row['tags'])

    print(df['tag_count'].value_counts())

    return df

games_df = refresh_tag_count(games_df)

7    9558
6     130
5     121
4      72
3      50
2      26
1       5
Name: tag_count, dtype: int64


In [79]:
# I'll arbitrarily decide that games need to have the maximum of 7 tags to be considered, and drop the rest of the rows.

print("Len before: "+str(len(games_df)))

indices_to_drop = games_df['tag_count'] < 7

games_df = games_df[~indices_to_drop]

print("Len after: "+str(len(games_df)))

Len before: 9962
Len after: 9558


Step 3: Remove Unusable Tags
---
Decide wich tags are used too infrequently to be useful features.

In [80]:
# First, see how often the tags are used in general.

def get_sorted_tag_frequency(df) :
    """
    Counts the frequency of items in lists of a 'tags' row of {df}.
    Returnes a sorted high-low Series with the items as keys and frequency counts as values.
    Prints the number of tags by frequency at thresholds of 50.
    """
    exploded_tags = df.explode('tags')
    tag_counts = exploded_tags['tags'].value_counts()
    sorted_tag_frequency = tag_counts.sort_values(ascending=False)

    cutoffs = [x for x in range(50, 1001, 50)]

    for cutoff in cutoffs :
        print(f"# of tags used more than {cutoff} times: {len(sorted_tag_frequency[sorted_tag_frequency > cutoff])}")

    return sorted_tag_frequency

sorted_tag_frequency = get_sorted_tag_frequency(games_df)

# of tags used more than 50 times: 212
# of tags used more than 100 times: 151
# of tags used more than 150 times: 114
# of tags used more than 200 times: 84
# of tags used more than 250 times: 74
# of tags used more than 300 times: 58
# of tags used more than 350 times: 47
# of tags used more than 400 times: 37
# of tags used more than 450 times: 31
# of tags used more than 500 times: 26
# of tags used more than 550 times: 26
# of tags used more than 600 times: 21
# of tags used more than 650 times: 18
# of tags used more than 700 times: 18
# of tags used more than 750 times: 15
# of tags used more than 800 times: 13
# of tags used more than 850 times: 13
# of tags used more than 900 times: 11
# of tags used more than 950 times: 11
# of tags used more than 1000 times: 9


In [81]:
# I'll arbitrarily decide that a tag must be used at least 100 times to be considered.
# This corresponds to the first 151 items in the sorted_tags_count Series.
# If we remove all other tags, how much usable data would we have left? Let's check.

# First, get a list of the usable tags.
usable_tags = sorted_tag_frequency[:151]
usable_tags = list(usable_tags.keys())

# Then, record how many usable tags each remaining game has.
games_df['usable_tag_count'] = 0

for index, row in games_df.iterrows() :
    for tag in row['tags'] :
        if tag in usable_tags :
            games_df.loc[index, 'usable_tag_count'] += 1

# Display how many records have how many usable tags.
games_df['usable_tag_count'].value_counts()

7    3854
6    3531
5    1558
4     480
3     109
2      20
1       5
0       1
Name: usable_tag_count, dtype: int64

In [None]:
# There's a huge dropoff after 5 usable tags.
# Let's drop every game that doesn't have at least 5 usable tags.

rows_with_at_least_one_usable_tag = games_df['usable_tag_count'] > 4

games_df = games_df[rows_with_at_least_one_usable_tag]

games_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8943 entries, 1 to 9961
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   app_id                    8943 non-null   int64  
 1   release_date              8938 non-null   float64
 2   price                     8385 non-null   float64
 3   price_category            8385 non-null   object 
 4   tags                      8943 non-null   object 
 5   mod_interface_languages   8943 non-null   object 
 6   mod_full_audio_languages  8943 non-null   object 
 7   mod_subtitles_languages   8943 non-null   object 
 8   language_comment_counts   8943 non-null   object 
 9   comment_ratios            8943 non-null   object 
 10  comment_diff_agnostic     8943 non-null   object 
 11  comment_diff_any          8943 non-null   object 
 12  comment_diff_interface    8943 non-null   object 
 13  comment_diff_audio        8943 non-null   object 
 14  comment_

In [None]:
# Now let's see if we have any "unusable" tags remaining in our dataset:

used_tag_set = set()

for tag_list in games_df['tags'] :
    for tag in tag_list :
        used_tag_set.add(tag)

mismatched_tags = [tag for tag in used_tag_set if tag not in usable_tags]

len(mismatched_tags)

268

In [None]:
# I guess we better remove them!

for index, row in games_df.iterrows() :
    row_tags = row['tags']
    for tag in row_tags :
        if tag in mismatched_tags :
            row_tags.remove(tag)
            games_df.at[index, 'tags'] = row_tags

# Now let's see our tag counts per game...

games_df = refresh_tag_count(games_df)

print(len(games_df))

7    3854
6    3531
5    1558
Name: tag_count, dtype: int64
8943


All of our rows and tags now match our arbitrary criteria of:
- Every row has at least 5 tags
- Every tag is used at least 100 times overall

And we still have 8,943 usable records! Not bad.

In [None]:
# Our index has gotten messed up by deleting rows. Let's fix that.

games_df = games_df.reset_index()

Step 4: Remove rows unqualified for other reasons
---
- Games released too recently will have unbalanced comment counts.
- We can remove games with no price information, for consistency.

In [None]:
print(pd.to_datetime(games_df.loc[0, 'release_date']))

1970-01-01 00:28:11.020800


-----------------------