In [1]:
import pandas as pd
import numpy as np
import pickle

from datetime import datetime, timedelta

%store -r tags_dict
%store -r all_languages

In [2]:
with open('../data/interim/1 - Games DF - Wrangled', 'rb') as file :
    games_df = pickle.load(file)

games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17008 entries, 0 to 17007
Data columns (total 56 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   app_id                       17008 non-null  int64         
 1   title                        17008 non-null  object        
 2   developer                    17008 non-null  object        
 3   publisher                    17008 non-null  object        
 4   description                  17008 non-null  object        
 5   release_date                 17000 non-null  datetime64[ns]
 6   price                        16980 non-null  float64       
 7   price_category               16980 non-null  category      
 8   number_of_reviews            17002 non-null  float64       
 9   positive_review_percent      17002 non-null  float64       
 10  relevant_langs_comments_sum  17008 non-null  int32         
 11  tags                         17008 non-nu

Step 1: Pare Down the Columns
---

In [3]:
# Now we'll make a list of columns we really don't need and take them out.

columns_to_drop = ['title', 'game_page_link', 'developer', 'publisher', \
                   'description', "number_of_reviews", "interface_languages", "full_audio_languages", \
                    "subtitles_languages"]

# We don't need the language count columns any more, either.
columns_to_drop += all_languages

for col in columns_to_drop :
    if col in games_df.columns :
        games_df.drop(col, axis=1, inplace=True)

games_df.columns

Index(['app_id', 'release_date', 'price', 'price_category',
       'positive_review_percent', 'relevant_langs_comments_sum', 'tags',
       'mod_interface_languages', 'mod_full_audio_languages',
       'mod_subtitles_languages', 'language_comment_counts', 'comment_ratios',
       'comment_diff_agnostic', 'comment_diff_any', 'comment_diff_interface',
       'comment_diff_audio', 'comment_diff_subtitles', 'date_scraped'],
      dtype='object')

Step 2: Remove Unusable Rows
---

Let's start by removing records that are unsuitable for our purposes sue to reasons unrelated to feature engineering.

This includes:

- Games that are too recently published to have had time to be localized and/or representative comment counts
- Games with no price info


In [6]:
# Arbitrarily set a cutoff date of 1 year ago
today = datetime.today()
cutoff_date = today - timedelta(weeks=(52))

# Remove 
games_df = games_df[games_df['release_date'] < cutoff_date]
print("Removed games released less than 1 year ago.")
print(f"Games remaining: {len(games_df)}")

Removed games released less than 1 year ago.
Games remaining: 13780


In [7]:
# Remove games with no price info
games_df = games_df[~games_df['price'].isnull()]

print("Removed games with no price info.")
print(f"Games remaining: {len(games_df)}")

Removed games with no price info.
Games remaining: 13757


----------------------

Now let's look at what we have left, and preserve only records that are useful for ML.

Since tags are our main feature, let's remove any games without enough tags. This process must be somewhat arbitrary.

In [8]:
# Identify games with too few tags to be useful.
def refresh_tag_count(df) :
    """
    Looks for a "tags" row in {df} containing lists
    Creates a "tag_count" row in {df} returning the len of that row's list
    Returns {df}
    """

    df['tag_count'] = 0

    for index, row in df.iterrows() :
        df.loc[index, 'tag_count'] = len(row['tags'])

    print(df['tag_count'].value_counts())

    return df

games_df = refresh_tag_count(games_df)

7    12257
5      417
6      407
4      303
3      270
2       87
1       16
Name: tag_count, dtype: int64


In [9]:
# I'll arbitrarily decide that games need to have the maximum of 7 tags to be considered, and drop the rest of the rows.

indices_to_drop = games_df['tag_count'] < 7

games_df = games_df[~indices_to_drop]

print("Removed games with fewer than 7 tags.")
print(f"Games remaining: {len(games_df)}")

Len before: 13757
Removed games with fewer than 7 tags.
Games remaining: 12257


By the same token, tags which are used too infrequently would also lack predictive power.

In [10]:
# First, see how often the tags are used in general.

def get_sorted_tag_frequency(df) :
    """
    Counts the frequency of items in lists of a 'tags' row of {df}.
    Returnes a sorted high-low Series with the items as keys and frequency counts as values.
    Prints the number of tags by frequency at thresholds of 50.
    """
    exploded_tags = df.explode('tags')
    tag_counts = exploded_tags['tags'].value_counts()
    sorted_tag_frequency = tag_counts.sort_values(ascending=False)

    cutoffs = [x for x in range(50, 1001, 50)]

    for cutoff in cutoffs :
        print(f"# of tags used more than {cutoff} times: {len(sorted_tag_frequency[sorted_tag_frequency > cutoff])}")

    return sorted_tag_frequency

sorted_tag_frequency = get_sorted_tag_frequency(games_df)

# of tags used more than 50 times: 232
# of tags used more than 100 times: 172
# of tags used more than 150 times: 136
# of tags used more than 200 times: 101
# of tags used more than 250 times: 84
# of tags used more than 300 times: 71
# of tags used more than 350 times: 59
# of tags used more than 400 times: 51
# of tags used more than 450 times: 42
# of tags used more than 500 times: 38
# of tags used more than 550 times: 30
# of tags used more than 600 times: 28
# of tags used more than 650 times: 26
# of tags used more than 700 times: 26
# of tags used more than 750 times: 24
# of tags used more than 800 times: 21
# of tags used more than 850 times: 18
# of tags used more than 900 times: 17
# of tags used more than 950 times: 15
# of tags used more than 1000 times: 14


In [11]:
# I'll arbitrarily peg the number of times a tag must be used in order
# to be considered in our analysis to 5% of the number of records.
tag_usage_cutoff = int(len(games_df) * 0.05)
print(f'Tags must occur {tag_usage_cutoff} times to be considered.')

# Now let's get a list of the usable tags.
usable_tags = sorted_tag_frequency[sorted_tag_frequency > tag_usage_cutoff].keys()
print(f'There are {len(usable_tags)} such tags.')

Tags must occur 612 times to be considered.
There are 26 such tags.


In [28]:
# Now let's see how many records how many of those usable tags.
games_df['usable_tag_count'] = 0

for index, row in games_df.iterrows() :
    for tag in row['tags'] :
        if tag in usable_tags :
            games_df.loc[index, 'usable_tag_count'] += 1

# Display how many records have how many usable tags.
usable_tag_counts = games_df['usable_tag_count'].value_counts().sort_index()

usable_tag_counts

0     326
1    1063
2    2297
3    3170
4    2885
5    1767
6     665
7      84
Name: usable_tag_count, dtype: int64

In [35]:
# Cumulative tag counts would be easier to process visually...

cumulative_usable_tag_counts = pd.Series()
total_count = len(games_df)

for tag_count in usable_tag_counts.index :
    running_sum = 0
    for i in range(tag_count) :
        running_sum += usable_tag_counts[i + 1]
    usable_value = total_count - running_sum
    index = str(tag_count)
    cumulative_usable_tag_counts[index] = usable_value

cumulative_usable_tag_counts = cumulative_usable_tag_counts[::-1]

for index, value in cumulative_usable_tag_counts.items() :
    print(f"{value} games have at least {index} usable tag(s).")

326 games have at least 7 usable tag(s).
410 games have at least 6 usable tag(s).
1075 games have at least 5 usable tag(s).
2842 games have at least 4 usable tag(s).
5727 games have at least 3 usable tag(s).
8897 games have at least 2 usable tag(s).
11194 games have at least 1 usable tag(s).
12257 games have at least 0 usable tag(s).


  cumulative_usable_tag_counts = pd.Series()


In [None]:
# There's a huge dropoff after 5 usable tags.
# Let's drop every game that doesn't have at least 5 usable tags.

rows_with_at_least_one_usable_tag = games_df['usable_tag_count'] > 4

games_df = games_df[rows_with_at_least_one_usable_tag]

games_df.info()

In [None]:
# Now let's see if we have any "unusable" tags remaining in our dataset:

used_tag_set = set()

for tag_list in games_df['tags'] :
    for tag in tag_list :
        used_tag_set.add(tag)

mismatched_tags = [tag for tag in used_tag_set if tag not in usable_tags]

len(mismatched_tags)

In [None]:
# I guess we better remove them!

for index, row in games_df.iterrows() :
    row_tags = row['tags']
    for tag in row_tags :
        if tag in mismatched_tags :
            row_tags.remove(tag)
            games_df.at[index, 'tags'] = row_tags

# Now let's see our tag counts per game...

games_df = refresh_tag_count(games_df)

print(len(games_df))

After all that, our index is a mess. Let's fix it.

In [None]:
# Our index has gotten messed up by deleting rows. Let's fix that.

games_df = games_df.reset_index()

In [None]:
games_df.info()

Step 4: Remove Games with Too Few Comments
---
I'll determine an arbitrary threshold for average comments in all languages with comments, and drop games beneath that

In [None]:
comment_counts = games_df['relevant_langs_comments_sum']
ordered_comment_counts = comment_counts.sort_values(ascending=False)
ordered_comment_counts

cutoffs = range(0, 10000, 500)

for cutoff in cutoffs :
    print(f"Games with more than {cutoff} comments: {len(ordered_comment_counts[ordered_comment_counts > cutoff])}")

In [None]:
# I'll arbitrarily select 500 as the minimum number of comment that qualify a game for analysis.

games_df = games_df[games_df['relevant_langs_comments_sum'] > 500]

games_df.info()

Step 5: Remove rows unqualified for other reasons
---
- Games released too recently will have unbalanced comment counts.
- We can remove games with no price information, for consistency.

In [None]:
# Clean our index.

games_df = games_df.reset_index()

Prepare for Modeling
---

In [None]:
# Let's do a silly one-hot
for tag in usable_tags :
    games_df[tag] = 0
    for index, row in games_df.iterrows() :
        if tag in row['tags'] :
            games_df.loc[index, tag] = 1


In [None]:
# Normalize our other main predicter variable...
mean = games_df['price'].mean()
std = games_df['price'].std()

games_df['price'] = (games_df['price'] - mean) / std

In [None]:
# Output a subset of the df that will be used for modeling

columns_to_drop = ["price_category", "positive_review_percent", "relevant_langs_comments_sum", \
                    "tags", "language_comment_counts", "date_scraped", "tag_count", "usable_tag_count", \
                    "comment_ratios", "mod_interface_languages", "mod_full_audio_languages", "mod_subtitles_languages"]

games_df = games_df.drop(columns=columns_to_drop)

In [None]:
# Because selecting our specific target will also be something we play with in modeling, we'll do that in the modeling notebook!
# But we can still split things into X-inverses and y-iverses

target_cols = ['comment_diff_agnostic', 'comment_diff_any', 'comment_diff_interface', 'comment_diff_audio', 'comment_diff_subtitles', 'release_date']

X = games_df.drop(columns=target_cols)
y = games_df[target_cols]

Save & Quit
---

In [None]:
with open('../data/processed/2 - Games DF - PreProcessed Features', 'wb') as file :
    pickle.dump(X, file)

with open('../data/processed/2 - Games DF - PreProcessed Targets', 'wb') as file :
    pickle.dump(y, file)

-----------------------