In [1]:
import pandas as pd
import numpy as np
import pickle

from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

%store -r tags_dict
%store -r all_languages

In [2]:
with open('../data/interim/1 - Games DF - Wrangled', 'rb') as file :
    games_df = pickle.load(file)

games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14029 entries, 0 to 14028
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   app_id                      14029 non-null  int64         
 1   title                       14029 non-null  object        
 2   developer                   14029 non-null  object        
 3   publisher                   14029 non-null  object        
 4   description                 14029 non-null  object        
 5   release_date                14023 non-null  datetime64[ns]
 6   price                       14002 non-null  float64       
 7   price_category              14002 non-null  category      
 8   number_of_reviews           14025 non-null  float64       
 9   positive_review_percent     14025 non-null  float64       
 10  relevant_langs_reviews_sum  14029 non-null  int32         
 11  tags                        14029 non-null  object    

Step 1: Pare Down the Columns
---

In [3]:
# Now we'll make a list of columns we really don't need and take them out.

columns_to_drop = ['title', 'game_page_link', 'developer', 'publisher', \
                   'description', "number_of_reviews", "interface_languages", "full_audio_languages", \
                    "subtitles_languages"]

# We don't need the language count columns any more, either.
columns_to_drop += all_languages

for col in columns_to_drop :
    if col in games_df.columns :
        games_df.drop(col, axis=1, inplace=True)

games_df.columns

Index(['app_id', 'release_date', 'price', 'price_category',
       'positive_review_percent', 'relevant_langs_reviews_sum', 'tags',
       'mod_interface_languages', 'mod_full_audio_languages',
       'mod_subtitles_languages', 'language_comment_counts', 'comment_ratios',
       'comment_diff_agnostic', 'comment_diff_any', 'comment_diff_interface',
       'comment_diff_audio', 'comment_diff_subtitles', 'date_scraped'],
      dtype='object')

Step 2: Remove Unusable Rows
---
Decide which games have too few tags to be useful for the ML process and remove them from our set.

In [4]:
# Identify games with too few tags to be useful.

def refresh_tag_count(df) :
    """
    Looks for a "tags" row in {df} containing lists
    Creates a "tag_count" row in {df} returning the len of that row's list
    Returns {df}
    """

    df['tag_count'] = 0

    for index, row in df.iterrows() :
        df.loc[index, 'tag_count'] = len(row['tags'])

    print(df['tag_count'].value_counts())

    return df

games_df = refresh_tag_count(games_df)

7    13013
6      317
5      277
4      186
3      173
2       54
1        9
Name: tag_count, dtype: int64


In [5]:
# I'll arbitrarily decide that games need to have the maximum of 7 tags to be considered, and drop the rest of the rows.

print("Len before: "+str(len(games_df)))

indices_to_drop = games_df['tag_count'] < 7

games_df = games_df[~indices_to_drop]

print("Len after: "+str(len(games_df)))

Len before: 14029
Len after: 13013


Step 3: Remove Unusable Tags
---
Decide wich tags are used too infrequently to be useful features.

In [6]:
# First, see how often the tags are used in general.

def get_sorted_tag_frequency(df) :
    """
    Counts the frequency of items in lists of a 'tags' row of {df}.
    Returnes a sorted high-low Series with the items as keys and frequency counts as values.
    Prints the number of tags by frequency at thresholds of 50.
    """
    exploded_tags = df.explode('tags')
    tag_counts = exploded_tags['tags'].value_counts()
    sorted_tag_frequency = tag_counts.sort_values(ascending=False)

    cutoffs = [x for x in range(50, 1001, 50)]

    for cutoff in cutoffs :
        print(f"# of tags used more than {cutoff} times: {len(sorted_tag_frequency[sorted_tag_frequency > cutoff])}")

    return sorted_tag_frequency

sorted_tag_frequency = get_sorted_tag_frequency(games_df)

# of tags used more than 50 times: 246
# of tags used more than 100 times: 182
# of tags used more than 150 times: 144
# of tags used more than 200 times: 113
# of tags used more than 250 times: 89
# of tags used more than 300 times: 78
# of tags used more than 350 times: 64
# of tags used more than 400 times: 55
# of tags used more than 450 times: 51
# of tags used more than 500 times: 41
# of tags used more than 550 times: 36
# of tags used more than 600 times: 30
# of tags used more than 650 times: 29
# of tags used more than 700 times: 27
# of tags used more than 750 times: 24
# of tags used more than 800 times: 22
# of tags used more than 850 times: 20
# of tags used more than 900 times: 18
# of tags used more than 950 times: 17
# of tags used more than 1000 times: 16


In [7]:
# I'll arbitrarily decide that a tag must be used at least 150 times to be considered.
# If we remove all other tags, how much usable data would we have left? Let's check.

# First, get a list of the usable tags.
usable_tags = sorted_tag_frequency[sorted_tag_frequency > 150]
usable_tags = list(usable_tags.keys())

# Then, record how many usable tags each remaining game has.
games_df['usable_tag_count'] = 0

for index, row in games_df.iterrows() :
    for tag in row['tags'] :
        if tag in usable_tags :
            games_df.loc[index, 'usable_tag_count'] += 1

# Display how many records have how many usable tags.
games_df['usable_tag_count'].value_counts()

6    4831
7    4712
5    2404
4     812
3     203
2      38
1      12
0       1
Name: usable_tag_count, dtype: int64

In [8]:
# There's a huge dropoff after 5 usable tags.
# Let's drop every game that doesn't have at least 5 usable tags.

rows_with_at_least_one_usable_tag = games_df['usable_tag_count'] > 4

games_df = games_df[rows_with_at_least_one_usable_tag]

games_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11947 entries, 1 to 14028
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   app_id                      11947 non-null  int64         
 1   release_date                11941 non-null  datetime64[ns]
 2   price                       11927 non-null  float64       
 3   price_category              11927 non-null  category      
 4   positive_review_percent     11944 non-null  float64       
 5   relevant_langs_reviews_sum  11947 non-null  int32         
 6   tags                        11947 non-null  object        
 7   mod_interface_languages     11947 non-null  object        
 8   mod_full_audio_languages    11947 non-null  object        
 9   mod_subtitles_languages     11947 non-null  object        
 10  language_comment_counts     11947 non-null  object        
 11  comment_ratios              11947 non-null  object    

In [9]:
# Now let's see if we have any "unusable" tags remaining in our dataset:

used_tag_set = set()

for tag_list in games_df['tags'] :
    for tag in tag_list :
        used_tag_set.add(tag)

mismatched_tags = [tag for tag in used_tag_set if tag not in usable_tags]

len(mismatched_tags)

281

In [10]:
# I guess we better remove them!

for index, row in games_df.iterrows() :
    row_tags = row['tags']
    for tag in row_tags :
        if tag in mismatched_tags :
            row_tags.remove(tag)
            games_df.at[index, 'tags'] = row_tags

# Now let's see our tag counts per game...

games_df = refresh_tag_count(games_df)

print(len(games_df))

6    5608
7    4712
5    1627
Name: tag_count, dtype: int64
11947


All of our rows and tags now match our arbitrary criteria of:
- Every row has at least 5 tags
- Every tag is used at least 100 times overall

And we still have 8,943 usable records! Not bad.

In [11]:
# Our index has gotten messed up by deleting rows. Let's fix that.

games_df = games_df.reset_index()

In [12]:
games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11947 entries, 0 to 11946
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   index                       11947 non-null  int64         
 1   app_id                      11947 non-null  int64         
 2   release_date                11941 non-null  datetime64[ns]
 3   price                       11927 non-null  float64       
 4   price_category              11927 non-null  category      
 5   positive_review_percent     11944 non-null  float64       
 6   relevant_langs_reviews_sum  11947 non-null  int32         
 7   tags                        11947 non-null  object        
 8   mod_interface_languages     11947 non-null  object        
 9   mod_full_audio_languages    11947 non-null  object        
 10  mod_subtitles_languages     11947 non-null  object        
 11  language_comment_counts     11947 non-null  object    

Step 4: Remove Games with Too Few Comments
---
I'll determine an arbitrary threshold for average comments in all languages with comments, and drop games beneath that

In [13]:
comment_counts = games_df['relevant_langs_reviews_sum']
ordered_comment_counts = comment_counts.sort_values(ascending=False)
ordered_comment_counts

cutoffs = range(0, 10000, 500)

for cutoff in cutoffs :
    print(f"Games with more than {cutoff} comments: {len(ordered_comment_counts[ordered_comment_counts > cutoff])}")

Games with more than 0 comments: 11947
Games with more than 500 comments: 6468
Games with more than 1000 comments: 4953
Games with more than 1500 comments: 4163
Games with more than 2000 comments: 3673
Games with more than 2500 comments: 3297
Games with more than 3000 comments: 3023
Games with more than 3500 comments: 2782
Games with more than 4000 comments: 2569
Games with more than 4500 comments: 2416
Games with more than 5000 comments: 2267
Games with more than 5500 comments: 2158
Games with more than 6000 comments: 2052
Games with more than 6500 comments: 1959
Games with more than 7000 comments: 1858
Games with more than 7500 comments: 1794
Games with more than 8000 comments: 1738
Games with more than 8500 comments: 1669
Games with more than 9000 comments: 1616
Games with more than 9500 comments: 1565


In [14]:
# I'll arbitrarily select 500 as the minimum number of comment that qualify a game for analysis.

games_df = games_df[games_df['relevant_langs_reviews_sum'] > 500]

games_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6468 entries, 0 to 11944
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   index                       6468 non-null   int64         
 1   app_id                      6468 non-null   int64         
 2   release_date                6462 non-null   datetime64[ns]
 3   price                       6449 non-null   float64       
 4   price_category              6449 non-null   category      
 5   positive_review_percent     6468 non-null   float64       
 6   relevant_langs_reviews_sum  6468 non-null   int32         
 7   tags                        6468 non-null   object        
 8   mod_interface_languages     6468 non-null   object        
 9   mod_full_audio_languages    6468 non-null   object        
 10  mod_subtitles_languages     6468 non-null   object        
 11  language_comment_counts     6468 non-null   object     

Step 5: Remove rows unqualified for other reasons
---
- Games released too recently will have unbalanced comment counts.
- We can remove games with no price information, for consistency.

In [15]:
# Arbitrarily set a cutoff date of 1 year ago
today = datetime.today()
cutoff_date = today - timedelta(weeks=(52))

# Remove 
games_df = games_df[games_df['release_date'] < cutoff_date]
games_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5742 entries, 2 to 11944
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   index                       5742 non-null   int64         
 1   app_id                      5742 non-null   int64         
 2   release_date                5742 non-null   datetime64[ns]
 3   price                       5727 non-null   float64       
 4   price_category              5727 non-null   category      
 5   positive_review_percent     5742 non-null   float64       
 6   relevant_langs_reviews_sum  5742 non-null   int32         
 7   tags                        5742 non-null   object        
 8   mod_interface_languages     5742 non-null   object        
 9   mod_full_audio_languages    5742 non-null   object        
 10  mod_subtitles_languages     5742 non-null   object        
 11  language_comment_counts     5742 non-null   object     

In [16]:
# Remove any games with no price.

games_df = games_df[~games_df['price'].isnull()]
games_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5727 entries, 2 to 11944
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   index                       5727 non-null   int64         
 1   app_id                      5727 non-null   int64         
 2   release_date                5727 non-null   datetime64[ns]
 3   price                       5727 non-null   float64       
 4   price_category              5727 non-null   category      
 5   positive_review_percent     5727 non-null   float64       
 6   relevant_langs_reviews_sum  5727 non-null   int32         
 7   tags                        5727 non-null   object        
 8   mod_interface_languages     5727 non-null   object        
 9   mod_full_audio_languages    5727 non-null   object        
 10  mod_subtitles_languages     5727 non-null   object        
 11  language_comment_counts     5727 non-null   object     

In [17]:
# Clean our index.

games_df = games_df.reset_index()

Prepare for Modeling
---

In [18]:
# Let's do a silly one-hot
for tag in usable_tags :
    games_df[tag] = 0
    for index, row in games_df.iterrows() :
        if tag in row['tags'] :
            games_df.loc[index, tag] = 1


In [19]:
# Normalize our other main predicter variable...
mean = games_df['price'].mean()
std = games_df['price'].std()

games_df['price'] = (games_df['price'] - mean) / std

In [20]:
# Output a subset of the df that will be used for modeling

columns_to_drop = ["price_category", "positive_review_percent", "relevant_langs_reviews_sum", \
                    "tags", "language_comment_counts", "date_scraped", "tag_count", "usable_tag_count", \
                    "comment_ratios", "mod_interface_languages", "mod_full_audio_languages", "mod_subtitles_languages"]

games_df = games_df.drop(columns=columns_to_drop)

In [21]:
# Because selecting our specific target will also be something we play with in modeling, we'll do that in the modeling notebook!
# But we can still split things into X-inverses and y-iverses

target_cols = ['comment_diff_agnostic', 'comment_diff_any', 'comment_diff_interface', 'comment_diff_audio', 'comment_diff_subtitles', 'release_date']

X = games_df.drop(columns=target_cols)
y = games_df[target_cols]

Save & Quit
---

In [22]:
with open('../data/processed/2 - Games DF - PreProcessed Features', 'wb') as file :
    pickle.dump(X, file)

with open('../data/processed/2 - Games DF - PreProcessed Targets', 'wb') as file :
    pickle.dump(y, file)

-----------------------