In [110]:
import pandas as pd
import numpy as np
import pickle

from datetime import datetime, timedelta

%store -r tags_dict
%store -r all_languages

In [111]:
with open('../data/interim/1 - Games DF - Wrangled', 'rb') as file :
    games_df = pickle.load(file)

games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48320 entries, 0 to 48319
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   app_id                       48320 non-null  int64   
 1   release_date                 48296 non-null  object  
 2   price                        48169 non-null  float64 
 3   price_category               48169 non-null  category
 4   positive_review_percent      35262 non-null  float64 
 5   relevant_langs_comments_sum  48320 non-null  int32   
 6   tags                         48320 non-null  object  
 7   tag_list                     48320 non-null  object  
 8   mod_interface_languages      48320 non-null  object  
 9   mod_full_audio_languages     48320 non-null  object  
 10  mod_subtitles_languages      48320 non-null  object  
 11  language_comment_counts      48320 non-null  object  
 12  comment_ratios               48320 non-null  object  
 13  c

Step 1: Pare Down the Columns
---

In [112]:
# NOTE: This step moved to Wrangling to control file size for GitHub.

# Now we'll make a list of columns we really don't need and take them out.

# columns_to_drop = ['title', 'game_page_link', 'developer', 'publisher', \
#                    'description', "number_of_reviews", "interface_languages", "full_audio_languages", \
#                     "subtitles_languages"]

# # We don't need the language count columns any more, either.
# columns_to_drop += all_languages

# for col in columns_to_drop :
#     if col in games_df.columns :
#         games_df.drop(col, axis=1, inplace=True)

# games_df.columns

Step 2: Remove Unusable Rows
---

Let's start by removing records that are unsuitable for our purposes sue to reasons unrelated to feature engineering.

This includes:

- Games that are too recently published to have had time to be localized and/or representative comment counts
- Games with no price info


In [113]:
# Arbitrarily set a cutoff date of 1 year ago
today = datetime.today()
cutoff_date = today - timedelta(weeks=(52))

# Remove 
games_df = games_df[games_df['release_date'] < cutoff_date]
print("Removed games released less than 1 year ago.")
print(f"Games remaining: {len(games_df)}")

Removed games released less than 1 year ago.
Games remaining: 43177


In [114]:
# Remove games with no price info
games_df = games_df[~games_df['price'].isnull()]

print("Removed games with no price info.")
print(f"Games remaining: {len(games_df)}")

Removed games with no price info.
Games remaining: 43044


In [115]:
# Remove games with too few comments

comment_counts = games_df['relevant_langs_comments_sum']
ordered_comment_counts = comment_counts.sort_values(ascending=False)
ordered_comment_counts

cutoffs = range(0, 10000, 500)

for cutoff in cutoffs :
    print(f"Games with more than {cutoff} comments: {len(ordered_comment_counts[ordered_comment_counts > cutoff])}")

Games with more than 0 comments: 43044
Games with more than 500 comments: 8688
Games with more than 1000 comments: 6118
Games with more than 1500 comments: 4942
Games with more than 2000 comments: 4246
Games with more than 2500 comments: 3775
Games with more than 3000 comments: 3421
Games with more than 3500 comments: 3147
Games with more than 4000 comments: 2896
Games with more than 4500 comments: 2708
Games with more than 5000 comments: 2536
Games with more than 5500 comments: 2399
Games with more than 6000 comments: 2275
Games with more than 6500 comments: 2172
Games with more than 7000 comments: 2058
Games with more than 7500 comments: 1983
Games with more than 8000 comments: 1919
Games with more than 8500 comments: 1843
Games with more than 9000 comments: 1777
Games with more than 9500 comments: 1717


In [116]:
games_df = games_df[games_df['relevant_langs_comments_sum'] > 800]

games_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6858 entries, 0 to 43397
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   app_id                       6858 non-null   int64   
 1   release_date                 6858 non-null   object  
 2   price                        6858 non-null   float64 
 3   price_category               6858 non-null   category
 4   positive_review_percent      6855 non-null   float64 
 5   relevant_langs_comments_sum  6858 non-null   int32   
 6   tags                         6858 non-null   object  
 7   tag_list                     6858 non-null   object  
 8   mod_interface_languages      6858 non-null   object  
 9   mod_full_audio_languages     6858 non-null   object  
 10  mod_subtitles_languages      6858 non-null   object  
 11  language_comment_counts      6858 non-null   object  
 12  comment_ratios               6858 non-null   object  
 13  co

----------------------

Now let's look at what we have left, and preserve only records that are useful for ML.

Since tags are our main feature, let's remove any games without enough tags. This process must be somewhat arbitrary.

In [117]:
# Identify games with too few tags to be useful.
def refresh_tag_count(df) :
    """
    Looks for a "tags" row in {df} containing lists
    Creates a "tag_count" row in {df} returning the len of that row's list
    Returns {df}
    """

    df['tag_count'] = 0

    for index, row in df.iterrows() :
        df.loc[index, 'tag_count'] = len(row['tag_list'])

    print(df['tag_count'].value_counts())

    return df

games_df = refresh_tag_count(games_df)

20    4593
16     211
15     184
14     182
19     180
13     173
17     168
11     167
12     166
18     154
10     151
8      149
9      129
7      103
6       56
5       52
4       28
3        6
0        3
1        2
2        1
Name: tag_count, dtype: int64


In [118]:
# I'll arbitrarily decide that games need to have the maximum of 20 tags to be considered, and drop the rest of the rows.

indices_to_drop = games_df['tag_count'] < 20

games_df = games_df[~indices_to_drop]

print("Removed games with fewer than 20 tags.")
print(f"Games remaining: {len(games_df)}")

Removed games with fewer than 20 tags.
Games remaining: 4593


By the same token, tags which are used too infrequently would also lack predictive power.

In [119]:
# First, see how often the tags are used in general.

def get_sorted_tag_frequency(df) :
    """
    Counts the frequency of items in lists of a 'tags' row of {df}.
    Returnes a sorted high-low Series with the items as keys and frequency counts as values.
    Prints the number of tags by frequency at thresholds of 50.
    """
    exploded_tags = df.explode('tag_list')
    tag_counts = exploded_tags['tag_list'].value_counts()
    sorted_tag_frequency = tag_counts.sort_values(ascending=False)

    cutoffs = [x for x in range(800, 2001, 50)]

    for cutoff in cutoffs :
        print(f"# of tags used more than {cutoff} times: {len(sorted_tag_frequency[sorted_tag_frequency > cutoff])}")

    return sorted_tag_frequency

sorted_tag_frequency = get_sorted_tag_frequency(games_df)

# of tags used more than 800 times: 25
# of tags used more than 850 times: 24
# of tags used more than 900 times: 19
# of tags used more than 950 times: 18
# of tags used more than 1000 times: 18
# of tags used more than 1050 times: 17
# of tags used more than 1100 times: 15
# of tags used more than 1150 times: 15
# of tags used more than 1200 times: 12
# of tags used more than 1250 times: 12
# of tags used more than 1300 times: 12
# of tags used more than 1350 times: 10
# of tags used more than 1400 times: 9
# of tags used more than 1450 times: 7
# of tags used more than 1500 times: 7
# of tags used more than 1550 times: 6
# of tags used more than 1600 times: 6
# of tags used more than 1650 times: 6
# of tags used more than 1700 times: 6
# of tags used more than 1750 times: 6
# of tags used more than 1800 times: 5
# of tags used more than 1850 times: 4
# of tags used more than 1900 times: 4
# of tags used more than 1950 times: 4
# of tags used more than 2000 times: 4


In [120]:
# I'll arbitrarily peg the number of times a tag must be used in order
# to be considered in our analysis to 10% of the number of records.
tag_usage_cutoff = int(len(games_df) * 0.1)
print(f'Tags must occur {tag_usage_cutoff} times to be considered.')

# Now let's get a list of the usable tags.
usable_tags = sorted_tag_frequency[sorted_tag_frequency > tag_usage_cutoff].keys()
print(f'There are {len(usable_tags)} such tags.')

Tags must occur 459 times to be considered.
There are 55 such tags.


In [121]:
# Let's make sure these tags are all useful, and that there are no duplicates!

usable_tags = sorted(usable_tags)

for tag in usable_tags :
    print(tag)

2D
3D
Action
Action-Adventure
Adventure
Anime
Atmospheric
Building
Casual
Character Customization
Choices Matter
Co-op
Colorful
Comedy
Controller
Cute
Dark
Difficult
Early Access
Exploration
FPS
Family Friendly
Fantasy
Female Protagonist
First-Person
Free to Play
Funny
Gore
Great Soundtrack
Horror
Indie
Management
Multiplayer
Online Co-Op
Open World
Pixel Graphics
Platformer
Psychological Horror
Puzzle
PvP
RPG
Realistic
Relaxing
Retro
Sandbox
Sci-fi
Shooter
Simulation
Singleplayer
Story Rich
Strategy
Survival
Tactical
Third Person
Violent


Looks like there are some potential conflicts that we should look into:
- Cartoon + Cartoony
- Comedy + Funny
- First-Person + FPS
- Retro + Old School

The only one I think we can reject out-of-hand is "Early Access", since that has nothing to do with the content of the game.

In [122]:
potential_problems = ['Cartoon', 'Cartoony', 'Comedy', 'Funny', 'First-Person', 'FPS', 'Retro', 'Old School']

usable_tags.remove("Early Access")

In [123]:
# Now let's see how many records have how many of those usable tags.
games_df['usable_tag_count'] = 0

for index, row in games_df.iterrows() :
    for tag in row['tag_list'] :
        if tag in usable_tags :
            games_df.loc[index, 'usable_tag_count'] += 1

# Display how many records have how many usable tags.
usable_tag_counts = games_df['usable_tag_count'].value_counts().sort_index()

usable_tag_counts

3       4
4       5
5      29
6      91
7     165
8     288
9     405
10    518
11    657
12    675
13    689
14    495
15    321
16    168
17     64
18     17
19      2
Name: usable_tag_count, dtype: int64

In [124]:
# Cumulative tag counts would be easier to process visually...

cumulative_counts = {}

for count in usable_tag_counts.keys() :
    cumulative_counts[count] = 0
    for index, value in usable_tag_counts.iteritems() :
        if index >= count :
            cumulative_counts[count] += value

for index, value in cumulative_counts.items() :
    print(f"{value} games have at least {index} usable tag(s).")

4593 games have at least 3 usable tag(s).
4589 games have at least 4 usable tag(s).
4584 games have at least 5 usable tag(s).
4555 games have at least 6 usable tag(s).
4464 games have at least 7 usable tag(s).
4299 games have at least 8 usable tag(s).
4011 games have at least 9 usable tag(s).
3606 games have at least 10 usable tag(s).
3088 games have at least 11 usable tag(s).
2431 games have at least 12 usable tag(s).
1756 games have at least 13 usable tag(s).
1067 games have at least 14 usable tag(s).
572 games have at least 15 usable tag(s).
251 games have at least 16 usable tag(s).
83 games have at least 17 usable tag(s).
19 games have at least 18 usable tag(s).
2 games have at least 19 usable tag(s).


  for index, value in usable_tag_counts.iteritems() :


In [125]:
# Let's arbitrarily set the threshold at 15.
# We can always change it later.

rows_with_enough_usable_tags = games_df['usable_tag_count'] > 14

games_df = games_df[rows_with_enough_usable_tags]

games_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 572 entries, 1 to 43397
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   app_id                       572 non-null    int64   
 1   release_date                 572 non-null    object  
 2   price                        572 non-null    float64 
 3   price_category               572 non-null    category
 4   positive_review_percent      572 non-null    float64 
 5   relevant_langs_comments_sum  572 non-null    int32   
 6   tags                         572 non-null    object  
 7   tag_list                     572 non-null    object  
 8   mod_interface_languages      572 non-null    object  
 9   mod_full_audio_languages     572 non-null    object  
 10  mod_subtitles_languages      572 non-null    object  
 11  language_comment_counts      572 non-null    object  
 12  comment_ratios               572 non-null    object  
 13  com

In [126]:
# Now let's see if we have any "unusable" tags remaining in our dataset:

used_tag_set = set()

for tag_list in games_df['tags'] :
    for tag in tag_list :
        used_tag_set.add(tag)

mismatched_tags = [tag for tag in used_tag_set if tag not in usable_tags]

len(mismatched_tags)

175

In [127]:
# I guess we better remove them!

for index, row in games_df.iterrows() :
    row_tags = row['tag_list']
    for tag in row_tags :
        if tag in mismatched_tags :
            row_tags.remove(tag)
            games_df.at[index, 'tag_list'] = row_tags

# Now let's see our tag counts per game...

games_df = refresh_tag_count(games_df)

print(len(games_df))

16    225
17    213
18     82
15     42
19     10
Name: tag_count, dtype: int64
572


Step 4: Remove Games with Too Few Comments
---
I'll determine an arbitrary threshold for average comments in all languages with comments, and drop games beneath that

In [128]:
# comment_counts = games_df['relevant_langs_comments_sum']
# ordered_comment_counts = comment_counts.sort_values(ascending=False)
# ordered_comment_counts

# cutoffs = range(0, 10000, 500)

# for cutoff in cutoffs :
#     print(f"Games with more than {cutoff} comments: {len(ordered_comment_counts[ordered_comment_counts > cutoff])}")

In [129]:
# I'll arbitrarily select 500 as the minimum number of comment that qualify a game for analysis.

# games_df = games_df[games_df['relevant_langs_comments_sum'] > 500]

# games_df.info()

In [130]:
# Our index has gotten messed up by deleting rows. Let's fix that.

games_df = games_df.reset_index()

Prepare for Modeling
---

In [131]:
# Let's do a silly one-hot
for tag in usable_tags :
    games_df[tag] = 0
    for index, row in games_df.iterrows() :
        if tag in row['tag_list'] :
            games_df.loc[index, tag] = 1
        # dx Now, let's do something even sillier!
        # The 'tags' column holds the tags that Steam deems MOST relevant to the game.
        # Let's honor that by giving those tags a little boost on the one-hot...
        if tag in row['tags'] :
            games_df.loc[index, tag] += 1

In [132]:
# Normalize our other main predicter variable...
mean = games_df['price'].mean()
std = games_df['price'].std()

games_df['price'] = (games_df['price'] - mean) / std

In [133]:
# Output a subset of the df that will be used for modeling

columns_to_drop = ["index", "release_date", "price_category", "positive_review_percent", "relevant_langs_comments_sum", \
                    "tags", "tag_list", "language_comment_counts", "date_scraped", "tag_count", "usable_tag_count", \
                    "comment_ratios", "mod_interface_languages", "mod_full_audio_languages", "mod_subtitles_languages"]

games_df = games_df.drop(columns=columns_to_drop)

games_df = games_df.set_index("app_id")

In [134]:
games_df.head()

Unnamed: 0_level_0,price,comment_diff_agnostic,comment_diff_any,comment_diff_interface,comment_diff_audio,comment_diff_subtitles,2D,3D,Action,Action-Adventure,...,Sci-fi,Shooter,Simulation,Singleplayer,Story Rich,Strategy,Survival,Tactical,Third Person,Violent
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
553850,1.463921,"{'german': 0.10284806700274907, 'french': -0.0...","{'german': 0.1061654567301325, 'french': -0.00...","{'german': 0.10621123967582302, 'french': -0.0...","{'german': 0.10119022485294804, 'french': -0.0...","{'german': 0.09949363950374279, 'french': -0.0...",0,0,2,0,...,1,2,0,1,0,0,0,0,1,1
1245620,2.736844,"{'german': 0.022414002872037653, 'french': -0....","{'german': 0.025731392599421088, 'french': -0....","{'german': 0.025777175545111608, 'french': -0....","{'german': nan, 'french': nan, 'spanish': nan,...","{'german': 0.01905957537303138, 'french': -0.0...",0,1,1,0,...,0,0,0,1,0,0,0,0,2,1
1203620,0.827459,"{'german': 0.07807709053510958, 'french': 0.01...","{'german': 0.08139448026249302, 'french': 0.01...","{'german': 0.08144026320818354, 'french': 0.01...","{'german': nan, 'french': nan, 'spanish': nan,...","{'german': nan, 'french': nan, 'spanish': nan,...",0,1,1,0,...,0,0,0,1,0,0,2,0,0,0
381210,0.190997,"{'german': -0.019574800402787085, 'french': -0...","{'german': -0.01625741067540365, 'french': -0....","{'german': -0.01621162772971313, 'french': -0....","{'german': nan, 'french': -0.00296560849453240...","{'german': -0.02292922790179336, 'french': -0....",0,0,1,0,...,0,0,0,0,0,1,2,0,1,1
252490,1.463921,"{'german': 0.023113350648391873, 'french': -0....","{'german': 0.026430740375775308, 'french': -0....","{'german': 0.026476523321465828, 'french': -0....","{'german': 0.021455508498590847, 'french': -0....","{'german': 0.0197589231493856, 'french': -0.00...",0,0,1,0,...,0,1,1,0,0,0,2,0,0,0


In [136]:
# Because selecting our specific target will also be something we play with in modeling, we'll do that in the modeling notebook!
# But we can still split things into X-inverses and y-iverses

target_cols = ['comment_diff_agnostic', 'comment_diff_any', 'comment_diff_interface', 'comment_diff_audio', 'comment_diff_subtitles']

X = games_df.drop(columns=target_cols)
y = games_df[target_cols]

Save & Quit
---

In [137]:
with open('../data/processed/2 - Games DF - PreProcessed Features', 'wb') as file :
    pickle.dump(X, file)

with open('../data/processed/2 - Games DF - PreProcessed Targets', 'wb') as file :
    pickle.dump(y, file)

-----------------------