Data Wrangling & Cleaning
---

This notebook includes the following steps:

1. Loads the data from the scraped-games JSON file (`0 - Scraped Games DF.json`)
2. Fixes data types
3. Checks the data for reasonableness and completeness

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from datetime import date

from bs4 import BeautifulSoup
from urllib.request import urlopen

from thefuzz import fuzz
from fuzzywuzzy import process

Step 1: Load up our scraped dataset
---

In [2]:
# Read the scraped games (stored as a list of JSON records) into a DataFrame,
# then summarise its columns and dtypes.
raw_games_path = '../data/raw/0 - Scraped Games DF.json'
games_df = pd.read_json(raw_games_path, orient='records')

games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10008 entries, 0 to 10007
Data columns (total 42 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   app_id                   10008 non-null  int64  
 1   title                    10008 non-null  object 
 2   release_date             10008 non-null  object 
 3   positive_review_percent  9968 non-null   float64
 4   number_of_reviews        9968 non-null   float64
 5   price                    9390 non-null   float64
 6   game_page_link           10008 non-null  object 
 7   tags                     10008 non-null  object 
 8   date_scraped             10008 non-null  object 
 9   developer                10008 non-null  object 
 10  publisher                10006 non-null  object 
 11  description              10008 non-null  object 
 12  interface_languages      10008 non-null  object 
 13  full_audio_languages     10008 non-null  object 
 14  subtitles_languages   

In [72]:
# Preview the first few rows of the loaded data.
games_df.head()

Unnamed: 0,app_id,title,release_date,positive_review_percent,number_of_reviews,price,game_page_link,tags,date_scraped,developer,...,dutch,norwegian,polish,brazilian,romanian,russian,finnish,swedish,turkish,vietnamese
0,730,Counter-Strike 2,"Aug 21, 2012",0.88,7624537.0,1499.0,https://store.steampowered.com/app/730/Counter...,"[1663, 1774, 3859, 3878, 19, 5711, 5055]",2023-10-03,Valve,...,18072,14293,428161,440207,51544,2008642,44889,55478,392048,9795
1,1086940,Baldur's Gate 3,"Aug 3, 2023",0.95,318163.0,5999.0,https://store.steampowered.com/app/1086940/Bal...,"[122, 6426, 1742, 4747, 21, 4325, 4474]",2023-10-03,Larian Studios,...,998,501,4040,9412,19,20921,493,1153,3546,31
2,1091500,Cyberpunk 2077,"Dec 9, 2020",0.8,575978.0,5999.0,https://store.steampowered.com/app/1091500/Cyb...,"[4115, 1695, 6650, 122, 4182, 3942, 4295]",2023-10-03,CD PROJEKT RED,...,1003,682,17850,18341,143,61375,867,1396,15197,159
3,1172470,Apex Legends™,"Nov 4, 2020",0.79,725075.0,,https://store.steampowered.com/app/1172470/Ape...,"[113, 3859, 176981, 1774, 1663, 3839, 1775]",2023-10-03,Respawn Entertainment,...,1729,1486,11889,14045,666,86575,2653,3697,14975,379
4,1063730,New World,"Sep 28, 2021",0.7,223724.0,3999.0,https://store.steampowered.com/app/1063730/New...,"[128, 1695, 1754, 122, 21, 19, 1775]",2023-10-03,Amazon Games,...,1151,605,7642,18133,151,14059,566,1378,6383,87


Step 2: Standardize our data types
---

In [73]:
# The numbers in the comment count columns are strings
# with extra characters. Let's standardize them as ints.

# First, let's get a list of the columns that we need to process.
# That's all columns from the comments df and one column from the
# original game page scraped df.
%store -r top_10_languages
%store -r all_languages
list_of_languages = top_10_languages.copy()
list_of_languages.append('english')

# We need all our comment counts to be ints.
# Some of them failed to scrape - we can almost certainly remove
# them without impacting our study, but I would like to check to
# make sure something didn't go wrong in our previous step that
# caused too many of them to fail.
# So, we'll two-birds this.
# As we ensure that all the values are ints, we'll remove the
# ones that are "Failed", and then display the removed indexes.
# We also have some games that weren't released at time of
# scraping, so we'll just remove those as well at this stage.

indexes_to_remove = set()

for index, row in games_df.iterrows() :
    for language in list_of_languages :
        # If the count is "Failed", mark it for deletion.
        if (row[language] == 'Failed') or (pd.isna(row[language])) :
            indexes_to_remove.add(index)
        

print(len(indexes_to_remove))

37


In [74]:
# Not too bad. Drop the flagged rows and renumber the index, then cast every
# per-language count column to int in one go.
games_df = games_df.drop(indexes_to_remove).reset_index(drop=True)

games_df = games_df.astype({language: int for language in list_of_languages})

In [75]:
# Now let's make release_date a datetime object.
# The scraped values look like "Aug 21, 2012", which does NOT match an ISO
# "%Y-%m-%d" format. Passing that format only worked on older pandas because it
# was silently ignored when infer_datetime_format=True; on pandas 2.x it is
# enforced and every date would be coerced to NaT. So we let pandas work out
# the format itself. Values that can't be parsed as dates become NaT.
games_df['release_date'] = pd.to_datetime(games_df['release_date'], errors='coerce')

In [76]:
# For games that have multiple publishers, developers, etc, they're all stored as one string.
# If we want to use this data eventually, we should split it into lists.
# Some games don't have publishers or developers, and that's fine.
# Oddly, some of them have trailing spaces.

def _split_company_names(raw_value):
    """Return the names held in one developer/publisher cell as a list.

    - A value that is already a list is returned unchanged.
    - A missing or empty value (None, NaN, '') becomes an empty list.
    - Any other value is treated as a ', '-separated string; each name has
      surrounding whitespace stripped.
    """
    if isinstance(raw_value, list):
        return raw_value
    if pd.isna(raw_value) or not raw_value:
        return []
    return [name.strip() for name in raw_value.split(', ')]


# Same treatment for both columns.
for column in ('developer', 'publisher'):
    games_df[column] = games_df[column].apply(_split_company_names)


In [77]:
# The tags are also just a string. We need them listed.

def _parse_tag_list(raw_tags):
    """Turn one cell of the tags column into a list of tag codes.

    A value that is already a list is returned unchanged. A string such as
    "[1663, 1774]" is split on commas; each token has surrounding whitespace
    removed and empty tokens are dropped, so both '' and '[]' become an empty
    list (which the later "no tags" check relies on).
    """
    if isinstance(raw_tags, list):
        return raw_tags
    tokens = (token.strip() for token in raw_tags.strip('[]').split(','))
    return [token for token in tokens if token]


games_df['tags'] = games_df['tags'].apply(_parse_tag_list)

games_df['tags']

0          [1663, 1774, 3859, 3878, 19, 5711, 5055]
1           [122, 6426, 1742, 4747, 21, 4325, 4474]
2         [4115, 1695, 6650, 122, 4182, 3942, 4295]
3       [113, 3859, 176981, 1774, 1663, 3839, 1775]
4              [128, 1695, 1754, 122, 21, 19, 1775]
                           ...                     
9966     [3843, 1667, 3839, 1685, 1721, 3978, 3834]
9967      [3859, 1774, 113, 1663, 5363, 1775, 3839]
9968     [3843, 1671, 19, 3814, 353880, 3859, 6730]
9969      [4231, 1646, 122, 5851, 3843, 1685, 4182]
9970          [493, 1662, 1695, 5160, 21, 3859, 19]
Name: tags, Length: 9971, dtype: object

In [78]:
# List the column names to see which ones still need processing.
games_df.columns

Index(['app_id', 'title', 'release_date', 'positive_review_percent',
       'number_of_reviews', 'price', 'game_page_link', 'tags', 'date_scraped',
       'developer', 'publisher', 'description', 'interface_languages',
       'full_audio_languages', 'subtitles_languages', 'english', 'schinese',
       'tchinese', 'japanese', 'koreana', 'thai', 'bulgarian', 'czech',
       'danish', 'german', 'spanish', 'latam', 'greek', 'french', 'italian',
       'indonesian', 'hungarian', 'dutch', 'norwegian', 'polish', 'brazilian',
       'romanian', 'russian', 'finnish', 'swedish', 'turkish', 'vietnamese'],
      dtype='object')

In [79]:
# Same problem for our language types columns (interface, audio, subtitles).

def _parse_language_list(raw_languages):
    """Turn one cell of a language-type column into a list of language names.

    A value that is already a list is returned unchanged. A string is stripped
    of its surrounding brackets and split on ', '; each name has surrounding
    whitespace removed and empty names are dropped, so '' and '[]' both become
    an empty list.
    """
    if isinstance(raw_languages, list):
        return raw_languages
    names = (name.strip() for name in raw_languages.strip('[]').split(', '))
    return [name for name in names if name]


for column in ('interface_languages', 'full_audio_languages', 'subtitles_languages'):
    games_df[column] = games_df[column].apply(_parse_language_list)

# Check.
print(type(games_df.loc[0, 'interface_languages']))

<class 'list'>


In [80]:
# For future reference, let's create a dictionary of tags codes & their meanings.
# We can get that from the search page.
# Keys will be the codes. Values will be the names.
url = 'https://store.steampowered.com/search'
html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')

# The relevant data is in this one code block.
code_block = soup.find('div', id="TagFilter_Container")

# Create the empty dict.
tags_dict = {}

# Iterate over all 400+ tags described in the code block.
for listing in code_block.find_all('div', class_='tab_filter_control_row') :
    tag_code = listing.get('data-value')
    tag_name = listing.get('data-loc')
    tags_dict[tag_code] = tag_name

# We'll probably need it later, so let's save it a couple ways.
# Weirdly, it's quicker and easier to do this via a DF.
tags_dict_df = pd.DataFrame.from_dict(tags_dict, orient='index')
tags_dict_df.to_csv('../data/interim/Tags Dictionary DF.csv')
%store tags_dict

Stored 'tags_dict' (dict)


Implement indicator variables
---

In [81]:
# TODO: For the language-type columns, implement pipes (???)
# NOTE: It is no longer clear what this was meant to say - revisit or remove.

Step 3: Sanity checks
---

In [82]:
# Our tags column is very important to us, so let's check that the entries
# are reasonably comparable by counting how many tags each game has.

# Build a dictionary where the keys are the number of tags in a game's list
# and the values are how many games have that many tags.
dict_of_lenghts = {}

for tag_list in games_df['tags'] :
    tag_count = len(tag_list)
    dict_of_lenghts[tag_count] = dict_of_lenghts.get(tag_count, 0) + 1

print(dict_of_lenghts)

# Looks not too bad.

{7: 9566, 4: 72, 5: 121, 6: 131, 2: 26, 3: 50, 1: 5}


In [83]:
# I discovered that summing the number of comments in the top 10 loc languages + En
# as recorded in our DF resulted in a HIGHER comment count than Steam displays (which
# is the one we grabbed from Steam as "number_of_reviews").

# I don't know why that is, but it doesn't really matter. What we want is the RELATIVE
# difference in comments per game WITHIN each language, so as long as Steam calculates
# the number of comments in each language in a consistent way WITHIN each language, our
# method still works.

# Thus the "number_of_reviews" column may be useless to us, BUT I'm loath to let go
# of any data. Let's keep it for now, and just add a new column that we'll use for our
# calculations: "relevant_langs_reviews_sum"

# Sum across the language columns for every row at once (axis=1) instead of
# writing the column one row at a time.
games_df['relevant_langs_reviews_sum'] = games_df[list_of_languages].sum(axis=1).astype(int)

In [84]:
# NOTE: This NLP step is pretty complex and not entirely relevant for the 
# ML task at hand, so we'll leave it out for now.





# # Check for string similarity in all verbal cols.
# # In developer and publisher, this could likely find typos.

# # First, let's create sets of all the extant developer and publisher names
# # as currently typed.

# set_of_developer_names = set()

# for developers_list in games_df['developer'] :
#     for developer in developers_list :
#         set_of_developer_names.add(developer)

# set_of_publisher_names = set()

# for publishers_list in games_df['publisher'] :
#     for publisher in publishers_list :
#         set_of_publisher_names.add(publisher)

# # Now, let's see if any items in that list are super similar to each other.
# # This could possibly be the result of a typo.
# # Let's first find the MOST similar pair in each set.
# # If these pairs are clearly not typos, then it's very likely that there are
# # no typos at all.

# list_of_developer_similarities = []
# listing = []

# for developer in set_of_developer_names :
#     # Create a set of names to test this name against.
#     # We want to avoid testing the name against itself,
#     # since that will return a maximum similarity score
#     # and make our lives harder.
#     # So, we'll create a new set, then drop this name
#     # from the set before doing the comparisons.
#     testing_set = set_of_developer_names.copy()
#     testing_set.remove(developer)
#     highest = process.extract(developer, testing_set, limit=1)
#     # We need to retain both of the tested strings, while .extract only
#     # returns the second one. We'll have to group them manually.
#     # Let's also arbitrarily set 90 as the cutoff for similarity.
#     if highest[0][1] >= 90 :
#         listing = [developer, highest[0][0], highest[0][1]]
#         list_of_developer_similarities.append(listing)


# # Then we do it all again for publishers.
# list_of_publisher_similarities = []

# for publisher in set_of_publisher_names :
#     testing_set = set_of_publisher_names.copy()
#     testing_set.remove(publisher)
#     highest = process.extract(publisher, testing_set, limit=1)
#     if highest[0][1] >= 90 :
#         listing = [publisher, highest[0][0], highest[0][1]]
#         list_of_publisher_similarities.append(listing)


# # Now we sort the lists by the similarity score.
# list_of_developer_similarities.sort(key=lambda x: x[2], reverse=True)
# list_of_publisher_similarities.sort(key=lambda x: x[2], reverse=True)


# # Let's take a look at some of them... and also get a feel for
# # how long these lists are (and therefore how many of these names
# # are truly similar).
# print('Similar developer names:')
# print(list_of_developer_similarities[0:10])
# print('Total similarity scores over 90: '+str(len(list_of_developer_similarities)))
# print('')
# print('Similar publisher names:')
# print(list_of_publisher_similarities[0:10])
# print('Total similarity scores over 90: '+str(len(list_of_publisher_similarities)))

# # Well, what I've learned from this is that the names of developers and publishers
# # would be a LOT of work to fix. Since we don't need them for now (they aren't part
# # of our key analysis), we can just leave them as-is and clean them later if we need
# # them.

Step 4: Determine completeness
---

In [85]:
# While I want to keep all the data around just in case, this is a great
# point for us to determine whether some rows may not be useful for us.

# Our main label is the comment counts per language, so any row whose
# language-specific comment counts add up to zero can safely be dropped.
language_totals = games_df[list_of_languages].sum(axis=1)
indexes_to_drop = language_totals.index[language_totals == 0].tolist()

print(len(indexes_to_drop))


9


In [86]:
# Not bad. Not worth fussing over. Throw those rows out directly and
# renumber the index.
games_df = games_df.drop(indexes_to_drop).reset_index(drop=True)

In [87]:
# Our main feature is the tags, so we should make sure that all rows have them.
# Collect the index of every row whose tags entry is empty.
indexes_to_drop = [
    row_index
    for row_index, tag_list in games_df['tags'].items()
    if len(tag_list) == 0
]

print(len(indexes_to_drop))
# Excellent! Looks like we're in the clear there.

0


Step 5: A bit more standardization
---

In [88]:
# I've noticed that the way languages are written in the language types columns is different
# from the way they're written in the languages columns. We'll need them to match up if we
# want to do a comparative analysis.

# The problems are threefold.
# First, the capitalization is different.
# Second, the language types columns split up some languages (for example, Spanish is split
#   into "Spanish - Spain" and "Spanish - Latin America").
# Third, some are just spelled differently, like "schinese" and "Simplified Chinese".

# Since I don't want to lose data, I'll make new "mod_" columns to hold the 'reduced'
# versions (for example, where 'Spanish - Spain' is 'reduced' to 'spanish').

# Names that can't be matched by substring and need an explicit translation.
_SPECIAL_LANGUAGE_NAMES = {
    'simplified chinese': 'schinese',
    'portuguese - brazil': 'brazilian',
    'korean': 'koreana',
}


def _standardize_language(raw_name):
    """Map one language name from a language-type column to its count-column name."""
    # First, just lowcap everything. Turn down the volume a bit.
    lang = raw_name.lower()

    # If the name contains one of our languages it's a variant; reduce it.
    for language in list_of_languages :
        if language in lang :
            lang = language

    # Then handle the three special cases; anything else passes through.
    return _SPECIAL_LANGUAGE_NAMES.get(lang, lang)


def _standardize_language_list(raw_names):
    """Standardize every name in one cell of a language-type column."""
    return [_standardize_language(item) for item in raw_names]


# Build each "mod_" column in one step, with a fresh list per row.
for source_column in ('interface_languages', 'full_audio_languages', 'subtitles_languages') :
    games_df['mod_' + source_column] = games_df[source_column].apply(_standardize_language_list)

Step 6: Wrangle a couple more key columns
---

In [89]:
# Let's look at how our prices are distributed: show every price point shared
# by more than 50 games, then how many games those price points cover in total.
price_counts = games_df['price'].value_counts()
common_price_counts = price_counts[price_counts > 50]
print(common_price_counts)
print(common_price_counts.sum())

1999.0    1736
999.0     1265
1499.0    1202
2999.0     684
2499.0     566
499.0      542
3999.0     386
699.0      264
799.0      252
1299.0     217
599.0      211
299.0      204
5999.0     195
1199.0     174
4999.0     142
399.0      139
199.0      126
1799.0     124
899.0      111
1699.0      99
3499.0      94
99.0        83
1599.0      73
1399.0      55
Name: price, dtype: int64
8944


In [90]:
# Most of our games fall into just a few price points! That's significant to
# know. Let's bin the prices around those points, plus bins for everything
# below and above them, for a total of 15 bins.

# I'm not 100% confident in the efficacy of this method, so the original
# price column is preserved in the DF. Later, when modeling, we can see if
# there's a reason to use one or the other.

# Each pair is (upper edge of the bin, label). Bins are right-inclusive and
# the first one starts at -5.
price_bins = [
    (0, 'none'),
    (700, 'under 10'),
    (1200, '999'),
    (1700, '1499'),
    (2200, '1999'),
    (2700, '2499'),
    (3200, '2999'),
    (3700, '3499'),
    (4200, '3999'),
    (4700, '4499'),
    (5200, '4999'),
    (5700, '5499'),
    (6500, '5999'),
    (7500, '6999'),
    (np.inf, 'over 80'),
]
ranges = [-5] + [upper_edge for upper_edge, _ in price_bins]
range_names = [label for _, label in price_bins]
games_df['price_category'] = pd.cut(games_df['price'], bins=ranges, labels=range_names)

In [91]:
# We have all the info we need now, but it's not quite in a usable form.
# The total number of comments per language is meaningless on its own - we
# need the average proportion of comments in that language, so we can tell
# whether an individual game (and therefore a specific combination of tags)
# generates more engagement in that market than others do.

# Unfortunately, whether or not the game has been localized into a language
# at all probably impacts that proportion much more than the game itself.
# So we can control for that, making multiple constants...

# The denominator: the total amount of comments in all langs for all games.
total_comments = games_df['relevant_langs_reviews_sum'].sum()

# The share of ALL comments that each language occupies.
language_averages_agnostic = {
    language: games_df[language].sum() / total_comments
    for language in list_of_languages
}

print(language_averages_agnostic)


{'german': 0.036144778355734, 'french': 0.024984206743360108, 'spanish': 0.047231059143115164, 'brazilian': 0.04553095708577151, 'russian': 0.13040473822233248, 'italian': 0.007174079027273987, 'schinese': 0.16373661242532556, 'japanese': 0.005471265311775909, 'koreana': 0.022479065030598478, 'polish': 0.027782788606404412, 'english': 0.4890604500483084}


In [92]:
# Above is the calculation if we ignore whether or not the game is even in that
# language. Let's see if it's much different when we take into account text, voice,
# and subtitle translation, or any of the three.

# Let's subset for these 4 situations, then run the same calculations again.

def _localized_comment_share(language, localization_columns):
    """Share of comments written in `language` among games localized into it.

    A game counts as localized if `language` appears in at least one of the
    given `localization_columns` (columns holding lists of language names).
    Returns the language's comments divided by all relevant-language comments
    over those games, or None when those games have no comments at all.
    """
    is_localized = pd.Series(False, index=games_df.index)
    for column in localization_columns :
        is_localized |= games_df[column].apply(lambda languages: language in languages)

    language_comments = games_df.loc[is_localized, language].sum()
    localized_total_comments = games_df.loc[is_localized, 'relevant_langs_reviews_sum'].sum()
    if localized_total_comments == 0 :
        return None
    return float(language_comments / localized_total_comments)


language_averages_any = {}
language_averages_interface = {}
language_averages_audio = {}
language_averages_subtitles = {}

# Each result dict paired with the localization columns that define it.
localization_metrics = [
    (language_averages_any, ['mod_interface_languages', 'mod_full_audio_languages', 'mod_subtitles_languages']),
    (language_averages_interface, ['mod_interface_languages']),
    (language_averages_audio, ['mod_full_audio_languages']),
    (language_averages_subtitles, ['mod_subtitles_languages']),
]

# Do it for all languages, all situations. A language is left out of a dict
# when no comments exist for games localized that way.
for language in list_of_languages :
    for averages, localization_columns in localization_metrics :
        share = _localized_comment_share(language, localization_columns)
        if share is not None :
            averages[language] = share



In [93]:
# Print the five sets of averages one after another for comparison:
# agnostic, any localization, interface, full audio, subtitles.
for averages in (
    language_averages_agnostic,
    language_averages_any,
    language_averages_interface,
    language_averages_audio,
    language_averages_subtitles,
) :
    print(averages)

{'german': 0.036144778355734, 'french': 0.024984206743360108, 'spanish': 0.047231059143115164, 'brazilian': 0.04553095708577151, 'russian': 0.13040473822233248, 'italian': 0.007174079027273987, 'schinese': 0.16373661242532556, 'japanese': 0.005471265311775909, 'koreana': 0.022479065030598478, 'polish': 0.027782788606404412, 'english': 0.4890604500483084}
{'german': 0.03820873372873624, 'french': 0.026287156605066822, 'spanish': 0.048909878901039636, 'brazilian': 0.04903540680844366, 'russian': 0.14465392518202805, 'italian': 0.007641480840734052, 'schinese': 0.19441194304805706, 'japanese': 0.0056357526935938655, 'koreana': 0.02632102031860673, 'polish': 0.03217858504546673, 'english': 0.4890604500483084}
{'german': 0.03821671790426049, 'french': 0.026292358709004256, 'spanish': 0.048907999769560465, 'brazilian': 0.049015116804730835, 'russian': 0.1446776568327955, 'italian': 0.0076417149023017515, 'schinese': 0.19452605992606867, 'japanese': 0.0056408407643961265, 'koreana': 0.0263267

In [94]:
# I'm not sure if this is "wrangling" or "feature engineering," but I might as well
# end this section with a full dataset...

# Let's create columns with the DIFFERENCES between the share of comments in a language
# on a certain game and the average share of comments in that language on all games.
# A positive number here will indicate above-average interest within this language
# group for this game, and a negative number will indicate below-average interest.

# We'll make 5 such columns, one for each of the "language_averages" metrics. Later,
# during EDA, we can see which (if any) of them yield a clearer result.
# Each column stores one dict per game, keyed by language.

def _comment_ratios(row):
    """Return {language: share of this game's relevant comments in that language}."""
    total_reviews = row['relevant_langs_reviews_sum']
    return {language: row[language] / total_reviews for language in list_of_languages}


def _comment_diffs(row, ratios, averages, localization_columns=None):
    """Return {language: game's ratio minus the average for that language}.

    When `localization_columns` is given, a language only gets a score if it
    appears in at least one of those list columns for this game; otherwise its
    value is NaN. With None, every language gets a score.
    """
    diffs = {}
    for language in list_of_languages :
        is_localized = localization_columns is None or any(
            language in row[column] for column in localization_columns
        )
        diffs[language] = ratios[language] - averages[language] if is_localized else np.nan
    return diffs


# Output column -> (averages to compare against, columns that define "localized").
diff_metrics = {
    'comment_diff_agnostic': (language_averages_agnostic, None),
    'comment_diff_any': (language_averages_any,
                         ['mod_interface_languages', 'mod_full_audio_languages', 'mod_subtitles_languages']),
    'comment_diff_interface': (language_averages_interface, ['mod_interface_languages']),
    'comment_diff_audio': (language_averages_audio, ['mod_full_audio_languages']),
    'comment_diff_subtitles': (language_averages_subtitles, ['mod_subtitles_languages']),
}

# Build a brand-new dict for every game and metric. (Reusing one "holding"
# dict across rows let a later row overwrite a dict already stored in an
# earlier row, because the DataFrame cell only holds a reference to it.)
ratio_dicts = []
diff_dicts = {column: [] for column in diff_metrics}

for index, row in games_df.iterrows() :
    ratios = _comment_ratios(row)
    ratio_dicts.append(ratios)
    for column, (averages, localization_columns) in diff_metrics.items() :
        diff_dicts[column].append(_comment_diffs(row, ratios, averages, localization_columns))

games_df['comment_ratios'] = ratio_dicts
for column, dicts in diff_dicts.items() :
    games_df[column] = dicts

In [95]:
# Now that all our ratios are in dictionary form, it feels kind of silly for the
# comment counts to have their own columns. Let's dictify that one as well.

# to_dict(orient='records') builds a separate {language: count} dict for each
# row. (Filling a single shared dict inside a loop would store the SAME dict
# object in every row, leaving every game with the last game's counts.)
games_df['language_comment_counts'] = games_df[list_of_languages].to_dict(orient='records')


Step 6: Pare down the columns
---

In [96]:
# Not all of our columns are useful for analysis, so we'll remove some of them below to make
# things easier to look at. Before doing that, save a version of the full dataframe as well,
# just in case we later decide that we needed some of that info. Then we can just add it back
# in using app_id as the key.

games_df.to_csv('../data/interim/0.5 - Games DF - Wrangled, Extra Columns.csv')

In [97]:
# Take a look at the columns, non-null counts and dtypes to see what we need...
games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9962 entries, 0 to 9961
Data columns (total 54 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   app_id                      9962 non-null   int64         
 1   title                       9962 non-null   object        
 2   release_date                9956 non-null   datetime64[ns]
 3   positive_review_percent     9958 non-null   float64       
 4   number_of_reviews           9958 non-null   float64       
 5   price                       9345 non-null   float64       
 6   game_page_link              9962 non-null   object        
 7   tags                        9962 non-null   object        
 8   date_scraped                9962 non-null   object        
 9   developer                   9962 non-null   object        
 10  publisher                   9962 non-null   object        
 11  description                 9962 non-null   object      

In [105]:
# Spot-check one game: its title, its tags translated to names via tags_dict,
# and its agnostic comment differentials per language.
index = 0
sample_game = games_df.loc[index]

print("Game: " + sample_game["title"])
print("Tags:")
for tag_code in sample_game["tags"] :
    print("  -" + tags_dict[str(tag_code)])

print("Comment differentials: ")
# `i` accumulates the sum of the differentials as they are printed.
i = 0
for language_and_diff in sample_game['comment_diff_agnostic'].items() :
    print(language_and_diff)
    i += language_and_diff[1]

Game: Counter-Strike 2
Tags:
  -FPS
  -Shooter
  -Multiplayer
  -Competitive
  -Action
  -Team-Based
  -eSports
Comment differentials: 
('german', -0.0046936257628908545)
('french', -0.005913951883383603)
('spanish', -0.004395466327152514)
('brazilian', 0.02094086758361116)
('russian', 0.17290279402399292)
('italian', -0.004153902234964181)
('schinese', -0.016709374812716338)
('japanese', -0.004103797633601938)
('koreana', -0.01925866052834403)
('polish', 0.03687007452904677)
('english', -0.1714849569535974)


In [66]:
# Now we'll make a list of columns we really don't need and take them out.
# The per-language count columns aren't needed any more either, so they are
# appended to the list.
columns_to_drop = [
    'title',
    'positive_review_percent',
    'game_page_link',
    'developer',
    'publisher',
    'description',
    "number_of_reviews",
    "interface_languages",
    "full_audio_languages",
    "subtitles_languages",
    "relevant_langs_reviews_sum",
] + list(all_languages)

# Only drop the columns that actually exist in the frame.
present_columns_to_drop = [col for col in games_df.columns if col in columns_to_drop]
games_df = games_df.drop(columns=present_columns_to_drop)

games_df.columns

Index(['app_id', 'release_date', 'price', 'tags', 'date_scraped',
       'mod_interface_languages', 'mod_full_audio_languages',
       'mod_subtitles_languages', 'price_category', 'comment_ratios',
       'comment_diff_agnostic', 'comment_diff_any', 'comment_diff_interface',
       'comment_diff_audio', 'comment_diff_subtitles',
       'language_comment_counts'],
      dtype='object')

In [67]:
# The columns don't seem to be in a logical order. Let's fix that manually.
# NOTE: reindex keeps only the columns named here, so any column left off the
# list (date_scraped is one) is removed from the frame by this step.
identifier_columns = ['app_id', 'release_date', 'price', 'price_category', 'tags']
localization_columns = ['mod_interface_languages', 'mod_full_audio_languages', 'mod_subtitles_languages']
comment_columns = ['language_comment_counts', 'comment_ratios', 'comment_diff_agnostic', 'comment_diff_any',
                   'comment_diff_interface', 'comment_diff_audio', 'comment_diff_subtitles']
new_column_order = identifier_columns + localization_columns + comment_columns

games_df = games_df.reindex(columns=new_column_order)

games_df.columns

Index(['app_id', 'release_date', 'price', 'price_category', 'tags',
       'mod_interface_languages', 'mod_full_audio_languages',
       'mod_subtitles_languages', 'language_comment_counts', 'comment_ratios',
       'comment_diff_agnostic', 'comment_diff_any', 'comment_diff_interface',
       'comment_diff_audio', 'comment_diff_subtitles'],
      dtype='object')

Step 7: Save and quit
---

In [68]:
# Write the pared-down, wrangled frame to the processed-data folder as JSON.
games_df.to_json('../data/processed/1 - Games DF - Wrangled.json')