In [None]:
#default_exp steam_data

In [None]:
#export 

import scipy
import pandas as pd
import tqdm

from sklearn import compose, feature_extraction, metrics
from functools import reduce, partial
import numpy as np
import attr
import seaborn as sns
import re

In [None]:
#export

# Default input CSV locations. The paths are relative; the notebook runs
# `%cd ..` before reading them.
METACRITIC_GAME_METADATA_PATH = 'data/metacritic-video-game-comments/metacritic_game_info.csv'
# Read by load_steam_df (per-game metadata).
STEAM_GAME_METADATA_PATH = 'data/steam_games_metadata/steam_games.csv'
# Read by load_steam_ratings_df (per-user ownership/play records).
STEAM_GAME_RATINGS_PATH = 'data/steam-video-games/steam-200k.csv'

# Steam data

In [None]:
%cd ..

In [None]:
!ls data/steam_games_metadata/

In [None]:
!ls data/steam-video-games/

In [None]:
raw_steam_df = pd.read_csv(STEAM_GAME_METADATA_PATH)

# Preprocessing

The two datasets we use:

- trolukovich/steam-games-complete-dataset
- tamber/steam-video-games

Their game names are incompatible, so we need to normalize them.

In [None]:
# export

# Punctuation and trademark symbols that normalize_name replaces with a space.
# The hyphen is escaped on purpose: an unescaped `®-–` inside a character class
# is a code-point RANGE (U+00AE..U+2013), which also matches accented letters,
# Greek, Cyrillic, etc. and would blank them out of game names.
characters_replaced_with_space_regex = re.compile(r'[:,®\-–"™]')
# Substrings that normalize_name removes outright (matched case-sensitively).
dropped_patterns_regex = re.compile(r'beta|demo')


def normalize_numerals(name):
    """Replace standalone lowercase Roman numerals (ii, iii, iv, v, vi) with digits.

    A numeral is only converted when it is preceded by a space and followed by
    a space or the end of the string, so numerals at the very start of the name
    or embedded in a longer word are left alone.

    Parameters
    ----------
    name : str
        Game name; expected to be lowercased already, since the numerals are
        matched case-sensitively.

    Returns
    -------
    str
        The name with matching numerals replaced by '2', '3', '4', '5' or '6'.
    """
    numeral_replacements = (
        ('ii', '2'),
        ('iii', '3'),
        ('iv', '4'),
        ('vi', '6'),
        ('v', '5'),
    )
    normalized_name = name
    for numeral, digit in numeral_replacements:
        # A lookahead (instead of a captured group) leaves the trailing space
        # unconsumed, so the same numeral repeated back-to-back ("ii ii") is
        # converted every time rather than only on the first occurrence.
        numeral_pattern = ' ' + numeral + r'(?= |$)'
        normalized_name = re.sub(numeral_pattern, ' ' + digit, normalized_name)
    return normalized_name


def normalize_name(name):
    """Normalize a game title so the same game gets the same key in both datasets.

    Steps, in order:
      1. replace the characters in `characters_replaced_with_space_regex` with spaces;
      2. remove matches of `dropped_patterns_regex`;
      3. lowercase;
      4. apply the literal replacements listed below;
      5. collapse whitespace runs, convert Roman numerals, strip.

    Parameters
    ----------
    name : str
        Raw game title.

    Returns
    -------
    str
        Lowercased, stripped, normalized title.
    """
    cleaned_name = re.sub(characters_replaced_with_space_regex, ' ', name)
    # NOTE(review): this still runs on the original casing, so only lowercase
    # "beta"/"demo" are removed. Making it case-insensitive would need word
    # boundaries in the pattern (otherwise "Demon" loses its "demo") - confirm
    # the intended behavior before changing it.
    cleaned_name = re.sub(dropped_patterns_regex, '', cleaned_name)

    # Lowercase BEFORE the literal replacements: the patterns below are all
    # lowercase, so on title-cased input ("Gold Edition", "DON'T", "OSX") they
    # never matched when lowercasing happened last.
    cleaned_name = cleaned_name.lower()

    literal_replacements = (
        ('!', ' '),
        ('(', ' '),
        (')', ' '),
        ('+', ' '),
        ('-', ' '),
        ('&', 'and'),
        # NOTE(review): the original chain listed '’' twice; possibly a
        # different quote character was intended for one of them - confirm.
        ('’', "'"),
        ('`', "'"),
        ("'s", 's'),
        ("'t", 't'),
        ('gold edition', 'gold'),
        ('gold pack', 'gold'),
        ('osx', ''),
    )
    # Order matters: curly quotes/backticks become "'" before "'s"/"'t" are handled.
    for old, new in literal_replacements:
        cleaned_name = cleaned_name.replace(old, new)

    name_without_multiple_whitespaces = re.sub(r'\s+', ' ', cleaned_name)
    return normalize_numerals(name_without_multiple_whitespaces).strip()


def normalize_names(name_col):
    """Apply `normalize_name` element-wise to a pandas Series of game titles."""
    normalized_col = name_col.apply(normalize_name)
    return normalized_col

In [None]:
# Sanity checks: possessive "'s" is collapsed and the Roman numeral "IV" becomes "4",
# both at the end of the title and when followed by another word.
assert normalize_name("Sid Meier's Civilization IV") == 'sid meiers civilization 4'
assert normalize_name("Sid Meier's Civilization IV Warlords") == 'sid meiers civilization 4 warlords'

# Basic information

In [None]:
raw_steam_df.head()

In [None]:
raw_steam_df.info()

# Content without names

These correspond to Steam product bundles. They aren't really games, so we drop them.

In [None]:
raw_steam_df[raw_steam_df['name'].isna()].head()

# Data cleaning

We remove entries that have a null in any of the text fields we use.

In [None]:
#export

def clean_steam_df(raw_steam_df):
    """Drop incomplete rows from the raw Steam metadata and normalize its text.

    Rows with a null in any of the text columns are removed, the index is
    reset to 0..n-1, the tag-like columns are lowercased, and `name` is
    normalized (the original title is kept in `name_unnormalized`).

    Parameters
    ----------
    raw_steam_df : pandas.DataFrame
        Must contain the columns listed in `text_columns` below.

    Returns
    -------
    pandas.DataFrame
        A new frame; `raw_steam_df` is not modified.
    """
    text_columns = [
        'name',
        'game_description',
        'game_details',
        'popular_tags',
        'desc_snippet',
        'genre'
    ]
    tag_cols = ['popular_tags', 'game_details', 'genre']

    has_missing_text = raw_steam_df[text_columns].isna().any(axis=1)

    # Filter first and copy the result: this copies only the kept rows and
    # gives an independent frame, so the column assignments below do not
    # write into a slice of the caller's frame.
    steam_df = raw_steam_df.loc[~has_missing_text].copy()
    steam_df.index = pd.RangeIndex(0, len(steam_df))

    for c in tag_cols:
        steam_df[c] = steam_df[c].str.lower()

    steam_df['name_unnormalized'] = steam_df['name']
    steam_df['name'] = normalize_names(steam_df['name'])
    return steam_df


def load_steam_df(steam_games_metadata_path=STEAM_GAME_METADATA_PATH):
    """Read the Steam metadata CSV at the given path and return it cleaned by `clean_steam_df`."""
    raw_metadata_df = pd.read_csv(steam_games_metadata_path)
    return clean_steam_df(raw_metadata_df)


def get_games_by_name(steam_df, substr):
    """Return the rows of `steam_df` whose `name` contains `substr`.

    `substr` is passed to `Series.str.contains` unchanged, so with pandas'
    default settings it is interpreted as a regular expression.
    """
    name_matches = steam_df['name'].str.contains(substr)
    return steam_df[name_matches]

In [None]:
steam_df = load_steam_df() 

In [None]:

steam_df['name']

In [None]:
chosen_games_substring = 's.t.a.l.k'

In [None]:
chosen_games_df = get_games_by_name(steam_df, chosen_games_substring) 

In [None]:
chosen_games_df['name']

In [None]:
chosen_games_df.iloc[0]['popular_tags']

In [None]:
chosen_games_df.iloc[0]['game_details']

In [None]:
chosen_games_df.iloc[0]['genre']

In [None]:
chosen_games_df['game_description'].iloc[0]

In [None]:
chosen_games_df['popular_tags'].iloc[0]

In [None]:
game_descriptions = steam_df['game_description']

# Steam200k

In [None]:
!head $STEAM_GAME_RATINGS_PATH

In [None]:
# export


def load_steam_ratings_df(steam_ratings_path=STEAM_GAME_RATINGS_PATH):
    """Load the steam-200k ratings CSV and derive the modelling columns.

    The five CSV columns are renamed to `user_id`, `name`, `ownership`,
    `hours` and `useless`; the last one is dropped before returning.

    Added columns:
      - `log_hours`: `log1p(hours)`
      - `played`: 1 where `ownership == 'play'`, else 0
      - `interacted`: constant 1
      - `name_unnormalized`: the original title (`name` itself is normalized)

    Parameters
    ----------
    steam_ratings_path : str
        Path of the CSV to read.

    Returns
    -------
    pandas.DataFrame
    """
    # Read from the argument; previously the module constant was always used,
    # so a caller-supplied path was silently ignored.
    # NOTE(review): read_csv treats the first line as a header by default and
    # the columns are overwritten right below. If the file has no header row,
    # its first record is lost - confirm and pass header=None if so.
    steam_ratings_df = pd.read_csv(steam_ratings_path)
    steam_ratings_df.columns = ['user_id', 'name', 'ownership', 'hours', 'useless']
    steam_ratings_df['log_hours'] = np.log1p(steam_ratings_df['hours'])
    steam_ratings_df['played'] = 1 * (steam_ratings_df['ownership'] == 'play')
    steam_ratings_df['interacted'] = 1
    steam_ratings_df['name_unnormalized'] = steam_ratings_df['name']
    steam_ratings_df['name'] = normalize_names(steam_ratings_df['name'])
    return steam_ratings_df.drop(axis='columns', columns=['useless'])


In [None]:
#export


@attr.s
class RecommenderDataset:
    """Ratings frame and its target series, kept side by side.

    Instances are normally built with `make_collaborative_filtering_dataset`
    or `make_implicit_feedback_dataset`.
    """

    # Ratings frame; the factory methods store it without the target column.
    data = attr.ib()
    # Series of target values; the factory methods take it from `target_col`.
    target = attr.ib()
    # Name of the column the target was taken from.
    target_col = attr.ib(default='log_hours')

    def filter_out_insufficient_reviews(self, col, threshold):
        """Return a new dataset keeping only rows whose `col` value occurs often enough.

        A row is kept when its value in `col` appears in at least `threshold`
        rows of `self.data` (see `_get_column_with_sufficient_reviews`).
        `self` is left unchanged.
        """
        is_number_of_reviews_sufficient = self._get_column_with_sufficient_reviews(self.data, col, threshold)
        new_data = self.data[is_number_of_reviews_sufficient]
        new_target = self.target[is_number_of_reviews_sufficient]
        return RecommenderDataset(new_data, new_target, self.target_col)

    @classmethod
    def _get_column_with_sufficient_reviews(cls, ratings_df, col, threshold):
        """Return a boolean Series marking rows whose `col` value has >= `threshold` records.

        The per-value count is the non-null count of the first remaining
        column after grouping by `col`.
        """
        game_review_counts = ratings_df.groupby(col).agg('count').iloc[:,0]
        return ratings_df[col].isin(
            game_review_counts.index[game_review_counts >= threshold]
        )

    @staticmethod
    def make_collaborative_filtering_dataset(steam_ratings_df, target_col='log_hours'):
        """
        Build a dataset in which `target_col` makes sense as a target:
        only records with `played == 1` are kept, so purchase-only records
        are discarded.
        """
        steam_ratings_df = steam_ratings_df[steam_ratings_df['played'] == 1]
        target = steam_ratings_df[target_col]
        data = steam_ratings_df.drop(columns=[target_col])
        return RecommenderDataset(data, target, target_col)

    @staticmethod
    def make_implicit_feedback_dataset(steam_ratings_df, target_col='log_hours'):
        """
        Build a dataset in which `target_col` is an implicit-feedback target:
        players that bought a game and didn't play it are treated as negative
        examples.

        For every (name, user_id) pair with a single record that record is
        kept; for pairs with several records only the `ownership == 'play'`
        ones are kept. The result is re-indexed from 0.
        """
        filtered_steam_ratings_df = steam_ratings_df.groupby(['name', 'user_id']).apply(
            lambda df: df if len(df) == 1 else df[df['ownership'] == 'play']
        )
        filtered_steam_ratings_df.index = pd.RangeIndex(len(filtered_steam_ratings_df))
        target = filtered_steam_ratings_df[target_col]
        data = filtered_steam_ratings_df.drop(columns=[target_col])
        return RecommenderDataset(data, target, target_col)

In [None]:
#export


def get_item_user_matrix(dataset):
    """Build a sparse item x user matrix of target values.

    Rows are game names and columns are user ids (both in the order produced
    by `pivot_table`); (game, user) pairs without a record are 0.

    Parameters
    ----------
    dataset : object with `data` (frame with `name` and `user_id` columns),
        `target` (Series) and, optionally, `target_col` (defaults to
        'log_hours' when absent).

    Returns
    -------
    scipy.sparse.csr_matrix
    """
    steam_df, target = dataset.data, dataset.target
    # Honor the dataset's target column instead of hard-coding 'log_hours',
    # which raised KeyError for datasets built with another target_col.
    target_col = getattr(dataset, 'target_col', 'log_hours')
    ratings_df = pd.concat([steam_df, target], axis=1)
    item_user_pivoted_df = ratings_df.pivot_table(index='name', columns='user_id', values=target_col)
    item_user_matrix = scipy.sparse.csr_matrix(item_user_pivoted_df.fillna(0))
    return item_user_matrix


def get_item_user_matrix_train_val_split(dataset,  train_labels, labels_val):
    """Build user x item train and validation matrices from positional row splits.

    The full item x user pivot is computed once. The training matrix is that
    pivot with every validation (game, user) cell set to 0, restricted to the
    users that appear in the training rows. The validation matrix contains
    only the validation cells (everything else 0), restricted to the users
    that appear in the validation rows. Both are transposed before returning.

    Parameters
    ----------
    dataset : object with `data` (frame with `name` and `user_id` columns),
        `target` (Series) and, optionally, `target_col` (defaults to
        'log_hours' when absent).
    train_labels : positional row indices (used with `.iloc`) of training records.
    labels_val : positional row indices (used with `.iloc`) of validation records.

    Returns
    -------
    tuple
        `(train_matrix, val_matrix)`, the transposes of the two CSR matrices.
    """
    steam_df, target = dataset.data, dataset.target
    target_col = getattr(dataset, 'target_col', 'log_hours')
    ratings_df = pd.concat([steam_df, target], axis=1)
    item_user_pivoted_df = ratings_df.pivot_table(index='name', columns='user_id', values=target_col)

    val_ratings_df = ratings_df.iloc[labels_val]
    train_users = steam_df.iloc[train_labels]['user_id'].unique()
    val_users = steam_df.iloc[labels_val]['user_id'].unique()

    # Hide the validation cells from the training matrix.
    # (A previous, immediately overwritten statement selected the pivot's
    # user columns by *row labels*, which raises KeyError on current pandas;
    # it has been removed.)
    for __, row in tqdm.tqdm(val_ratings_df.iterrows()):
        item_user_pivoted_df.loc[row['name'], row['user_id']] = 0
    user_game_matrix_train = scipy.sparse.csr_matrix(item_user_pivoted_df.loc[:, train_users].fillna(0))

    # First recorded target value per (user, game), built in one pass instead
    # of scanning the whole ratings frame for every validation row.
    first_rating = {}
    for user_id, name, value in zip(ratings_df['user_id'], ratings_df['name'], ratings_df[target_col]):
        first_rating.setdefault((user_id, name), value)

    # Reuse the pivot's shape and labels for the validation matrix.
    item_user_pivoted_df.iloc[:,:] = 0
    for __, row in tqdm.tqdm(val_ratings_df.iterrows()):
        item_user_pivoted_df.loc[row['name'], row['user_id']] = first_rating[(row['user_id'], row['name'])]
    user_game_matrix_val = scipy.sparse.csr_matrix(item_user_pivoted_df.loc[:, val_users].fillna(0))

    return user_game_matrix_train.T, user_game_matrix_val.T

In [None]:
steam_df.describe()

In [None]:
steam_ratings_df = load_steam_ratings_df()
steam_ratings_df.groupby('user_id').agg('count')['name'].describe()

In [None]:
steam_ratings_df['played'].hist()

In [None]:
steam_ratings_df['hours'].min()

In [None]:
sns.distplot(steam_ratings_df['log_hours'], kde=False)

In [None]:
#ds = RecommenderDataset.make_implicit_feedback_dataset(steam_ratings_df)

In [None]:
#ds.data.index = pd.RangeIndex(len(ds.data))

In [None]:
steam_ratings_df.head()

In [None]:
len(set(steam_ratings_df.name))

In [None]:
len(set(steam_df.name))

In [None]:
# Names of rated games that have no entry in the metadata frame.
# NOTE(review): both `name` columns were already normalized when the frames
# were loaded (clean_steam_df / load_steam_ratings_df), so normalize_names is
# applied a second time here.
games_with_metadata = set(normalize_names(steam_df['name']))
games_with_reviews = set(normalize_names(steam_ratings_df['name']))
games_without_metadata = games_with_reviews - games_with_metadata

print(len(games_without_metadata))

In [None]:
steam_ratings_df.shape

Ratings for games that have metadata

In [1]:
#export


def filter_ratings_with_metadata(steam_ratings_df, steam_df=None):
    """Keep only the ratings whose game also appears in the metadata frame.

    Games are matched on their normalized names (`normalize_names`).

    Parameters
    ----------
    steam_ratings_df : pandas.DataFrame
        Ratings with a `name` column.
    steam_df : pandas.DataFrame, optional
        Game metadata with a `name` column; loaded with `load_steam_df()`
        when omitted.

    Returns
    -------
    pandas.DataFrame
        The matching rows of `steam_ratings_df`, columns unchanged.
    """
    if steam_df is None:
        steam_df = load_steam_df()
    games_with_metadata = set(normalize_names(steam_df['name']))
    # Test membership on the *normalized* rating names. Previously the set was
    # built from normalized names but the raw `name` column was tested against
    # it, so rows with un-normalized names were never filtered out.
    normalized_rating_names = normalize_names(steam_ratings_df['name'])
    return steam_ratings_df[normalized_rating_names.isin(games_with_metadata)]


In [None]:
steam_ratings_with_metadata_df = filter_ratings_with_metadata(steam_ratings_df)
steam_ratings_with_metadata_df.shape

In [None]:
from fuzzywuzzy import fuzz
from operator import itemgetter
import tqdm

# Minimum similarity (fuzz.ratio scaled to 0-1) for a near-miss to be reported.
FUZZY_MATCH_THRESHOLD = 0.85

# Build the candidate set once; it does not change between iterations.
metadata_names = set(steam_df.name)

# For every rated game without metadata, print its closest metadata name when
# the two are nearly identical (likely a normalization miss).
for game_name in games_without_metadata:
    # Only the best candidate is needed, so take the max instead of sorting all of them.
    best_match = max(
        ((name, fuzz.ratio(game_name, name) / 100) for name in metadata_names),
        key=itemgetter(1),
        default=None,
    )
    if best_match is not None and best_match[1] > FUZZY_MATCH_THRESHOLD:
        print(game_name, best_match)

In [None]:
steam_ratings_df[steam_ratings_df.name.isin(games_without_metadata)].groupby('name').agg('count')

In [None]:
games_without_metadata

In [None]:
# Per-game record counts for rated games without metadata, matching on the
# lowercased name with ':' removed.
# NOTE(review): `name` was already normalized by load_steam_ratings_df, so the
# extra lower()/replace() presumably changes nothing - confirm.
steam_ratings_df[steam_ratings_df.name.str.lower().str.replace(':','').isin(games_without_metadata)].groupby('name').agg('count')

In [None]:
# NOTE(review): this cell repeats the previous cell's query verbatim.
steam_ratings_df[steam_ratings_df.name.str.lower().str.replace(':','').isin(games_without_metadata)].groupby('name').agg('count')