## Data preparation for Steam datasets

In [162]:
import pandas as pd
import pickle
import string
import nltk
import re      
import ast
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import MultiLabelBinarizer

### PLAYTIME DATA PREP

In [163]:
# load steam_playtime.csv
users_df = pd.read_csv('../data/steam_playtime.csv')

In [164]:
len(users_df)

2731661

user playtimes inliers max/min values based on z-score and iqr as shown in data analysis<br>
z_max = ~128000<br>
z_min = 1<br>
iqr_max = ~1600<br>
iqr_min = 1

In [165]:
# drop counterstrike i.e. drop rows where appid is 730, 10 or 240
# users_df = users_df[~users_df['appid'].isin([730, 10, 240])]

# remove user playtimes over x minutes
users_df = users_df[users_df['playtime_forever'] < 1700]

In [166]:
# remove user playtimes under x minutes
#users_df = users_df[users_df['playtime_forever'] > 10]

In [167]:
# drop ridiculous playtime_2weeks values
users_df = users_df[users_df['playtime_2weeks'] < 5000]

In [168]:
# RUN THIS IF YOU WANT TO DROP GAMES WITH ONLY ONE PLAYTIME
# group by appid and get count, mean, median, min, max
playtime_by_game = users_df.groupby('appid').agg({'playtime_forever': ['count', 'mean', 'median', 'min', 'max']}).round(2)
# get appids where count is 1
only_one_playtime = playtime_by_game[playtime_by_game[('playtime_forever', 'count')] == 1].index
# exclude games with only one playtime
users_df = users_df[~users_df['appid'].isin(only_one_playtime)]

### GAME DATA PREP

In [169]:
# load steam_app_metadata.csv
games_df = pd.read_csv('../data/steam_app_metadata.csv')

In [170]:
# remove duplicate appid
games_df = games_df.drop_duplicates(subset=['appid'])

In [171]:
# get duplicates for name
dupl_game_names = games_df[games_df.duplicated(subset=['name'])]

In [172]:
# rename appid's in users_df that are duplicates to original appid
for index, row in dupl_game_names.iterrows():
    dupl_appid = row['appid']
    orig_appid = games_df[games_df['name'] == row['name']]['appid'].iloc[0]
    users_df.loc[users_df['appid'] == dupl_appid, 'appid'] = orig_appid

In [173]:
# remove duplicate game names from games_df
games_df = games_df.drop_duplicates(subset=['name'])

In [174]:
no_description = games_df[games_df['description'].isnull() | (games_df['description'] == '')]
games_df = games_df[~games_df['appid'].isin(no_description['appid'])]

In [175]:
# remove games with no publisher or developer
no_dev_or_pub = games_df[(games_df['developer'] == 'None') & games_df['publisher'].isnull()]
games_df = games_df[~games_df['appid'].isin(no_dev_or_pub['appid'])]

In [176]:
# Fill NaN for publisher with developer
games_df['publisher'] = games_df['publisher'].fillna(games_df['developer'])

In [177]:
# remove games from games_df that are not in users_df
games_df = games_df[games_df['appid'].isin(users_df['appid'].unique())]

In [178]:
# create games df for flask app
games_flask_df = games_df[['appid', 'name', 'developer', 'publisher', 'description']]

In [179]:
# prepend developer to description_clean
games_df['description'] = games_df['developer'] + ' ' + games_df['description']

#### one hot encode categories and genres

In [181]:
# make categories and genres into lists
games_df['categories'] = games_df['categories'].apply(ast.literal_eval)
games_df['genres'] = games_df['genres'].apply(ast.literal_eval)

In [182]:
# games_flask_df genres and categories lists as comma separated strings into games_flask_df
games_flask_df['categories'] = games_df['categories'].apply(lambda x: ', '.join(x))
games_flask_df['genres'] = games_df['genres'].apply(lambda x: ', '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_flask_df['categories'] = games_df['categories'].apply(lambda x: ', '.join(x))


In [183]:
# multilabel binarizer
mlb = MultiLabelBinarizer()

In [184]:
# encode categories 
games_df = games_df.join(pd.DataFrame(mlb.fit_transform(games_df.pop('categories')),
                          columns=mlb.classes_,
                          index=games_df.index))

In [185]:
# encode genres
games_df = games_df.join(pd.DataFrame(mlb.fit_transform(games_df.pop('genres')),
                          columns=mlb.classes_,
                          index=games_df.index))

#### process description column NLP style

In [186]:
# fill 'description' column NaN with empty string, NO LONGER NECESSARY?
games_df['description'] = games_df['description'].fillna('')

In [187]:
stopwords = nltk.corpus.stopwords.words('english')
porter = PorterStemmer()

In [188]:
# lemmatizer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [189]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/berry/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [190]:
tag_regex = re.compile(r'<[^>]+>')

In [191]:
# function to clean text for flask app
def clean_text_for_app(text):
    # remove tags
    text = tag_regex.sub(' ', text)
    # remove URL links
    text = re.sub(r'http\S+', ' ', text)
    # remove hex
    text = re.sub(r'[^\x00-\x7f]',r'', text)
    return text


In [192]:
# function to clean text
def clean_text(text):
    # remove tags
    text = tag_regex.sub(' ', text)
    # remove URL links
    text = re.sub(r'http\S+', ' ', text)
    # remove non-breaking space
    text = text.replace(u'\xa0', u' ')
    # remove hex
    text = re.sub(r'[^\x00-\x7f]',r'', text)
    # hyphens to space
    text = text.replace('-', ' ')
    # remove "&quot;
    text = text.replace('&quot;', '')
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # remove numbers
    text = text.translate(str.maketrans('', '', string.digits))
    # make lowercase
    text = text.lower()
    # tokenize text
    tokens = nltk.word_tokenize(text)
    # remove stopwords
    tokens = [w for w in tokens if w not in stopwords]
    # lemmatize words
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    # stem words
    tokens = [porter.stem(w) for w in tokens]
    # make tokens into string
    text = ' '.join(tokens)
    # return clean text
    return text

In [193]:
# apply clean_text function to description column
games_df['description_clean'] = games_df['description'].apply(clean_text)

In [194]:
games_flask_df['description'] = games_flask_df['description'].apply(clean_text_for_app)

#### Save some dataframes and other stuff

In [195]:
users_df = users_df.drop(columns= ['playtime_2weeks'])

In [196]:
# drop rows from users_df where appid is not in games_df
# playtime for games is still in user profiles even if a game is no longer on steam
users_df = users_df[users_df['appid'].isin(games_df['appid'])]

In [197]:
# check datasets appid length
len(users_df['appid'].unique()) == len(games_df['appid'].unique() == len(games_df))

True

In [198]:
# save users_df to csv
users_df.to_csv('../data/steam_playtime_clean.csv', index=False)

In [199]:
# drop features not needed for model
games_df = games_df.drop(columns = ['developer', 'publisher', 'description'])

In [200]:
# reset games_df index
games_df = games_df.reset_index(drop=True)

In [201]:
# save games_df to pickle
games_df.to_pickle('../data/steam_games_clean.pkl')

In [202]:
# save a version for flask_app
games_flask_df.to_csv('../data/steam_metadata_flask.csv', index=False)