## Data preparation for Steam datasets

In [32]:
import pandas as pd
import string
import nltk
import re      
import ast
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import MultiLabelBinarizer

### PLAYTIME DATA PREP

In [33]:
# load steam_playtime.csv
users_df = pd.read_csv('../data/steam_playtime.csv')

In [34]:
len(users_df)

2641446

In [35]:
# drop counterstrike i.e. drop rows where appid is 730, 10 or 240
users_df = users_df[~users_df['appid'].isin([730, 10, 240])]

# remove user playtimes over x minutes
users_df = users_df[users_df['playtime_forever'] < 130000]

In [36]:
# remove user playtimes under x minutes
users_df = users_df[users_df['playtime_forever'] > 10]

In [37]:
# drop ridiculous playtime_2weeks values
users_df = users_df[users_df['playtime_2weeks'] < 5000]

In [38]:
# RUN THIS IF YOU WANT TO DROP GAMES WITH ONLY ONE PLAYTIME
# group by appid and get count, mean, median, min, max
playtime_by_game = users_df.groupby('appid').agg({'playtime_forever': ['count', 'mean', 'median', 'min', 'max']}).round(2)
# get appids where count is 1
only_one_playtime = playtime_by_game[playtime_by_game[('playtime_forever', 'count')] == 1].index
# exclude games with only one playtime
users_df = users_df[~users_df['appid'].isin(only_one_playtime)]

### GAME DATA PREP

In [39]:
# load steam_app_metadata.csv
games_df = pd.read_csv('../data/steam_app_metadata.csv')

In [40]:
# remove duplicate appid
games_df = games_df.drop_duplicates(subset=['appid'])

In [41]:
# get duplicates for name
dupl_game_names = games_df[games_df.duplicated(subset=['name'])]

In [42]:
# rename appid's in users_df that are duplicates to original appid
for index, row in dupl_game_names.iterrows():
    dupl_appid = row['appid']
    orig_appid = games_df[games_df['name'] == row['name']]['appid'].iloc[0]
    users_df.loc[users_df['appid'] == dupl_appid, 'appid'] = orig_appid

In [43]:
# remove duplicate game names from games_df
games_df = games_df.drop_duplicates(subset=['name'])

In [44]:
no_description = games_df[games_df['description'].isnull() | (games_df['description'] == '')]
games_df = games_df[~games_df['appid'].isin(no_description['appid'])]

In [45]:
no_dev_or_pub = games_df[(games_df['developer'] == 'None') & games_df['publisher'].isnull()]
games_df = games_df[~games_df['appid'].isin(no_dev_or_pub['appid'])]

In [46]:
# Fill NaN for publisher with developer
games_df['publisher'] = games_df['publisher'].fillna(games_df['developer'])

In [47]:
# remove games from games_df that are not in users_df
games_df = games_df[games_df['appid'].isin(users_df['appid'].unique())]

In [48]:
games_df

Unnamed: 0,appid,name,description,developer,publisher,categories,genres
1,300,Day of Defeat: Source,Day of Defeat offers intense online action gam...,Valve,Valve,"['Multi-player', 'Cross-Platform Multiplayer',...",['Action']
2,320,Half-Life 2: Deathmatch,Fast multiplayer action set in the Half-Life 2...,Valve,Valve,"['Multi-player', 'Valve Anti-Cheat enabled', '...",['Action']
3,340,Half-Life 2: Lost Coast,Originally planned as a section of the Highway...,Valve,Valve,"['Single-player', 'Commentary available', 'Rem...",['Action']
4,10180,Call of Duty®: Modern Warfare® 2,The most-anticipated game of the year and the ...,Infinity Ward,Activision,"['Single-player', 'Multi-player', 'Co-op', 'St...",['Action']
6,550,Left 4 Dead 2,"Set in the zombie apocalypse, Left 4 Dead 2 (L...",Valve,Valve,"['Single-player', 'Multi-player', 'PvP', 'Onli...",['Action']
...,...,...,...,...,...,...,...
25916,2096610,Crysis 3 Remastered,Experience the single-player experience from ...,Crytek,Crytek,"['Single-player', 'Steam Achievements', 'Full ...",['Action']
25918,438450,3DF Zephyr Lite Steam Edition,3DF Zephyr allows you to automatically and eas...,3Dflow SRL,3Dflow SRL,['Steam Trading Cards'],"['Animation & Modeling', 'Design & Illustratio..."
25925,1416050,Shovel Knight Dig,When Drill Knight and his dastardly digging cr...,Nitrome,Yacht Club Games,"['Single-player', 'Steam Achievements', 'Full ...","['Action', 'Adventure', 'Indie', 'Strategy']"
25928,1894430,Plan B: Terraform,<h1>CHECK OUT OUR PREVIOUS GAME</h1><p><a href...,Gaddy Games,Gaddy Games,"['Single-player', 'Steam Cloud']","['Indie', 'Simulation', 'Strategy', 'Early Acc..."


In [49]:
len(games_df)

19818

In [50]:
# create games df for flask app
games_flask_df = games_df[['appid', 'name', 'developer', 'publisher', 'description', 'genres', 'categories']]

#### one hot encode categories and genres

In [51]:
# make categories and genres into lists
games_df['categories'] = games_df['categories'].apply(ast.literal_eval)
games_df['genres'] = games_df['genres'].apply(ast.literal_eval)

In [52]:
# multilabel binarizer
mlb = MultiLabelBinarizer()

In [53]:
# encode categories 
games_df = games_df.join(pd.DataFrame(mlb.fit_transform(games_df.pop('categories')),
                          columns=mlb.classes_,
                          index=games_df.index))

In [54]:
# encode genres
games_df = games_df.join(pd.DataFrame(mlb.fit_transform(games_df.pop('genres')),
                          columns=mlb.classes_,
                          index=games_df.index))

#### process description column NLP style

In [55]:
# fill 'description' column NaN with empty string, NO LONGER NECESSARY?
games_df['description'] = games_df['description'].fillna('')

In [56]:
stopwords = nltk.corpus.stopwords.words('english')
porter = PorterStemmer()

In [57]:
# lemmatizer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [58]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/berry/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [59]:
# function to clean text for flask app
def clean_text_for_app(text):
    # remove tags
    text = re.sub(r'<.*?>',' ',text)
    # remove URL links
    text = re.sub(r'http\S+', ' ', text)
        # remove hex
    text = re.sub(r'[^\x00-\x7f]',r'', text)
    return text


In [60]:
# function to clean text
def clean_text(text):
    # remove tags
    text = re.sub(r'<.*?>',' ',text)
    # remove URL links
    text = re.sub(r'http\S+', ' ', text)
    # remove non-breaking space
    text = text.replace(u'\xa0', u' ')
    # remove hex
    text = re.sub(r'[^\x00-\x7f]',r'', text)
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # remove numbers
    text = re.sub(r'[0-9]+', '', text)
    # make lowercase
    text = text.lower()
    # tokenize text
    tokens = nltk.word_tokenize(text)
    # remove stopwords
    tokens = [w for w in tokens if w not in stopwords]
    # lemmatize words
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    # stem words
    # tokens = [porter.stem(w) for w in tokens]
    # make tokens into string
    text = ' '.join(tokens)
    # return clean text
    return text

In [61]:
# apply clean_text function to description column
games_df['description_clean'] = games_df['description'].apply(clean_text)

In [62]:
games_flask_df['description'] = games_flask_df['description'].apply(clean_text_for_app)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_flask_df['description'] = games_flask_df['description'].apply(clean_text_for_app)


#### Save some dataframes and other stuff

In [63]:
users_df = users_df.drop(columns= ['playtime_2weeks'])

In [64]:
# drop rows from users_df where appid is not in games_df
# playtime for games is still in user profiles even if a game is no longer on steam
users_df = users_df[users_df['appid'].isin(games_df['appid'])]

In [65]:
len(users_df)

2245369

In [66]:
# save users_df to csv
users_df.to_csv('../data/steam_playtime_clean.csv', index=False)

In [67]:
# drop name, developer and publisher from games_df
# games_df = games_df.drop(columns = ['name', 'developer', 'publisher', 'description'])

In [68]:
# save games_df to csv
games_df.to_csv('../data/steam_app_metadata_clean.csv', index=False)

In [69]:
# save a version for flask_app
games_flask_df.to_csv('../data/steam_metadata_flask.csv', index=False)