## Data preparation for Steam datasets

In [1]:
import pandas as pd
import string
import nltk
import re
import ast
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import MultiLabelBinarizer

### PLAYTIME DATA PREP

In [6]:
# Read the raw per-user playtime records; the cells below filter them on the
# appid, playtime_forever and playtime_2weeks columns.
users_df = pd.read_csv(filepath_or_buffer='../data/steam_playtime.csv')

In [12]:
# Number of playtime rows before any filtering.
users_df.shape[0]

2329937

In [7]:
# Keep a row only when BOTH conditions hold:
#   * its appid is not one of the Counter-Strike titles (730, 10, 240)
#   * its lifetime playtime is below 130000 minutes
users_df = users_df.loc[
    ~users_df['appid'].isin([730, 10, 240])
    & users_df['playtime_forever'].lt(130000)
]

In [9]:
# Keep only rows with more than 10 minutes of lifetime playtime.
users_df = users_df.loc[users_df['playtime_forever'].gt(10)]

In [10]:
# Discard rows whose two-week playtime is implausibly high (5000 or more).
# Rows with a missing playtime_2weeks fail the comparison and are dropped too.
users_df = users_df.loc[users_df['playtime_2weeks'].lt(5000)]

In [11]:
# OPTIONAL CELL: run it only if games with a single playtime should be dropped.
# Per-game summary statistics of lifetime playtime, rounded to two decimals.
playtime_by_game = (
    users_df
    .groupby('appid')
    .agg({'playtime_forever': ['count', 'mean', 'median', 'min', 'max']})
    .round(2)
)
# appids that appear in exactly one playtime row
only_one_playtime = playtime_by_game.index[
    playtime_by_game[('playtime_forever', 'count')].eq(1).to_numpy()
]
# keep every row whose game is not in that set
users_df = users_df.loc[~users_df['appid'].isin(only_one_playtime)]

### GAME DATA PREP

In [13]:
# Read the raw per-game metadata (one row per Steam app).
games_df = pd.read_csv(filepath_or_buffer='../data/steam_app_metadata.csv')

In [14]:
# Keep only the first row seen for every appid.
games_df = games_df.loc[~games_df.duplicated(subset=['appid'], keep='first')]

In [15]:
# Rows whose name was already seen earlier in games_df (the first occurrence
# of each name is NOT included).
dupl_game_names = games_df.loc[games_df.duplicated(subset=['name'], keep='first')]

In [16]:
# Point playtime rows that reference a duplicate-named game at the appid of the
# first game carrying that name, so they still match a game once duplicate
# names are removed from games_df.
#
# Lookup table: game name -> appid of its first occurrence in games_df.
# Rows without a name are left out: a missing name never compares equal to
# anything, so it cannot be matched (the previous per-row lookup raised
# IndexError on such a row).
first_appid_by_name = (
    games_df.dropna(subset=['name'])
    .drop_duplicates(subset=['name'])
    .set_index('name')['appid']
)
named_dupl = dupl_game_names.dropna(subset=['name'])
# duplicate appid -> original appid
dupl_to_orig = dict(zip(named_dupl['appid'], named_dupl['name'].map(first_appid_by_name)))
# One hash-based pass over users_df instead of one full scan per duplicate.
needs_remap = users_df['appid'].isin(list(dupl_to_orig))
if needs_remap.any():
    users_df.loc[needs_remap, 'appid'] = (
        users_df.loc[needs_remap, 'appid'].map(dupl_to_orig).to_numpy()
    )

In [17]:
# Keep only the first game for every name.
games_df = games_df.loc[~games_df.duplicated(subset=['name'], keep='first')]

In [18]:
# Games whose description is missing or an empty string ...
no_description = games_df.loc[
    games_df['description'].isna() | games_df['description'].eq('')
]
# ... are removed from games_df by appid.
games_df = games_df.loc[~games_df['appid'].isin(no_description['appid'])]

In [19]:
# Games whose developer is the literal string 'None' AND whose publisher is
# missing ...
no_dev_or_pub = games_df.loc[
    games_df['developer'].eq('None') & games_df['publisher'].isna()
]
# ... are removed from games_df by appid.
games_df = games_df.loc[~games_df['appid'].isin(no_dev_or_pub['appid'])]

In [20]:
# Where the publisher is missing, fall back to the developer of the same row.
games_df['publisher'] = games_df['publisher'].where(
    games_df['publisher'].notna(), games_df['developer']
)

In [21]:
# Keep only the games that occur at least once in the playtime data.
games_df = games_df.loc[games_df['appid'].isin(users_df['appid'].drop_duplicates())]

In [22]:
# Number of games left after the cleaning steps above.
games_df.shape[0]

19818

#### One-hot encode categories and genres

In [23]:
# Both columns hold Python literals stored as text; parse each cell back into
# the object it describes (a list, per the original notebook comment).
for list_column in ('categories', 'genres'):
    games_df[list_column] = games_df[list_column].apply(ast.literal_eval)

In [24]:
# Binarizer shared by the two encoding cells below; each of them re-fits it.
# The arguments spell out the library defaults.
mlb = MultiLabelBinarizer(classes=None, sparse_output=False)

In [25]:
# Replace the 'categories' list column with one indicator column per label.
category_lists = games_df.pop('categories')  # removes the column in place
category_flags = pd.DataFrame(
    mlb.fit_transform(category_lists),
    index=games_df.index,
    columns=mlb.classes_,  # labels learned by the fit on the line above
)
games_df = games_df.join(category_flags)

In [26]:
# Replace the 'genres' list column with one indicator column per label.
genre_lists = games_df.pop('genres')  # removes the column in place
genre_flags = pd.DataFrame(
    mlb.fit_transform(genre_lists),
    index=games_df.index,
    columns=mlb.classes_,  # labels learned by the fit on the line above
)
games_df = games_df.join(genre_flags)

#### process description column NLP style

In [22]:
# Turn any missing description into an empty string. Probably redundant now:
# an earlier cell already removes games with a missing or empty description.
games_df['description'] = games_df['description'].where(
    games_df['description'].notna(), ''
)

In [27]:
# Stemmer instance; its only use in clean_text is currently commented out.
porter = PorterStemmer()
# English stopword list consumed by clean_text.
stopwords = nltk.corpus.stopwords.words(fileids='english')

In [28]:
# WordNet-based lemmatizer used by clean_text.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [29]:
# Fetch the WordNet corpus (no-op when it is already up to date).
nltk.download(info_or_id='wordnet')

[nltk_data] Downloading package wordnet to /home/berry/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [30]:
# function to clean text
def clean_text(text):
    """Normalise a raw game description into a string of lemmatised tokens.

    Steps, in order: markup tags and URLs are replaced by spaces,
    non-breaking spaces become ordinary spaces, every remaining non-ASCII
    character is deleted, punctuation and digits are deleted, the text is
    lower-cased and tokenised with NLTK, stopwords are filtered out and each
    surviving token is lemmatised.

    Relies on the notebook-level ``stopwords`` list and ``lemmatizer`` object.

    Args:
        text: Raw description string (may contain HTML-like tags and URLs).

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # Tags and URLs become a space so the words around them stay separate.
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'http\S+', ' ', text)
    # Convert non-breaking spaces first: the non-ASCII purge on the next line
    # would otherwise delete them and glue the neighbouring words together.
    text = text.replace(u'\xa0', u' ')
    # Delete every character outside the ASCII range.
    text = re.sub(r'[^\x00-\x7f]', r'', text)
    # Delete punctuation and digits (deleted, not replaced by a space).
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'[0-9]+', '', text)
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    # Hash-based membership test: avoids scanning the whole stopword list for
    # every token. Built per call so it always reflects the current list.
    stopword_set = set(stopwords)
    # Drop stopwords, then lemmatise what is left. Stemming with `porter` is
    # intentionally not applied.
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stopword_set]
    return ' '.join(tokens)

In [31]:
# Store the cleaned version of every description in a new column.
games_df['description_clean'] = games_df['description'].map(clean_text)

#### Save some dataframes and other stuff

In [None]:
# Build a dataframe of appid, name, developer and publisher for use in our recommendations.
# Columns are selected by position because 'name' may now also be a TF-IDF term.
# gameinfo_df = games_df.iloc[:, [0,1,3,4]]

In [None]:
# save gameinfo_df to csv
# gameinfo_df.to_csv('../data/steam_gameinfo.csv', index=False)

In [32]:
# The two-week playtime was only needed for filtering; remove the column.
users_df = users_df.drop('playtime_2weeks', axis='columns')

In [33]:
# User profiles keep the playtime of games that are no longer on Steam, so
# restrict the playtime rows to the games that are still present in games_df.
users_df = users_df.loc[users_df['appid'].isin(games_df['appid'])]

In [34]:
# Number of playtime rows left after all filtering.
users_df.shape[0]

2245369

In [35]:
# Write the cleaned playtime rows to disk, without the dataframe index.
users_df.to_csv(path_or_buf='../data/steam_playtime_clean.csv', index=False)

In [None]:
# drop name, developer and publisher from games_df
# games_df = games_df.drop(columns = ['name', 'developer', 'publisher', 'description'])

In [36]:
# Write the cleaned game metadata to disk, without the dataframe index.
games_df.to_csv(path_or_buf='../data/steam_app_metadata_clean.csv', index=False)