# Keras Hybrid model

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
from keras.models import Model
from keras.layers import Input, Dense, Concatenate

In [8]:
# import our processed datasets
users_df = pd.read_csv('../data/steam_playtime_clean.csv')
games_df = pd.read_csv('../data/steam_app_metadata_clean.csv')

### Scale playtimes

In [None]:
# use StandardScaler to scale user playtimes
# maybe not use it and use the normalizing later?
scaler = StandardScaler()
users_df['playtime_forever'] = scaler.fit_transform(users_df['playtime_forever'].values.reshape(-1, 1))

In [9]:
# Normalize the targets between 0 and 1. Makes it easy to train.
min_playtime = min(users_df['playtime_forever'])
max_playtime = max(users_df['playtime_forever'])
users_df['playtime_forever'] = users_df['playtime_forever'].apply(lambda x: (x - min_playtime) / (max_playtime - min_playtime)).values

### Set up data

In [12]:
# instantiate tfidfvectorizer
tfidf = TfidfVectorizer(max_features=1500, lowercase=False, min_df=5, ngram_range=(1,3))

In [13]:
# make description column into type string, otherwise tfidf cries
games_df['description_clean'] = games_df['description_clean'].astype(str)

In [14]:
# fit tfidfvectorizer to description column
tfidf_matrix = tfidf.fit_transform(games_df['description_clean'])

In [15]:
# games_df columns 5 to second last are the tags
game_tags = games_df.iloc[:, 5:-1]

In [16]:
game_tags

Unnamed: 0,Captions available,Co-op,Commentary available,Cross-Platform Multiplayer,Downloadable Content,Full controller support,Game demo,In-App Purchases,Includes Source SDK,Includes level editor,...,Short,Simulation,Software Training,Sports,Strategy,Tutorial,Utilities,Video Production,Violent,Web Publishing
0,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19813,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19814,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
19815,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
19816,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0


In [17]:
# make matrix into dataframe
tfidf_array = tfidf_matrix.toarray()

In [18]:
cb_values = np.concatenate((game_tags.values, tfidf_array), axis=1)

### Layers setup

In [36]:
n_features = cb_values.shape[1]
n_items = cb_values.shape[0]

In [None]:
# Extract the collaborative filtering data from the dataframe
y = users_df[['playtime_forever']].values

In [46]:
cf_user_ids = users_df[['steam_id']].values
cf_game_ids = users_df[['appid']].values

In [37]:
# Define the input layers for the collaborative filtering data
cf_user_input = Input(shape=(1,))
cf_game_input = Input(shape=(1,))
cf_input = Concatenate()([cf_user_input, cf_game_input])

In [38]:
# Define the input layer for the content-based data
cb_input_shape = (n_features,)
cb_input = Input(shape=cb_input_shape, name='cb_input')

In [39]:
# Define the layers for the collaborative filtering branch
cf_dense = Dense(64, activation='relu')(cf_input)
cf_output = Dense(1)(cf_dense)

In [40]:
# Define the layers for the content-based branch
cb_dense = Dense(64, activation='relu')(cb_input)

In [41]:
# Concatenate the output from the collaborative filtering branch and the content-based branch
merged = Concatenate()([cf_output, cb_dense])

In [42]:
# Define the output layer
output = Dense(1)(merged)

In [43]:
# Define the model with both the collaborative filtering and content-based inputs
model = Model(inputs=[cf_user_input, cf_game_input, cb_input], outputs=output)

In [44]:
# Compile the model with a suitable loss function and optimizer
model.compile(loss='mean_squared_error', optimizer='adam')

In [47]:
# Train the model using both the collaborative filtering and content-based data
model.fit([cf_user_ids, cf_game_ids, cb_values], y, epochs=10, batch_size=32, validation_split=0.2)

ValueError: Data cardinality is ambiguous:
  x sizes: 1796295, 1796295, 19818
  y sizes: 1796295
Make sure all arrays contain the same number of samples.