In [1]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
import datetime
import matplotlib
import matplotlib.pyplot as plt

In [2]:
#import single game features made in
import pickle
with open('X.pickle', 'rb') as handle:
    X = pickle.load(handle)
with open('Y.pickle', 'rb') as handle:
    Y = pickle.load(handle)

## Training the ML Model

The approach to generating rankings for each month will be to create pairwise comparisons between games each month. Approach based on __[RankSVM](www.cs.cornell.edu/~tj/publications/joachims_02c.pdf)__. The new features are the differences of the features of the two game then it becomes a classification problem: will the first game rank better or the second. I won't allow for ties since there are no ties in the actual rankings and I won't include comparisons between 2 unranked games in the training data since the games could have actually sold dramatically differently but I don't have the data for that.

The following creates X_new and Y_new which now just an ordered lists containing the feature vector (for X_new) and the comparison result (for Y_new) for a pair of games in some month; index_values is another ordered list which keeps track of the month, 1st game and 2nd game indices for X_new and Y_new.

In [3]:
#doing pairwise comparison, ignores ties for unranked games
(l,m,n) = X.shape
X_new = []
Y_new = []
index_values = [] #saves the month and game indicies
for i in range(0,48): #month range
    for j in range(0,m): #1st game
        r1 = Y[i,j]
        if X[i,j,0]>-1 and list(X[i,j,1:5]) != [0,0,0,0]: #skip if game has not come out yet, or game with 0 relative search history
            for k in range(j+1,m): #looping over 2nd game
                if X[i,k,0]>-1 and list(X[i,k,1:5]) != [0,0,0,0]:
                    r2 = Y[i,k]
                    if r1>r2:
                        y_pair = +1
                        x_pair = list(X[i,j] - X[i,k])
                        Y_new.append(y_pair)
                        X_new.append(x_pair)
                        index_values.append([i,j,k])
                    if r1<r2:
                        y_pair = -1
                        x_pair = list(X[i,j] - X[i,k])
                        Y_new.append(y_pair)
                        X_new.append(x_pair)
                        index_values.append([i,j,k])

#y = -1 if game 1 ranked better, y = +1 if game 2 ranked better
#X_new and Y_new are now just an ordered list containing the feature vector (for X_new) and the comparison result (for Y_new) for a pair of 
#games in some month. 

                
                    
            
                
            

Instead of letting sklearn using built in function to split between train and test sets, I will manually split by month so I can later predict top 10 games of a month that isn't in the training set.

In [4]:
np.random.seed(1)
PRNG = np.random.rand(l)
X_train = []
Y_train = []
index_values_train = []
X_test = []
Y_test = []
index_values_test = []
j = 0
k = 0
month_index = 0
for i in range(0,l):
    r = PRNG[i]
    if r<=0.66:
        while month_index == i and k < len(index_values):
            X_train.append(X_new[k])
            Y_train.append(Y_new[k])
            index_values_train.append(index_values[k])
            k = k+1
            try:
                month_index = index_values[k][0] 
            except:
                break
    else:
        while month_index == i and k < len(index_values):
            X_test.append(X_new[k])
            Y_test.append(Y_new[k])
            index_values_test.append(index_values[k])
            k = k+1 
            try:
                month_index = index_values[k][0] 
            except:
                pass
        
    

In [6]:
clf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf')
clf.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [None]:
#tuned_parameters = [{'kernel':['rbf'],'gamma':[1e-3,1e-4],'C':[1,10,100,1000]}]
#clf = GridSearchCV(svm.SVC(),tuned_parameters,cv=3,scoring = 'f1')
#clf.fit(X_train, Y_train)
#predictions = clf.predict(X_test)

In [7]:
predictions = clf.predict(X_test)

In [10]:
from sklearn.metrics import f1_score
print("test f1:", str(f1_score(Y_test, predictions, average='macro')))
from sklearn.metrics import precision_score
print("test precision:", str(precision_score(Y_test, predictions, average='macro')))

test f1: 0.9669968469439774
test precision: 0.9685386513645879


In [11]:
#pick out top 10 then sort
import pickle
with open('clf.pickle','wb') as handle:
    pickle.dump(clf,handle,protocol=pickle.HIGHEST_PROTOCOL)
    
with open('predictions.pickle','wb') as handle:
    pickle.dump(predictions,handle,protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
with open('gamelist.pickle', 'rb') as handle:
    gamelist = pickle.load(handle)

## Bradley-Terry Model
From paired comparisons, I will use the Bradley-Terry model to generate rankings. The model estimates the probability, $P(i>j)$ of a paired comparison based on $\pi_i$, a numerical value representing "value" (or "skill" or whatever is appropriate for the context of the problem) of each game. The ordering of $\pi_i$'s then ranks all the games. Below I use a simple algorithm built upon the Bradley-Terry Model to estimate the $\pi_i$'s based on the outcomes of a series of pairwise comparisons.

The Bradley-Terry model and algorithm are explained __[here](https://projecteuclid.org/euclid.aos/1079120141)__.

In [13]:
def BTmodel (indices, Y, n, loops): #seems to rank known ones most accurately when looped multiple times
    p = 100*np.ones(n)/n
    for k in range(0,loops):
        for j in range(0,len(Y)):
            game1_index = indices[j][1]
            game2_index = indices[j][2]
            if Y[j] == -1: #game 1 wins
                p[game1_index] = (p[game2_index]+p[game1_index])
            else: #game 2 wins
                p[game2_index] = (p[game2_index]+p[game1_index])
            p = 100*p/p.sum() #renormalize
    return p

#def BTmodel (indices, Y, n, loops): #seems to do better when looped once
#    p = 100*np.ones(n)/n
#    for k in range(0,loops):
#        j = 0
#        for i in range(0,n):
#            W = 0
#            s = 0
#            game1_index = i
#            while game1_index == i and j < len(Y):
#                game2_index = indices[j][2]
#                if Y[j] == -1:
#                    W = W+1
#                    s = s+ 1/(p[game2_index]+p[game1_index])
#                else:
#                    s = s+ 1/(p[game2_index]+p[game1_index])
#                j = j+1
#                try:
#                    game1_index = indices[j][1]
#                except:
#                    pass
#                if s != 0:
#                    p[i] = W/s
#                    p = 100*p/p.sum() #renormalize
#    return p
    

## Finding the rankings for test months

In [16]:
count = 0
month = 1
while month == 1:
    count = count +1
    month = index_values_test[count][0]

indices_test = index_values_test[0:11417]
yfortest = predictions[0:11417]
#yfortest = Y_test[0:16516] 
    

In [17]:
p = BTmodel(indices_test,yfortest,len(gamelist),10)

In [18]:
dtype = [('game','S29'),('p',float)]
q = []
for i in range(0,len(p)):
    q.append((gamelist[i],p[i]))
qarray = np.array(q,dtype = dtype)
qsort = np.sort(qarray, order = 'p')

In [26]:
#Prediction of Top 20 Games for September 2018
list(qsort[::-1][0:21])

[(b'Shadow Of The Tomb Raider', 22.00999884),
 (b'FIFA 19', 12.51490614),
 (b'Destiny 2: Forsaken', 12.21133606),
 (b'Madden NFL 19', 12.1263797),
 (b'Fe', 6.24261539),
 (b'NHL 19', 6.19338533),
 (b'Naruto To Boruto: Shinobi Str', 5.69904555),
 (b"Tom Clancy's Rainbow Six: Sie", 5.46240467),
 (b'Grand Theft Auto V', 2.80175578),
 (b'Journey', 1.44235394),
 (b'Ark: Survival Evolved', 1.1896242),
 (b'Minecraft', 0.94522078),
 (b'Super Mario Odyssey', 0.63761211),
 (b'Strange Brigade', 0.60475247),
 (b'God of War', 0.60440303),
 (b'Enter the Gungeon', 0.60140172),
 (b'Far Cry 5', 0.57141972),
 (b'Mega Man 11', 0.57101923),
 (b'Detroit: Become Human', 0.51814311),
 (b'Super Mario Party', 0.48776623),
 (b'Kingdom Come: Deliverance', 0.4774594)]

In [22]:
count = 11416
month = 11
while month == 11:
    count = count +1
    month = index_values_test[count][0]
indices_test2 = index_values_test[11416:23444]
yfortest2 = predictions[11416:23444]
#yfortest2 = Y_test[16533:32493]

In [23]:
p2 = BTmodel(indices_test2,yfortest2,len(gamelist),35) #1 loop works best?

dtype = [('game','S29'),('p',float)]
q2 = []
for i in range(0,len(p)):
    q2.append((gamelist[i],p2[i]))
qarray2 = np.array(q2,dtype = dtype)
qsort2 = np.sort(qarray2, order = 'p')

In [27]:
#Prediction of Top 20 Games for August 2017
list(qsort2[::-1][0:21])

[(b"Tom Clancy's Rainbow Six: Sie", 5.10358297),
 (b'Agents of Mayhem', 4.30978915),
 (b"Tom Clancy's Rainbow Six Sieg", 4.10333301),
 (b"Hellblade: Senua's Sacrifice", 4.08659091),
 (b'The Surge', 4.03157331),
 (b'Valkyria Revolution', 3.97153789),
 (b'Sonic Mania', 3.93884321),
 (b"Everybody's Golf", 3.89017056),
 (b'Yakuza Kiwami', 3.86680679),
 (b'Ultra Street Fighter II: The ', 3.86530358),
 (b'Battlefield 1', 3.85680572),
 (b'Prey', 3.73670024),
 (b'Star Trek: Bridge Crew', 3.73085911),
 (b'Pyre', 3.71940178),
 (b'Gravity Rush 2', 3.70234016),
 (b'Tales of Berseria', 3.69649931),
 (b'DiRT 4', 3.66196023),
 (b'Rocket League', 3.6378944),
 (b'NieR: Automata', 3.61833506),
 (b'F1 2017', 3.59503253),
 (b'Minecraft', 3.44645506)]

In [25]:
with open('X_new.pickle','wb') as handle:
    pickle.dump(X_new,handle,protocol=pickle.HIGHEST_PROTOCOL)
    
with open('Y_new.pickle','wb') as handle:
    pickle.dump(Y_new,handle,protocol=pickle.HIGHEST_PROTOCOL)
