In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from leagueGetData import getLeagueData
from leagueDataCleaner import cleanLeagueData

# Data Preparation
In this section, we prepare the data for each player.

First, the players' personal information is retrieved from our database.

In [2]:
# Hard-coded sample of two players, one record per player. The column list is
# passed explicitly so the column order does not depend on the dict key order.
targets = pd.DataFrame(
    [
        {'pseudo': 'Katiounette', 'tagline': 'KITTY', 'region': 'europe', 'birthDate': '01/01/2006', 'sex': 1},
        {'pseudo': 'OhSS', 'tagline': '6121', 'region': 'europe', 'birthDate': '01/01/2006', 'sex': 0},
    ],
    columns=['pseudo', 'tagline', 'region', 'birthDate', 'sex'],
)
targets

Unnamed: 0,pseudo,tagline,region,birthDate,sex
0,Katiounette,KITTY,europe,01/01/2006,1
1,OhSS,6121,europe,01/01/2006,0


Then, we retrieve the game data for the players. For each player, the match list and the data for each match are saved in `data/matchLists` and `data/parsedData`. (The data path can be modified if needed.)

Here the data is retrieved for the first time and saved locally on the computer for future use.

In [3]:
# Retrieve the game data for every player listed in `targets`.
# The returned frame holds one row per player with the `puuid` next to
# pseudo / tagline / region (see the displayed table).
# On this first run the match lists and parsed data are saved locally; matches
# answered with a 404 are only reported in the log and are presumably left out
# of the parsed data -- TODO confirm in leagueGetData.
targetID = getLeagueData(targets)
targetID

Saved matchs list for Katiounette#KITTY
Error 404 for match EUW1_6343678419 for Katiounette#KITTY
Error 404 for match EUW1_6343624912 for Katiounette#KITTY
Error 404 for match EUW1_6343264517 for Katiounette#KITTY
Error 404 for match EUW1_6343219631 for Katiounette#KITTY
Saved parsed data for Katiounette#KITTY
Saved matchs list for OhSS#6121
Saved parsed data for OhSS#6121


Unnamed: 0,puuid,pseudo,tagline,region
0,p8bl0MvZHFp_3Xv0n-ct7HIqGxGD1eNKWbTEWeRkygFnIh...,Katiounette,KITTY,europe
1,XVHOCosPT0oX0q7PpWuFaj8Ll2bP9O7fscbCLRVx0tsNxK...,OhSS,6121,europe


Here, the same cell is run a second time: the data is already available locally on the computer, so it is read from disk instead of being retrieved again.

In [3]:
# Same call as on the first run. This time the log reports that the parsed
# data was found locally ("Got parsed data from os"), and the returned frame
# is identical: one row per player with its `puuid`.
targetID = getLeagueData(targets)
targetID

Got parsed data from os for Katiounette#KITTY
Got parsed data from os for OhSS#6121


Unnamed: 0,puuid,pseudo,tagline,region
0,p8bl0MvZHFp_3Xv0n-ct7HIqGxGD1eNKWbTEWeRkygFnIh...,Katiounette,KITTY,europe
1,XVHOCosPT0oX0q7PpWuFaj8Ll2bP9O7fscbCLRVx0tsNxK...,OhSS,6121,europe


We join two DataFrames to retrieve the complete data. One DataFrame contains the personal information of the players, the other DataFrame contains the player IDs. The join operation combines these DataFrames, enabling access to all necessary data in a unified structure.

In [4]:
# Attach each player's `puuid` to their personal information.
players = targets.merge(
    targetID,
    how='inner',  # pandas' default, spelled out: only players present in both frames are kept
    on=['pseudo', 'tagline', 'region'],
)
players

Unnamed: 0,pseudo,tagline,region,birthDate,sex,puuid
0,Katiounette,KITTY,europe,01/01/2006,1,p8bl0MvZHFp_3Xv0n-ct7HIqGxGD1eNKWbTEWeRkygFnIh...
1,OhSS,6121,europe,01/01/2006,0,XVHOCosPT0oX0q7PpWuFaj8Ll2bP9O7fscbCLRVx0tsNxK...


Finally, we retrieve the game data for each player and clean it to extract features, dividing them into two groups: under 18 and over 18. The cleaned game data is then concatenated with the player's personal information to create a comprehensive dataset.

In [5]:
# Turn each player's game data into feature rows and join them with the
# player's personal information.
# The displayed result has two rows per player; presumably one per age group
# (under_18 / over_18) -- TODO confirm in leagueDataCleaner.
cleanData = cleanLeagueData(players)
cleanData

Unnamed: 0,pseudo,tagline,region,birthDate,sex,puuid,championPref,championCount,hourMostFreq,dayMostFreq,...,gameMode_SIEGE,gameMode_ASSASSINATE,gameMode_ARSR,gameMode_DARKSTAR,gameMode_STARGUARDIAN,gameMode_PROJECT,gameMode_GAMEMODEX,gameMode_ODYSSEY,gameMode_NEXUSBLITZ,gameMode_ULTBOOK
0,Katiounette,KITTY,europe,01/01/2006,1,p8bl0MvZHFp_3Xv0n-ct7HIqGxGD1eNKWbTEWeRkygFnIh...,21,9,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Katiounette,KITTY,europe,01/01/2006,1,p8bl0MvZHFp_3Xv0n-ct7HIqGxGD1eNKWbTEWeRkygFnIh...,21,3,2.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,OhSS,6121,europe,01/01/2006,0,XVHOCosPT0oX0q7PpWuFaj8Ll2bP9O7fscbCLRVx0tsNxK...,101,70,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,OhSS,6121,europe,01/01/2006,0,XVHOCosPT0oX0q7PpWuFaj8Ll2bP9O7fscbCLRVx0tsNxK...,115,81,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Test Classification
The results are not representative: the dataset is very small, and the birth dates are not the players' real ones (they were chosen so that both under_18 and over_18 data are available). These tests will be conducted on larger, real-world datasets in the future.

First, we prepare the data to ensure it is ready to be fitted into the models.

In [6]:
# Keep only the model inputs and the two targets: the identifying columns and
# the raw birth date are removed.
data = cleanData.drop(columns=['pseudo', 'tagline', 'region', 'birthDate', 'puuid'])
data

Unnamed: 0,sex,championPref,championCount,hourMostFreq,dayMostFreq,monday,tuesday,wednesday,thursday,friday,...,gameMode_SIEGE,gameMode_ASSASSINATE,gameMode_ARSR,gameMode_DARKSTAR,gameMode_STARGUARDIAN,gameMode_PROJECT,gameMode_GAMEMODEX,gameMode_ODYSSEY,gameMode_NEXUSBLITZ,gameMode_ULTBOOK
0,1,21,9,2.0,0.0,0.189189,0.135135,0.135135,0.162162,0.121622,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,21,3,2.0,4.0,0.0,0.0,0.0,0.111111,0.444444,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,101,70,2.0,0.0,0.231579,0.105263,0.133333,0.101754,0.101754,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,115,81,2.0,2.0,0.15544,0.163212,0.165803,0.121762,0.129534,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Encode the age category as an integer label (under_18 -> 0, over_18 -> 1).
# The source column is read from `cleanData`, which no cell modifies after it
# is built, so re-running this cell always gives the same result. Mapping
# `data['ageCategory']` onto itself would turn the already-encoded 0/1 values
# into NaN on a second run.
age_codes = cleanData['ageCategory'].map({'under_18': 0, 'over_18': 1})

# Fail loudly on a category the mapping does not know instead of letting NaN
# labels reach the classifier.
if age_codes.isna().any():
    unknown = sorted(cleanData.loc[age_codes.isna(), 'ageCategory'].astype(str).unique())
    raise ValueError(f"Unexpected ageCategory value(s): {unknown}")

data['ageCategory'] = age_codes.astype('int64')

In [8]:
# Features: every column except the two prediction targets.
X = data.drop(columns=['sex', 'ageCategory'])

# Targets as independent 1-D arrays, one entry per row of X.
y_age = data['ageCategory'].to_numpy(copy=True)
y_sex = data['sex'].to_numpy(copy=True)

In [9]:
# Display the feature matrix: `sex` and `ageCategory` must no longer be among the columns.
X

Unnamed: 0,championPref,championCount,hourMostFreq,dayMostFreq,monday,tuesday,wednesday,thursday,friday,saturday,...,gameMode_SIEGE,gameMode_ASSASSINATE,gameMode_ARSR,gameMode_DARKSTAR,gameMode_STARGUARDIAN,gameMode_PROJECT,gameMode_GAMEMODEX,gameMode_ODYSSEY,gameMode_NEXUSBLITZ,gameMode_ULTBOOK
0,21,9,2.0,0.0,0.189189,0.135135,0.135135,0.162162,0.121622,0.081081,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,21,3,2.0,4.0,0.0,0.0,0.0,0.111111,0.444444,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,101,70,2.0,0.0,0.231579,0.105263,0.133333,0.101754,0.101754,0.161404,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,115,81,2.0,2.0,0.15544,0.163212,0.165803,0.121762,0.129534,0.124352,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Display the age labels (0 = under_18, 1 = over_18), one per row of X.
y_age

array([0, 1, 0, 1], dtype=int64)

In [11]:
# Display the sex labels as stored in the `sex` column, one per row of X.
y_sex

array([1, 1, 0, 0], dtype=int64)

The first test is to predict the age category.

In [12]:
def evaluate_custom_splits(features, labels, splits, make_classifier=None):
    """Fit and score a fresh classifier on each hand-picked train/test split.

    The per-split report (true labels, predictions, accuracy) is printed as a
    side effect. The helper is target-agnostic, so it can be reused for any
    label array aligned with ``features``.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature matrix; rows are selected by position with ``.iloc``.
    labels : numpy.ndarray
        1-D label array, positionally aligned with ``features``.
    splits : list of dict
        Each dict holds two lists of row positions under the keys
        ``"train"`` and ``"test"``.
    make_classifier : callable, optional
        Zero-argument factory returning an unfitted estimator exposing
        ``fit`` and ``predict``. Defaults to ``GaussianNB``.

    Returns
    -------
    list of float
        Accuracy on the test rows of each split, in split order.

    Raises
    ------
    ValueError
        If a split uses the same row for both training and testing.
    """
    if make_classifier is None:
        make_classifier = GaussianNB

    accuracies = []
    for split_number, split in enumerate(splits, start=1):
        train_rows, test_rows = split["train"], split["test"]

        # A row used for both fitting and scoring would inflate the accuracy.
        shared_rows = sorted(set(train_rows) & set(test_rows))
        if shared_rows:
            raise ValueError(
                f"Split {split_number} uses rows {shared_rows} for both training and testing"
            )

        # A new model per split, so nothing learned on one split carries over.
        clf = make_classifier()
        clf.fit(features.iloc[train_rows], labels[train_rows])

        y_test = labels[test_rows]
        y_pred = clf.predict(features.iloc[test_rows])

        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)

        # Print split results
        print(f"Split {split_number}:")
        print(f"  True Labels: {y_test}")
        print(f"  Predictions: {y_pred}")
        print(f"  Accuracy: {accuracy * 100:.2f}%\n")

    return accuracies


# Custom splits for age: with y_age = [0, 1, 0, 1], every training pair below
# contains one row of each age category.
splits = [
    {"test": [0, 1], "train": [2, 3]},
    {"test": [0, 3], "train": [2, 1]},
    {"test": [2, 1], "train": [0, 3]},
    {"test": [2, 3], "train": [0, 1]},
]

accuracies = evaluate_custom_splits(X, y_age, splits)

# Compute mean accuracy
mean_accuracy = np.mean(accuracies)
print(f"Mean Accuracy Across Splits: {mean_accuracy * 100:.2f}%")

Split 1:
  True Labels: [0 1]
  Predictions: [1 1]
  Accuracy: 50.00%

Split 2:
  True Labels: [0 1]
  Predictions: [1 0]
  Accuracy: 0.00%

Split 3:
  True Labels: [0 1]
  Predictions: [1 0]
  Accuracy: 0.00%

Split 4:
  True Labels: [0 1]
  Predictions: [0 0]
  Accuracy: 50.00%

Mean Accuracy Across Splits: 25.00%


The second test is to predict the sex.

In [13]:
# Hand-picked splits for the sex target: with y_sex = [1, 1, 0, 0], every
# training pair below contains one row of each sex label.
splits = [
    dict(test=[0, 2], train=[1, 3]),
    dict(test=[0, 3], train=[2, 1]),
    dict(test=[2, 1], train=[0, 3]),
    dict(test=[1, 3], train=[0, 2]),
]

accuracies = []

for split_number, split in enumerate(splits, start=1):
    train_rows = split["train"]
    test_rows = split["test"]

    # Select the rows of this split by position.
    X_train = X.iloc[train_rows]
    X_test = X.iloc[test_rows]
    y_train = y_sex[train_rows]
    y_test = y_sex[test_rows]

    # Train a fresh model on the training rows, then predict the held-out rows.
    clf = GaussianNB().fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Score this split and keep the result for the final average.
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

    # Report for this split.
    print(f"Split {split_number}:")
    print(f"  True Labels: {y_test}")
    print(f"  Predictions: {y_pred}")
    print(f"  Accuracy: {accuracy * 100:.2f}%\n")

# Average accuracy over all splits.
mean_accuracy = np.mean(accuracies)
print(f"Mean Accuracy Across Splits: {mean_accuracy * 100:.2f}%")

Split 1:
  True Labels: [1 0]
  Predictions: [1 0]
  Accuracy: 100.00%

Split 2:
  True Labels: [1 0]
  Predictions: [1 0]
  Accuracy: 100.00%

Split 3:
  True Labels: [0 1]
  Predictions: [0 1]
  Accuracy: 100.00%

Split 4:
  True Labels: [1 0]
  Predictions: [1 0]
  Accuracy: 100.00%

Mean Accuracy Across Splits: 100.00%
