In [1]:
# Package imports
import nle.dataset as nld
from nle.nethack import tty_render
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from nle.dataset import db

In [2]:
# Database path
# Define data set in separate jupyter notebook, then write it into to this notebook
nld_nao_path = "/code/nld-nao/nld-nao-unzipped"

dbfilename = "nld-nao.db"

dataset_name = "nld-nao"

db_conn = nld.db.connect(filename=dbfilename)

print(f"AltOrg Dataset has {nld.db.count_games('nld-nao', conn=db_conn)} games.")

AltOrg Dataset has 1511228 games.


In [3]:
# random sample of 1000 players and all their corresponding games
# select a distinct set of players names from the db and shuffle them randomly
# limits to 1000 players
# selects all corresponding games for the randomly selected players

random = """WITH random_players AS (SELECT DISTINCT name FROM games ORDER BY RANDOM() LIMIT 1000)
SELECT g.*
FROM games g
JOIN random_players rp ON g.name = rp.name"""


df = pd.read_sql(random, db_conn)  

In [16]:
# TODO: Add to main.py for data analysis pipeline
def bin_players(df, metric, method='quantile', thresholds=None):
    
    # Calculate player-level performance metrics
    player_metrics = df.groupby('name')[metric].mean().reset_index()
                
    if method == 'quantile':
        # Use quantiles to determine thresholds
        lower_bound = df[metric].quantile(1/3)
        upper_bound = df[metric].quantile(2/3)
    
        # Categorize players based on thresholds
        beginners = player_metrics[player_metrics[metric] <= lower_bound]['name']
        intermediates = player_metrics[(player_metrics[metric] > lower_bound) & (player_metrics[metric] <= upper_bound)]['name']
        advanced = player_metrics[player_metrics[metric] > upper_bound]['name']
        
    elif method == 'thresholds' and thresholds is not None:
        lower_bound, upper_bound = thresholds
    else:
        raise ValueError("Invalid method or thresholds not provided")
    
    # Add more elif statements to include alternative forms of binning 

    # # Optionally add a tier column to each DataFrame
    # beginners['tier'] = 'Beginner'
    # intermediates['tier'] = 'Intermediate'
    # advanced['tier'] = 'Advanced'

    return beginners, intermediates, advanced


beginners, intermediates, advanced = bin_players(df, metric='points')

print("Beginners:\n", beginners)
print("\nIntermediates:\n", intermediates)
print("\nAdvanced:\n", advanced)

Beginners:
 4      Player10243
5      Player10259
6      Player10283
8      Player10376
13      Player1051
          ...     
991     Player9725
993     Player9764
996     Player9816
997     Player9835
999     Player9928
Name: name, Length: 362, dtype: object

Intermediates:
 0      Player10102
7      Player10303
14     Player10537
16     Player10553
17     Player10596
          ...     
985     Player9398
988     Player9643
994     Player9782
995     Player9810
998     Player9870
Name: name, Length: 330, dtype: object

Advanced:
 1      Player10182
2      Player10211
3      Player10220
9      Player10382
10     Player10383
          ...     
974     Player9132
976      Player915
986     Player9486
987      Player955
992     Player9745
Name: name, Length: 308, dtype: object


In [4]:
# Define window size
window_size = 3
window_features = pd.DataFrame()

def filter_after_ascension(group):
    if 'ascended' in group['death'].values:
        # Find the index of the first "ascended" death
        first_ascended_index = group[group['death'] == 'ascended'].index[0]
        # Filter rows up to and including the first "ascended" death
        return group.loc[:first_ascended_index]
    else:
        # If "ascended" does not exist, return the entire group
        return group

# Apply the function to each group of players
filtered_df = df.groupby('name', group_keys=False).apply(filter_after_ascension)


# Create the 'games_played' feature
filtered_df['games_played'] = filtered_df.groupby('name').cumcount() + 1

# Rolling features for each player
for name, group in filtered_df.groupby('name'):
    # Rolling Mean Features
    group['rolling_mean_points'] = group['points'].rolling(window=window_size, min_periods=1).mean()
    group['rolling_mean_turns'] = group['turns'].rolling(window=window_size, min_periods=1).mean()
    group['rolling_mean_maxlvl'] = group['maxlvl'].rolling(window=window_size, min_periods=1).mean()

    # Rolling Standard Deviation Features
    group['rolling_std_points'] = group['points'].rolling(window=window_size, min_periods=1).std().fillna(0)
    group['rolling_std_turns'] = group['turns'].rolling(window=window_size, min_periods=1).std().fillna(0)
    group['rolling_std_maxlvl'] = group['maxlvl'].rolling(window=window_size, min_periods=1).std().fillna(0)

    # Cumulative Sum Features
    group['cumulative_sum_points'] = group['points'].cumsum()
    group['cumulative_sum_turns'] = group['turns'].cumsum()
    group['cumulative_sum_maxlvl'] = group['maxlvl'].cumsum()

    # Expanding Mean Features
    group['expanding_mean_points'] = group['points'].expanding().mean()
    group['expanding_mean_turns'] = group['turns'].expanding().mean()
    group['expanding_mean_maxlvl'] = group['maxlvl'].expanding().mean()

    # Append the group with new features to the window_features DataFrame
    window_features = pd.concat([window_features, group])

# Reset index if needed
window_features.reset_index(drop=True, inplace=True)

# Display the updated DataFrame with window features
print(window_features.head())

  filtered_df = df.groupby('name', group_keys=False).apply(filter_after_ascension)


    gameid version  points  deathdnum  deathlev  maxlvl  hp  maxhp  deaths  \
0  1919519   3.6.1     421          0         3       3   0     18       1   
1  1919520   3.6.1     750          0         4       5 -10     29       1   
2  1919607   3.6.1     161          0         1       1  -3     13       1   
3  2594326   3.6.0     166          0         2       2  -5     16       1   
4  2594327   3.6.0     154          0         2       2  12     12       0   

   deathdate  ...  rolling_mean_maxlvl  rolling_std_points rolling_std_turns  \
0   20170420  ...             3.000000            0.000000          0.000000   
1   20170420  ...             4.000000          232.638131        225.567063   
2   20170421  ...             3.000000          295.172831        450.906864   
3   20170418  ...             2.666667          338.625161        549.375100   
4   20170418  ...             1.666667            6.027714        626.194059   

  rolling_std_maxlvl cumulative_sum_points cumulat

In [5]:
window_features

Unnamed: 0,gameid,version,points,deathdnum,deathlev,maxlvl,hp,maxhp,deaths,deathdate,...,rolling_mean_maxlvl,rolling_std_points,rolling_std_turns,rolling_std_maxlvl,cumulative_sum_points,cumulative_sum_turns,cumulative_sum_maxlvl,expanding_mean_points,expanding_mean_turns,expanding_mean_maxlvl
0,1919519,3.6.1,421,0,3,3,0,18,1,20170420,...,3.000000,0.000000,0.000000,0.000000,421,933,3,421.000000,933.000000,3.000000
1,1919520,3.6.1,750,0,4,5,-10,29,1,20170420,...,4.000000,232.638131,225.567063,1.414214,1171,2185,8,585.500000,1092.500000,4.000000
2,1919607,3.6.1,161,0,1,1,-3,13,1,20170421,...,3.000000,295.172831,450.906864,2.000000,1332,2547,9,444.000000,849.000000,3.000000
3,2594326,3.6.0,166,0,2,2,-5,16,1,20170418,...,2.666667,338.625161,549.375100,2.081666,1498,3912,11,374.500000,978.000000,2.750000
4,2594327,3.6.0,154,0,2,2,12,12,0,20170418,...,1.666667,6.027714,626.194059,0.577350,1652,4126,13,330.400000,825.200000,2.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22425,2130387,3.6.1,182,0,4,4,-8,12,1,20181225,...,6.000000,2819.936406,1866.520828,2.000000,6804,9489,26,1360.800000,1897.800000,5.200000
22426,2130390,3.6.1,81,0,2,2,-6,12,1,20181225,...,4.666667,299.032328,2148.510259,3.055050,6885,9665,28,1147.500000,1610.833333,4.666667
22427,2888780,3.6.0,11,0,1,1,12,12,0,20180317,...,2.333333,85.967048,300.800598,1.527525,6896,9710,29,985.142857,1387.142857,4.142857
22428,2888820,3.6.0,843,0,7,7,0,36,1,20180317,...,3.333333,461.477338,1108.428768,3.214550,7739,11737,36,967.375000,1467.125000,4.500000


In [19]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

window_features['ascended'] = np.where(window_features['death'] == 'ascended', 1, 0)


features = [
    'rolling_mean_points', 'rolling_mean_turns', 'rolling_mean_maxlvl',
    'rolling_std_points', 'rolling_std_turns', 'rolling_std_maxlvl',
    'cumulative_sum_points', 'cumulative_sum_turns', 'cumulative_sum_maxlvl',
    'expanding_mean_points', 'expanding_mean_turns', 'expanding_mean_maxlvl'
]

X = window_features[features]
y = window_features['ascended']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Feature scaling
    ('classifier', RandomForestClassifier(random_state=42))  # Random Forest Classifier
])

# Fit the model
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred_proba))

Accuracy: 1.0
Confusion Matrix:
 [[3752    0]
 [   0   11]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3752
           1       1.00      1.00      1.00        11

    accuracy                           1.00      3763
   macro avg       1.00      1.00      1.00      3763
weighted avg       1.00      1.00      1.00      3763

ROC AUC Score: 1.0
