In [73]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# For preprocessing and splitting data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# For models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

# For evaluating models
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, r2_score

# For deep learning (TensorFlow/Keras)
import tensorflow as tf
from tensorflow import keras

# xgboost
import xgboost as xgb

In [74]:
# Load data
df = pd.read_csv('df_s.csv')
X = pd.read_csv('X.csv')
y = pd.read_csv('y.csv')

In [75]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119733 entries, 0 to 119732
Data columns (total 40 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   year                      119733 non-null  int64  
 1   playoffs                  119733 non-null  int64  
 2   game                      119733 non-null  float64
 3   patch                     119733 non-null  float64
 4   participantid             119733 non-null  int64  
 5   gamelength                119733 non-null  int64  
 6   result                    119733 non-null  int64  
 7   kills                     119733 non-null  int64  
 8   deaths                    119733 non-null  int64  
 9   assists                   119733 non-null  int64  
 10  teamkills                 119733 non-null  int64  
 11  teamdeaths                119733 non-null  int64  
 12  team kpm                  119733 non-null  float64
 13  ckpm                      119733 non-null  f

In [76]:
y = df['Total']
y

0          75
1          73
2         157
3           6
4         289
         ... 
119728     18
119729      1
119730     18
119731      1
119732     18
Name: Total, Length: 119733, dtype: int64

In [78]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119733 entries, 0 to 119732
Data columns (total 40 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   year                      119733 non-null  int64  
 1   playoffs                  119733 non-null  int64  
 2   game                      119733 non-null  float64
 3   patch                     119733 non-null  float64
 4   participantid             119733 non-null  int64  
 5   gamelength                119733 non-null  int64  
 6   result                    119733 non-null  int64  
 7   kills                     119733 non-null  int64  
 8   deaths                    119733 non-null  int64  
 9   assists                   119733 non-null  int64  
 10  teamkills                 119733 non-null  int64  
 11  teamdeaths                119733 non-null  int64  
 12  team kpm                  119733 non-null  float64
 13  ckpm                      119733 non-null  f

In [79]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Normalize the features
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)
X.describe()

Unnamed: 0,year,playoffs,game,patch,participantid,gamelength,result,kills,deaths,assists,...,goldspent,total cs,minionkills,monsterkills,cspm,side_Red,position_jng,position_mid,position_sup,position_top
count,119733.0,119733.0,119733.0,119733.0,119733.0,119733.0,119733.0,119733.0,119733.0,119733.0,...,119733.0,119733.0,119733.0,119733.0,119733.0,119733.0,119733.0,119733.0,119733.0,119733.0
mean,-4.405694e-16,-7.596024000000001e-17,7.596024000000001e-18,-5.317217e-16,1.875268e-17,5.697018000000001e-17,-9.676029e-17,6.148032000000001e-17,9.305129e-17,3.0384090000000003e-17,...,1.709105e-16,-5.3172170000000007e-17,-7.833399e-17,2.5161830000000002e-17,-1.514457e-16,4.830596e-17,-1.47173e-17,1.020716e-17,-3.180835e-17,3.5368990000000005e-17
std,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,...,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004
min,-1.900724,-0.4456058,-0.7426331,-2.215658,-1.565549,-2.731714,-1.114461,-1.034378,-1.320102,-1.462309,...,-2.48852,-1.820225,-1.36102,-0.7463847,-1.982507,-0.9963236,-0.5309007,-0.4788176,-0.4983735,-0.4865646
25%,-0.8132323,-0.4456058,-0.7426331,-0.7951401,-0.8680857,-0.7026438,-1.114461,-0.6394648,-0.7622904,-0.7490495,...,-0.7148216,-0.6774222,-1.076917,-0.673969,-0.6811598,-0.9963236,-0.5309007,-0.4788176,-0.4983735,-0.4865646
50%,-0.08823784,-0.4456058,-0.7426331,-0.06150534,-0.1706218,-0.1479513,0.8972948,-0.2445513,-0.2044791,-0.03579015,...,-0.0807848,0.1066904,0.2591374,-0.4567218,0.2970584,-0.9963236,-0.5309007,-0.4788176,-0.4983735,-0.4865646
75%,0.6367566,-0.4456058,0.3108566,0.6757257,0.8755739,0.5556199,0.8972948,0.5452757,0.3533322,0.6774692,...,0.5928077,0.6989457,0.8043089,0.1226039,0.7905216,1.00369,-0.5309007,-0.4788176,-0.4983735,-0.4865646
max,2.086746,2.244136,3.471326,1.761793,1.573038,8.719926,0.8972948,6.863892,5.931445,9.236582,...,16.83097,10.40025,9.181522,5.535679,2.804765,1.00369,1.883591,2.088478,2.006527,2.055226


In [80]:
# drop correlated feautures
X_corr = X.corr()
corr_names = set()
for i in range(len(X_corr .columns)):
    for j in range(i):
        if abs(X_corr.iloc[i, j]) > 0.8:
            col = X_corr.columns[i]
            corr_names.add(col)

X.drop(columns=corr_names,inplace=True)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119733 entries, 0 to 119732
Data columns (total 23 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   year                      119733 non-null  float64
 1   playoffs                  119733 non-null  float64
 2   game                      119733 non-null  float64
 3   participantid             119733 non-null  float64
 4   gamelength                119733 non-null  float64
 5   result                    119733 non-null  float64
 6   kills                     119733 non-null  float64
 7   deaths                    119733 non-null  float64
 8   assists                   119733 non-null  float64
 9   teamkills                 119733 non-null  float64
 10  teamdeaths                119733 non-null  float64
 11  ckpm                      119733 non-null  float64
 12  damagetochampions         119733 non-null  float64
 13  damagetakenperminute      119733 non-null  f

In [82]:
import statsmodels.api as sm
import pandas as pd
from sklearn.metrics import accuracy_score

# Fit Logistic Regression model
X_logit = X.drop(columns=['year', 'game', 'participantid'])
ols = sm.Logit(y, X_logit.astype('float'))
result = ols.fit()

# Predict probabilities
y_pred_prob = result.predict(X_logit.astype('float'))

# Convert probabilities to binary predictions (0 or 1)
y_pred = (y_pred_prob >= 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y, y_pred)

# Print the accuracy
print(f'Accuracy: {accuracy:.4f}')


ValueError: endog must be in the unit interval.