In [165]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold

from sklearn.linear_model import LogisticRegression

#### Fetching the data

In [166]:
df = pd.read_csv("data.csv")

In [167]:
# Irrelevant features
irr_fts = {
    "weapon": ["ak47", "ssg08", "fiveseven", "p250", "usps", "aug", "awp", "galilar", "glock", "cz75auto", "bizon", "elite", "famas", "g3sg1", "m249", "m4a1s", "m4a4", "mac10", "mag7", "mp5sd", "mp7", "mp9", "negev", "nova", "p90", "r8revolver", "sawedoff", "scar20", "sg553", "ump45", "xm1014", "deagle", "tec9", "p2000"],
    "grenade": ["hegrenade", "decoygrenade", "flashbang", "smokegrenade", "incendiarygrenade", "molotovgrenade"]
}

irr_cols = [] 

for ft_type, ft_names in irr_fts.items():
    for ft_name in ft_names:
        for side in ["ct", "t"]:
            value = f"{side}_{ft_type}_{ft_name}"
            irr_cols.append(value)

#### Remove irrelevant columns

In [168]:
df.drop(columns=irr_cols, inplace=True)

#### Convert features to suitable data types

In [169]:
le = LabelEncoder()
df["map"] = le.fit_transform(df["map"])
df["round_winner"] = le.fit_transform(df["round_winner"])
df["bomb_planted"] = df["bomb_planted"].astype(np.int16)

In [170]:
df.columns

Index(['time_left', 'ct_score', 't_score', 'map', 'bomb_planted', 'ct_health',
       't_health', 'ct_armor', 't_armor', 'ct_money', 't_money', 'ct_helmets',
       't_helmets', 'ct_defuse_kits', 'ct_players_alive', 't_players_alive',
       'round_winner'],
      dtype='object')

#### Prepare the data for the models

In [173]:
# Used features
features = ["ct_score", "t_score", "time_left", "ct_money", "t_money", "map", "ct_armor", "t_armor"]

# Round winner is the label (1 or 0)
X, y = df[features], df["round_winner"]

# K-fold (to prevent overfitting)
seed=42
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

In [172]:
for train_indices, val_indices in kf.split(X):
    X_train, X_val, y_train, y_val = X.iloc[train_indices], X.iloc[val_indices], y.iloc[train_indices], y.iloc[val_indices]
    # Logistic regression classifier
    clf_lr = LogisticRegression()
    clf_lr.fit(X_train, y_train)
    y_pred_train_lr = clf_lr.predict(X_train)
    acc_train_lr = clf_lr.score(X_train, y_train)
    accuracy_lr = clf_lr.score(X, y)

    ## validation
    y_pred_val_lr = clf_lr.predict(X_val)
    acc_val_lr = clf_lr.score(X_val, y_val)
    
    print(f"training accuracy: {acc_train_lr}")
    print(f"validation accuracy: {acc_val_lr}")
    print("--------------")


training accuracy: 0.5269342940470063
validation accuracy: 0.5200960690128419
--------------
training accuracy: 0.5227002585562514
validation accuracy: 0.5285640761708698
--------------
training accuracy: 0.5243300207090078
validation accuracy: 0.5253045119231429
--------------
