In [1]:
import pandas as pd
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from warnings import simplefilter

# Get input for model training and validation
data_set = pd.read_csv('steam-cleaned.csv')
data_array = data_set.values
input_array = data_array[:, 1:10]
output_array = data_array[:, 10]

# Split data into train and validation sets, then each set into input and labels (referred to as output here)
validation_size = 0.20
input_train, input_validation, output_train, output_validation = model_selection.train_test_split(input_array, output_array, test_size=validation_size, random_state=42)

In [2]:
# List of models we'll use
models = [('LR', LogisticRegression(solver='liblinear')), ('KNN', KNeighborsClassifier()),
          ('CART', DecisionTreeClassifier()), ('NB', GaussianNB()), ('RFC', RandomForestClassifier()),
          ('MLP', MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000))]

simplefilter(action='ignore', category=FutureWarning)  # Otherwise output is clouded with future warnings

# K-fold cross-validation with k=10 to evaluate the models using training data
results = []
names = []

for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=42)
    cv_results = model_selection.cross_val_score(model, input_train, output_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())  # Display both the mean and standard deviation of the accuracy score
    print(msg)

LR: 0.806279 (0.006543)


KNN: 0.773038 (0.023732)
CART: 0.810203 (0.006777)


NB: 0.803924 (0.006646)


RFC: 0.810526 (0.006710)


MLP: 0.809603 (0.006239)


In [3]:
# LR, CART, RFC, MLP are all viable options with sufficiently high accuracies
# We'll feed the validation data into RFC and print out the accuracy score, confusion matrix, and classification report
rfc = RandomForestClassifier()
rfc.fit(input_train, output_train)
predictions = rfc.predict(input_validation)
print(accuracy_score(output_validation, predictions))
print(confusion_matrix(output_validation, predictions))
print(classification_report(output_validation, predictions))

0.8151431209602955
[[2340  329]
 [ 672 2074]]
              precision    recall  f1-score   support

           0       0.78      0.88      0.82      2669
           1       0.86      0.76      0.81      2746

    accuracy                           0.82      5415
   macro avg       0.82      0.82      0.81      5415
weighted avg       0.82      0.82      0.81      5415



In [4]:
# The model performs decently with unseen data, with good accuracy score, precision, and recall
# Next step is algorithm tuning