# Training data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# imports
# feature selection
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import LinearSVC
from sklearn.svm import SVR

# models
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# scoring
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Seed NumPy's global RNG; unseeded random calls later in the notebook draw from it.
np.random.seed(0)

df = pd.read_csv('npf_train.csv')

# Binary target derived from class4: "nonevent" stays as is, every other
# class4 label is collapsed into "event".
is_nonevent = df["class4"] == "nonevent"
class2 = np.where(is_nonevent, "nonevent", "event").astype(object)
df["class2"] = pd.Series(class2, index=df.index).astype("category")

# Columns that are not used as model inputs.
not_needed_columns = ["partlybad", "id", "date"]
df = df.drop(columns=not_needed_columns)

# Integer codes for the binary target.
class2_categories = dict(nonevent=0, event=1)

# Integer codes for the four-way target; a label's position in the tuple is its code.
class4_categories = {
    label: code for code, label in enumerate(("nonevent", "Ia", "Ib", "II"))
}


def categorize_class2(row):
    """Replace the text label in ``row["class2"]`` with its integer code.

    The row is modified in place and the same object is returned.
    Raises KeyError if the label is not a key of ``class2_categories``.
    """
    code = class2_categories[row["class2"]]
    row["class2"] = code
    return row


def categorize_class4(row):
    """Replace the text label in ``row["class4"]`` with its integer code.

    The row is modified in place and the same object is returned.
    Raises KeyError if the label is not a key of ``class4_categories``.
    """
    code = class4_categories[row["class4"]]
    row["class4"] = code
    return row

# Encode both label columns as integer codes with a vectorized lookup.
# (A row-wise DataFrame.apply builds one Python-level Series per row just to
# translate a single cell; Series.map does the same translation column-wise.)
for label_col, codes in (("class2", class2_categories), ("class4", class4_categories)):
    encoded = df[label_col].astype(object).map(codes)
    unmapped = encoded.isna()
    if unmapped.any():
        # Series.map yields NaN for keys missing from the dict; fail loudly,
        # as a plain dict lookup would.
        raise KeyError(
            f"unmapped {label_col} labels: {df.loc[unmapped, label_col].unique().tolist()}"
        )
    df[label_col] = encoded.astype("int64").astype("category")

# Shuffle the rows (frac=1 keeps every row) and renumber the index from 0.
df = df.sample(frac=1).reset_index(drop=True)

# Training

In [3]:
# Settings
y_col = "class4"                    # target column for this model
cv_features = StratifiedKFold(10)   # splitter handed to RFECV
cv = 10                             # fold count passed to cross_val_score in the Score section
step_features = 1                   # RFECV `step` value

# Feature matrix and target: every column except the two label columns is a feature.
X_cols = [col for col in df.columns if col not in (y_col, "class2")]
X_train = df[X_cols]
y_train = df[y_col]

# Model: scale -> recursive feature elimination with CV -> random forest.
clf = RandomForestClassifier(random_state=0)
estimator = RandomForestClassifier(random_state=0)
feature_selector = RFECV(
    estimator=estimator,
    step=step_features,
    cv=cv_features,
    scoring="accuracy",
)
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("feature_selection", feature_selector),
        ("clf", clf),
    ]
)

# Fit on the full training set; the fitted pipeline is the cell's displayed value.
pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection',
                 RFECV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
                       estimator=RandomForestClassifier(random_state=0),
                       scoring='accuracy')),
                ('clf', RandomForestClassifier(random_state=0))])

# Prediction

In [7]:
# Names of the training columns kept by the RFECV step (`support_` is its boolean mask).
# Previously a trailing comma turned this into a 1-tuple wrapping the list.
chosen_features = list(X_train.columns[pipe.named_steps["feature_selection"].support_])

# Read and clean the test data
df = pd.read_csv('npf_test_hidden.csv')

# Same class2 derivation as for the training data; the column is excluded
# from the feature matrix below.
class2 = np.array(["event"]*df.shape[0],dtype="object")
class2[df["class4"]=="nonevent"] = "nonevent"
df["class2"] = class2
df["class2"] = df["class2"].astype("category")

not_needed_columns = ["partlybad", "id", "date", "class4"]
df = df.drop(not_needed_columns, axis=1)

# Select the features by name, in the exact order used for fitting. Passing a
# bare `.values` array dropped the column names, so a reordered or extra
# column in the test file would have been fed to the model silently.
# A missing training column now raises a KeyError here.
X_df = df.loc[:, list(X_train.columns)].copy()
X = X_df

# Per-class probabilities, one column per fitted class.
pred = pipe.predict_proba(X)


# Results

In [13]:
# Code -> label lookup, derived from the encoding used for training so the
# two cannot drift apart.
classes = {code: label for label, code in class4_categories.items()}

# The positional indexing below relies on column i of `pred` belonging to
# class code i; check that against the fitted model instead of assuming it.
assert list(pipe.classes_) == list(classes), "pred columns are not in class4 code order"

# p column: probability of any event type = sum of the non-"nonevent" columns.
p_nonevent = pred[:, 0]
p_event = np.sum(pred[:, 1:], axis=1)

# Most probable class per row, decoded back to its text label.
class_pred = [classes[code] for code in pipe.classes_[np.argmax(pred, axis=1)]]

# One probability column per class, followed by the predicted label and p.
results = pd.DataFrame(
    data={classes[code]: pred[:, col] for col, code in enumerate(pipe.classes_)}
)
results["prediction"] = class_pred
results["p"] = p_event

results[['prediction', 'p']].to_csv('answers_new.csv', index=False)
results

Unnamed: 0,nonevent,Ia,Ib,II,prediction,p
0,0.63,0.10,0.11,0.16,nonevent,0.37
1,0.79,0.08,0.09,0.04,nonevent,0.21
2,0.01,0.09,0.76,0.14,Ib,0.99
3,0.10,0.10,0.58,0.22,Ib,0.90
4,0.80,0.00,0.05,0.15,nonevent,0.20
...,...,...,...,...,...,...
960,0.67,0.00,0.09,0.24,nonevent,0.33
961,1.00,0.00,0.00,0.00,nonevent,0.00
962,0.03,0.02,0.65,0.30,Ib,0.97
963,0.01,0.32,0.25,0.42,II,0.99


# Score

In [14]:
# Reload the training data (the Prediction section reused `df` for the test file).
df = pd.read_csv('npf_train.csv')

# Binary target: "nonevent" stays, every other class4 label becomes "event".
class2 = np.array(["event"]*df.shape[0], dtype="object")
class2[df["class4"]=="nonevent"] = "nonevent"
df["class2"] = class2
df["class2"] = df["class2"].astype("category")

not_needed_columns = ["partlybad", "id", "date"]
df = df.drop(not_needed_columns, axis=1)

# Encode both label columns as integer codes with a vectorized lookup rather
# than a row-wise DataFrame.apply, which builds a Python-level Series per row.
for label_col, codes in (("class2", class2_categories), ("class4", class4_categories)):
    encoded = df[label_col].astype(object).map(codes)
    unmapped = encoded.isna()
    if unmapped.any():
        # Series.map yields NaN for unknown keys; fail loudly like a dict lookup.
        raise KeyError(
            f"unmapped {label_col} labels: {df.loc[unmapped, label_col].unique().tolist()}"
        )
    df[label_col] = encoded.astype("int64").astype("category")

# Shuffle the rows and renumber the index from 0.
df = df.sample(frac=1).reset_index(drop=True)

# NOTE(review): the target here is class2, so the cross-validation in the next
# cell scores the pipeline re-fitted on the binary label, not the class4 model
# used for the predictions above. Confirm this is the intended estimate.
y_col = "class2"

# Data: every column except the two label columns is a feature.
X_cols = [col for col in df.columns if col not in (y_col, "class4")]
X_train = df[X_cols]
y_train = df[y_col]


In [15]:
# Cross-validated score of the pipeline on the current X_train / y_train,
# using `cv` folds; the summary figure is the mean over the folds.
scores = cross_val_score(estimator=pipe, X=X_train, y=y_train, cv=cv)
cv_score = np.mean(scores)

In [16]:
# Show the mean of the per-fold scores computed in the previous cell.
cv_score

0.8712560386473431