In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, cross_validate
import statistics

In [2]:
# Load the raw training data.
df = pd.read_csv("train.csv")

# Rescale CO2 to the range [0, 1] (original note: motivated by the
# right-skewed distribution; min-max scaling is linear, so it changes the
# range but not the shape of the distribution).
# MinMaxScaler disregards NaN while fitting and keeps them on transform, so
# the gaps are still present for the forward fill below.
# NOTE(review): the scaler is fitted on the whole data set, i.e. before any
# train/test split - confirm this is acceptable for the evaluation.
scaler = MinMaxScaler()
co2_column = df["CO2"].to_numpy().reshape(-1, 1)
df["CO2"] = scaler.fit_transform(co2_column).ravel()

# Fill missing values with the last valid observation.
# (`fillna(method="ffill")` is deprecated in pandas 2.x; `ffill()` is the
# supported spelling and avoids the in-place mutation.)
df = df.ffill()

# Derive day-of-month and hour-of-day from the timestamp column. They stay
# numeric because they are used as SVM features; casting them to `str` only
# worked through an implicit string -> float conversion inside scikit-learn.
df["Datum"] = pd.to_datetime(df["Datum"])
df["tag"] = df["Datum"].dt.day
df["uhrzeit"] = df["Datum"].dt.hour
df_modified = df.drop(columns="Datum")

# Split into feature matrix X and 1-D label vector y.
X = df_modified.drop(columns=["Anwesenheit"])
y = df_modified["Anwesenheit"].to_numpy()

# Alle Spalten skalieren? SVM sei sehr sensitiv auf Skalierung?

## Hyperparameter Suche mit exhaustive GridSearchCV - rein auf recall fokussiert

In [6]:
from sklearn.metrics import classification_report

# Hold out 20 % of the rows for the final evaluation; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Candidate hyper-parameters, one dictionary per kernel family.
tuned_parameters = [
    {"kernel": ["rbf"], "gamma": [1e-4, 0.1, 1], "C": [1, 10]},
    {"kernel": ["linear"], "C": [1, 10]},
    {"kernel": ["poly"], "C": [1, 10]},
]

# Exhaustive search over all candidates, selecting the model by recall.
# n_jobs=-1 evaluates the candidates in parallel on all available cores;
# this shortens the (very long) run time without changing the result.
clf = GridSearchCV(SVC(), tuned_parameters, scoring="recall", n_jobs=-1)
clf.fit(X_train, y_train)

# Best parameter combination and its mean cross-validated recall.
best_parameters = clf.best_params_
print(best_parameters)
best_result = clf.best_score_
print(best_result)

# Evaluate the selected model on the held-out test split.
y_true, y_pred = y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))

{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
0.9804794520547946
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      2116
           1       0.98      0.99      0.98       733

    accuracy                           0.99      2849
   macro avg       0.99      0.99      0.99      2849
weighted avg       0.99      0.99      0.99      2849



In [8]:
from joblib import dump, load

# Persist the fitted grid-search object to disk. The stray ">>>" REPL prompt
# was removed: it is only tolerated by IPython and is a syntax error in
# plain Python.
dump(clf, 'svm.joblib')

['svm.joblib']

In [None]:
# Use the model: restore the persisted object from disk.
# NOTE: joblib.load unpickles the file - only load files from a trusted source.
clf = load("svm.joblib")

In [7]:
# ROC-Kurve
from sklearn import metrics


## GridSearchCV - ultra lange Laufzeit

## Effizienter: Händische Suche der Parameter

In [None]:
# SVC with a linear kernel, C=1 and class_weight="balanced", because the
# "absent" class is more frequent than the "present" class.

model1 = SVC(kernel="linear", C=1, class_weight="balanced")
scores1 = cross_validate(model1, X, y, cv=2,
                         scoring=["recall", "accuracy", "precision"],
                         return_train_score=True)

# With several scorers, cross_validate stores the validation scores under
# "test_<metric>" (training scores under "train_<metric>"); each entry is an
# array with one value per fold, so average it with the array's own mean().
print(scores1["test_recall"].mean())
print(scores1["test_accuracy"].mean())
print(scores1["test_precision"].mean())

In [None]:
# SVC with an rbf kernel, C=10, gamma=0.1 and class_weight="balanced",
# because the "absent" class is more frequent than the "present" class.

model2 = SVC(kernel="rbf", C=10, gamma=0.1, class_weight="balanced")
# Cross-validate model2 (the original cell evaluated model1 again and then
# printed the scores of the linear model).
scores2 = cross_validate(model2, X, y, cv=5,
                         scoring=["recall", "precision", "accuracy"],
                         return_train_score=True)

# Validation scores live under "test_<metric>"; one value per fold.
print(scores2["test_recall"].mean())
print(scores2["test_accuracy"].mean())
print(scores2["test_precision"].mean())

In [None]:
# TODO: drop the index column from the features
# TODO: MinMax-scale all features
# TODO: upsample the minority class (class_weight is then no longer needed)
# TODO: plot the ROC curve
# TODO: run the model on the test data