##### <ins>Import required libs</ins>

In [None]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as pyplot

##### <ins>Taking a look at the DFs head:</ins>

In [None]:
gyro = pd.read_csv('../datasets/gyro_mobile.csv')
print(gyro.head())

       accX      accY      accZ     gyroX     gyroY     gyroZ timestamp  \
0 -0.496517  3.785628  8.954828 -0.142849 -0.126159 -0.022539   34:22.9   
1 -0.462388  3.869603  9.281898  0.084349  0.096695  0.092130   34:23.0   
2 -0.296084  3.820505  8.930728  0.061763  0.051543  0.071287   34:23.1   
3 -0.469723  3.890110  8.744067  0.007641  0.028679  0.109433   34:23.2   
4 -0.472418  4.109105  8.941207 -0.123640  0.099057  0.051943   34:23.3   

   Activity  
0         1  
1         1  
2         1  
3         1  
4         1  


In [6]:
gyroCols = gyro.columns.to_list()
for col in gyroCols:
    print(f"Column: {col} \n{gyro[col].describe()} \nData Type: {gyro[col].dtype}\n")

Column: accX 
count    31991.000000
mean         0.023825
std          0.741396
min         -3.673361
25%         -0.472193
50%         -0.024998
75%          0.477208
max          4.678671
Name: accX, dtype: float64 
Data Type: float64

Column: accY 
count    31991.000000
mean         2.153858
std          1.085466
min         -4.386029
25%          1.413062
50%          2.119143
75%          2.928435
max          6.377039
Name: accY, dtype: float64 
Data Type: float64

Column: accZ 
count    31991.000000
mean         9.537909
std          2.056358
min          4.296066
25%          7.794217
50%          9.406739
75%         11.158845
max         17.591568
Name: accZ, dtype: float64 
Data Type: float64

Column: gyroX 
count    31991.000000
mean        -0.004493
std          0.307643
min         -1.470421
25%         -0.149783
50%          0.022301
75%          0.177978
max          1.332722
Name: gyroX, dtype: float64 
Data Type: float64

Column: gyroY 
count    31991.000000
mean     

Insights:
- 31991 data points
- Every feature is continuous
- Activity is either 1 or 0 (binary classification)
- Dataset contains a timestamp that might be dropped

##### <ins>Dropping timestamp and splitting data into training and testing</ins>

In [7]:
gyro = gyro.drop(columns='timestamp')

x_train, x_test, y_train, y_test = train_test_split(
    gyro.iloc[:,:6],
    gyro.iloc[:,6:],
    test_size=0.2,
    random_state=0
)

##### <ins>Training and Evaluating</ins><br>
Um eine gute Anzahl an Estimators zu bestimmen, wird zuerst ein Modell mithilfe von Early Stopping, sowie einer großen Menge an Estimatoren trainiert. Hiermit wird die beste Anzahl an Iterationen ermittelt und mit dieser Anzahl ein weiteres Modell trainiert.

In [8]:
preModel = XGBClassifier(           # "Spendermodell"
    objective='binary:logistic',
    n_estimators=10000,             # "Große Anzahl an Schaetzern, die nicht erreicht werden soll"
    early_stopping_rounds=100,      # Anzahl an Runden, bei denen sich das Modell nicht verbessern muss, bis abgebrochen wird
    max_depth=2,
    learning_rate=0.1
)

evaldata=[(x_test,y_test)]          # Datensatz zur Evaluierung

preModel.fit(x_train, y_train, eval_set=evaldata, verbose=False)

bIter = preModel.best_iteration     # Beste Anzahl an Estimatoren
print(f'Best Iteration: {bIter}')

model = XGBClassifier(
    objective='binary:logistic',
    n_estimators=bIter,
    max_depth=2,
    learning_rate=0.1
)

model.fit(x_train, y_train)

yhat = model.predict(x_test)
print(accuracy_score(y_test, yhat))

Best Iteration: 624
0.9832786372870761
