In [65]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
import numpy as np


# Load the data.
# Both files use ';' as the field separator and ',' as the decimal mark, so
# pandas is told about the decimal mark up front instead of rewriting every
# string cell after the fact.
NON_FEATURE_COLUMNS = ['subject', 'activity']


def _read_semicolon_csv(path):
    """Read a ';'-separated CSV with ',' decimals and cast its feature columns to float.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed frame. Every column except 'subject' and 'activity' is cast
        to float; those two columns (when present) keep the dtype pandas
        inferred while parsing.

    Raises
    ------
    ValueError
        Raised by ``astype`` if a feature column holds a value that cannot be
        converted to float.
    """
    frame = pd.read_csv(path, delimiter=';', decimal=',')
    # Build the dtype mapping from THIS frame's columns, so a file with a
    # different column set cannot trigger a KeyError or leave columns unconverted.
    feature_columns = [col for col in frame.columns if col not in NON_FEATURE_COLUMNS]
    return frame.astype({col: float for col in feature_columns})


measures = _read_semicolon_csv('measures.csv')
# 'subject' is stored as float in the labelled data set only.
measures['subject'] = measures['subject'].astype(float)

to_predict = _read_semicolon_csv('to_predict.csv')


# Select only the columns that include "entropy" and the "subject" and "activity" columns
#columns_to_use = [col for col in measures.columns if 'entropy' in col] + ['subject', 'activity']
#measures_filtered = measures[columns_to_use]
#to_predict_filtered = to_predict[[col for col in to_predict.columns if 'entropy' in col] + ['subject']]

# Subject IDs belonging to each split.
# Subjects 2 and 4 appear in neither list.
training_subjects = [1, 3, 5, *range(6, 27)]
test_subjects = list(range(27, 31))

# Pick the rows of `measures` for each split by subject ID
training_set = measures.loc[measures['subject'].isin(training_subjects)]
test_set = measures.loc[measures['subject'].isin(test_subjects)]

# Guard: no training row may belong to a test subject
assert not training_set['subject'].isin(test_subjects).any(), "Training and test sets overlap!"

# Features are every column except 'subject' and 'activity'; the label is 'activity'
X_train = training_set.drop(['subject', 'activity'], axis=1)
y_train = training_set['activity']

X_test = test_set.drop(['subject', 'activity'], axis=1)
y_test = test_set['activity']




In [69]:
# Fixed seed for every estimator that accepts one, so a re-run of this cell
# reproduces the same model. This matters for the active ensemble too:
# per the scikit-learn docs, SVC uses random numbers when probability=True.
RANDOM_STATE = 42

# Initialize classifiers with chosen hyperparameters.
# All candidates are instantiated; only those listed (uncommented) in the
# VotingClassifier below are actually trained.
log_reg = LogisticRegression(max_iter=10000, C=38, dual=False)
svm = SVC(C=40, gamma=0.055, kernel='rbf', probability=True, random_state=RANDOM_STATE)
knn = KNeighborsClassifier(n_neighbors=7, weights='distance')
dt = DecisionTreeClassifier(max_depth=100, min_samples_split=2, random_state=RANDOM_STATE)
rf = RandomForestClassifier(max_depth=100, n_estimators=1000, random_state=RANDOM_STATE)
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=RANDOM_STATE)
nb = GaussianNB()
ada = AdaBoostClassifier(n_estimators=100, learning_rate=1, algorithm="SAMME", random_state=RANDOM_STATE)
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='lbfgs', alpha=0.002, learning_rate='adaptive', max_iter=2500, random_state=RANDOM_STATE)
xgb = XGBClassifier(n_estimators=500, learning_rate=0.2, max_depth=5, subsample=0.8, colsample_bytree=0.8, use_label_encoder=False, eval_metric='mlogloss', random_state=RANDOM_STATE)

# Define the ensemble of classifiers
ensemble = VotingClassifier(estimators=[
    ('log_reg', log_reg),
    ('svm', svm),
    #('knn', knn),
    #('dt', dt),
    #('rf', rf),
    #('gb', gb),
    #('nb', nb),
    #('ada', ada),
    #('mlp', mlp),
    #('xgb', xgb)
], voting='soft')  # Use 'soft' for probability-based voting

# Fit the ensemble model
ensemble.fit(X_train, y_train)

# Predict on the test set once and derive every metric from that prediction.
# A classifier's score() is the accuracy of predict(), so calling both would
# run the whole ensemble over the test set twice for the same number.
y_pred = ensemble.predict(X_test)

# Calculate accuracy and classification error
accuracy = accuracy_score(y_test, y_pred)
classification_error = 1 - accuracy  # not printed below; kept available as a variable

print("-->", accuracy)

# Print the results
print("\nEnsemble Model:")
print("Test Set Accuracy:", accuracy)

# subjects 1 to 16 in training
# run one Test Set Accuracy:     0.9602693602693603
# only logReg Test Set Accuracy: 0.9629629629629629
# only svm Test Set Accuracy:    0.9622895622895623
# LR and SVM Test Set Accuracy:  0.9663299663299664 !!!
# xgb Test Set Accuracy:         0.9488215488215488
# xgb lr svm Test Set Accuracy:  0.9643097643097643


# xgb optimize Test Set Accuracy:       0.9434343434343434
# 0.2 learning rate Test Set Accuracy:  0.9474747474747475
# 0.5 Test Set Accuracy:                0.9420875420875421

# mlp optimize Test Set Accuracy:               0.938047138047138
# solver a -> lb Test Set Accuracy:             0.958922558922559
# activation logistic <- al Test Set Accuracy:  0.9501683501683502
# alpha from 001 to .002 Test Set Accuracy:     0.9616161616161616

# svm lr mlp not better yet Test Set Accuracy: 0.9602693602693603
# opti rf 50 1000 --> 0.9643097643097643
# rf 100 1000 --> --> 0.9649831649831649

# training with all
# lr svm Test Set Accuracy: 0.9717171717171718

--> 0.9656565656565657

Ensemble Model:
Test Set Accuracy: 0.9656565656565657


In [47]:

# Prepare the to_predict features, ensuring to match the training features
#to_predict_features = to_predict_filtered[X_train.columns]

# Predict activities in to_predict.csv
#predicted_activities = ensemble.predict(to_predict_features)

# Add predictions to the to_predict DataFrame
#to_predict_filtered['predicted_activity'] = predicted_activities

# Display the first few rows with predictions
#print("\nPredictions on to_predict.csv:")
#print(to_predict_filtered.head())