In [19]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
import numpy as np
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold

def _load_measurements(path):
    """Read a ';'-delimited CSV whose numbers use ',' as the decimal mark.

    Every column except 'activity' is converted to float (this includes
    'subject'). The 'activity' column, when present, is returned untouched so
    that label text is never altered by the decimal-separator fix.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Columns in file order, with all non-'activity' columns as float.

    Raises
    ------
    ValueError
        If a non-'activity' value cannot be parsed as a float.
    """
    raw = pd.read_csv(path, delimiter=';')
    numeric_columns = [col for col in raw.columns if col != 'activity']

    # Series.map is applied column by column (instead of DataFrame.map) so the
    # conversion also runs on pandas versions that predate DataFrame.map.
    numeric_part = raw[numeric_columns].apply(
        lambda column: column.map(
            lambda value: value.replace(',', '.') if isinstance(value, str) else value
        )
    ).astype(float)

    # Re-attach whatever was not converted and restore the file's column order.
    label_part = raw.drop(columns=numeric_columns)
    return pd.concat([numeric_part, label_part], axis=1)[raw.columns]


# Load the labelled measurements and the unlabelled rows to predict
measures = _load_measurements('measures.csv')
to_predict = _load_measurements('to_predict.csv')

# Labels and the full column set minus the label ('subject' is still included here)
y = measures['activity']
X = measures.drop(columns=['activity'])

# The data is split by subject id rather than with a random train_test_split,
# so that no subject contributes rows to both sets.
training_subjects = [1, 3, *range(5, 27)]
test_subjects = list(range(27, 31))

# Rows belonging to each group of subjects
training_set = measures.loc[measures['subject'].isin(training_subjects)]
test_set = measures.loc[measures['subject'].isin(test_subjects)]

# Sanity check: no test subject may appear among the training rows
assert set(training_set['subject']).isdisjoint(test_subjects), "Training and test sets overlap!"

# Features are every column except the subject id and the label
X_train = training_set.drop(['subject', 'activity'], axis=1)
X_test = test_set.drop(['subject', 'activity'], axis=1)

y_train = training_set['activity']
y_test = test_set['activity']


# Features of the rows to predict (only the subject id is removed)
to_predict_test_X = to_predict.drop('subject', axis=1)


In [23]:
# Feature selection: fit the variance filter (default threshold) on the
# training features only, then apply the same column mask to both splits.
selector = VarianceThreshold().fit(X_train)
X_train_important = selector.transform(X_train)
X_test_important = selector.transform(X_test)

# Names of the columns the filter kept
selected_features = X_train.columns[selector.get_support()]

In [26]:
# Initialize classifiers with chosen hyperparameters.
# Only the estimators listed in the VotingClassifier below are actually
# trained; the others are kept so they can be switched in by uncommenting.
log_reg = LogisticRegression(max_iter=10000, C=38, dual=False)
svm = SVC(C=40, gamma=0.055, kernel='rbf', probability=True)  # probability=True is required for soft voting
knn = KNeighborsClassifier(n_neighbors=7, weights='distance')
dt = DecisionTreeClassifier(max_depth=100, min_samples_split=2)
rf = RandomForestClassifier(max_depth=100, n_estimators=1000)
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
nb = GaussianNB()
ada = AdaBoostClassifier(n_estimators=100, learning_rate=1, algorithm="SAMME")
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='lbfgs', alpha=0.002, learning_rate='adaptive', max_iter=2500)
xgb = XGBClassifier(n_estimators=500, learning_rate=0.2, max_depth=5, subsample=0.8, colsample_bytree=0.8, use_label_encoder=False, eval_metric='mlogloss')

# Define the ensemble of classifiers
ensemble = VotingClassifier(estimators=[
    ('log_reg', log_reg),
    ('svm', svm),
    #('knn', knn),
    #('dt', dt),
    #('rf', rf),
    #('gb', gb),
    #('nb', nb),
    #('ada', ada),
    #('mlp', mlp),
    #('xgb', xgb)
], voting='soft')  # Use 'soft' for probability-based voting

# Fit on the selected features (plain array) and report held-out accuracy
ensemble.fit(X_train_important, y_train)
print("-->", ensemble.score(X_test_important, y_test))

# Refit on the same data wrapped in a DataFrame with the selected column
# names, so the model used for the final prediction is trained on named columns.
ensemble.fit(pd.DataFrame(X_train_important, columns=selected_features), y_train)
print("-->", ensemble.score(pd.DataFrame(X_test_important, columns=selected_features), y_test))

# Predict on the unlabeled data. It must go through the same fitted selector
# as the training data; otherwise the feature set differs from what the
# ensemble was trained on whenever the selector dropped a column.
# Indexing by X_train.columns first guarantees the column set and order the
# selector was fitted with.
to_predict_important = pd.DataFrame(
    selector.transform(to_predict_test_X[X_train.columns]),
    columns=selected_features,
    index=to_predict_test_X.index,
)
y_pred = ensemble.predict(to_predict_important)


--> 0.9723905723905724
--> 0.9723905723905724
