In [1]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
import numpy as np


# Load the data (both files are ';'-delimited)
measures = pd.read_csv('measures.csv', delimiter=';')
to_predict = pd.read_csv('to_predict.csv', delimiter=';')

# Every column except the subject id and the activity label is treated as a numeric feature.
feature_columns = [col for col in measures.columns if col not in ['subject', 'activity']]

# The dtype mapping below is applied to both frames. DataFrame.astype raises a
# bare KeyError when a mapping key is not a column of the frame, so check
# to_predict explicitly and report which feature columns are absent.
missing_features = [col for col in feature_columns if col not in to_predict.columns]
if missing_features:
    raise ValueError(f"to_predict.csv is missing feature columns: {missing_features}")

# Convert comma decimal separator to dot and convert columns to appropriate types
measures = measures.map(lambda x: x.replace(',', '.') if isinstance(x, str) else x)
measures = measures.astype({col: float for col in feature_columns})
measures['subject'] = measures['subject'].astype(float)

to_predict = to_predict.map(lambda x: x.replace(',', '.') if isinstance(x, str) else x)
to_predict = to_predict.astype({col: float for col in feature_columns})

# Define the subjects for training and test sets.
# NOTE(review): subjects 2 and 4 appear in neither list, so any rows for them
# are dropped from both splits -- confirm this is intentional.
training_subjects = [1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
                     19, 20, 21, 22, 23, 24, 25, 26]
test_subjects = [27, 28, 29, 30]

# Split the measures dataset into training and test sets (split by subject,
# so no subject contributes rows to both sets)
training_set = measures[measures['subject'].isin(training_subjects)]
test_set = measures[measures['subject'].isin(test_subjects)]

# Ensure there's no overlap
assert not training_set['subject'].isin(test_subjects).any(), "Training and test sets overlap!"

# Define features and labels
X_train = training_set.drop(columns=['subject', 'activity'])
y_train = training_set['activity']

X_test = test_set.drop(columns=['subject', 'activity'])
y_test = test_set['activity']




In [4]:
# Rank the features by random-forest importance and keep the ten best.
# Uses X_train, y_train and X_test from the data-preparation cell.

# Fit a 100-tree forest with a fixed seed on the full feature set
# (fit() returns the estimator, so construction and fitting can be chained).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Importance scores, labelled with the feature (column) names
feature_importances = pd.Series(
    data=rf_model.feature_importances_,
    index=X_train.columns,
)

# Show the ten highest-scoring features
top_features = feature_importances.nlargest(n=10)
print(top_features)

# Restrict both splits to those ten columns
X_train_top = X_train.loc[:, top_features.index]
X_test_top = X_test.loc[:, top_features.index]

tGravityAcc-mean()-X      0.034425
angle(X,gravityMean)      0.033263
tGravityAcc-mean()-Y      0.030730
tGravityAcc-max()-X       0.027335
tGravityAcc-min()-Y       0.027280
tGravityAcc-min()-X       0.022995
angle(Y,gravityMean)      0.022869
tGravityAcc-energy()-X    0.021537
tGravityAcc-max()-Y       0.021520
tGravityAcc-energy()-Y    0.017830
dtype: float64


1. Optimal Feature Selection:
The cut-off of 10 features used above is arbitrary, so let's try selecting a different number of features to see whether a better subset exists. Methods such as Recursive Feature Elimination (RFE) with cross-validation can determine the optimal number of features.

2. Recursive Feature Elimination with Cross-Validation (RFECV):
This method finds the best number of features by recursively removing the least important features and evaluating the model's performance with cross-validation after each removal; the feature count with the best cross-validated score is kept.

Here is the code to perform RFECV:

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

# Recursive feature elimination with cross-validation (RFECV) on the training set.
# Uses X_train, y_train, X_test and y_test from the data-preparation cell.
# WARNING: with step=1 one feature is removed per iteration and every candidate
# subset is cross-validated, so this cell can take a long time to run.

# Estimator used by RFECV to rank features while eliminating them
log_reg = LogisticRegression(max_iter=1000, C=38, dual=False)

# Initialize RFECV: 5-fold stratified CV, scored by accuracy.
# n_jobs=-1 fits the folds in parallel; it does not change which features are selected.
rfecv = RFECV(estimator=log_reg, step=1, cv=StratifiedKFold(5), scoring='accuracy', n_jobs=-1)

# Fit RFECV
rfecv.fit(X_train, y_train)

# Get the optimal number of features
optimal_num_features = rfecv.n_features_
print("Optimal number of features:", optimal_num_features)

# Get the names of the retained features (boolean support mask over the columns)
selected_features = X_train.columns[rfecv.support_]
print("Selected features:", selected_features)

# Create new datasets with selected features
X_train_optimal = X_train[selected_features]
X_test_optimal = X_test[selected_features]

# Train a random forest on the selected features and score it on the held-out subjects.
# Note: the features were selected with logistic regression but are evaluated
# here with a random forest.
rf_model_optimal = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_optimal.fit(X_train_optimal, y_train)
accuracy_optimal = rf_model_optimal.score(X_test_optimal, y_test)
print("Test Set Accuracy with Optimal Features:", accuracy_optimal)

# Fresh, unfitted logistic regression with a larger iteration budget; the
# ensemble cells further down reference this name.
log_reg = LogisticRegression(max_iter=10000, C=38, dual=False)


In [13]:
# Initialize classifiers with chosen hyperparameters.
# log_reg is defined here, with the same settings as in the RFECV cell, so this
# cell runs on a fresh kernel without requiring that slow cell to be executed first.
# random_state is fixed on every stochastic estimator so results are repeatable
# (SVC uses it for the internal shuffling behind probability=True).
log_reg = LogisticRegression(max_iter=10000, C=38, dual=False)
svm = SVC(C=40, gamma=0.055, kernel='rbf', probability=True, random_state=42)
knn = KNeighborsClassifier(n_neighbors=7, weights='distance')
dt = DecisionTreeClassifier(max_depth=100, min_samples_split=2, random_state=42)
rf = RandomForestClassifier(max_depth=100, n_estimators=1000, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
nb = GaussianNB()
ada = AdaBoostClassifier(n_estimators=100, learning_rate=1, algorithm="SAMME", random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='lbfgs', alpha=0.002, learning_rate='adaptive', max_iter=2500, random_state=42)
xgb = XGBClassifier(n_estimators=500, learning_rate=0.2, max_depth=5, subsample=0.8, colsample_bytree=0.8, use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Define the ensemble of classifiers.
# Only log_reg and svm are active; uncomment a line to add that candidate to the vote.
ensemble = VotingClassifier(estimators=[
    ('log_reg', log_reg),
    ('svm', svm),
    #('knn', knn),
    #('dt', dt),
    #('rf', rf),
    #('gb', gb),
    #('nb', nb),
    #('ada', ada),
    #('mlp', mlp),
    #('xgb', xgb)
], voting='soft')  # Use 'soft' for probability-based voting

# Fit the ensemble model
ensemble.fit(X_train, y_train)

# Predict on the test set once; ensemble.score(X_test, y_test) would repeat
# exactly this prediction and accuracy computation.
y_pred = ensemble.predict(X_test)

# Calculate accuracy and classification error
accuracy = accuracy_score(y_test, y_pred)
classification_error = 1 - accuracy

print("-->", accuracy)

# Print the results
print("\nEnsemble Model:")
print("Test Set Accuracy:", accuracy)
# Recorded result for log_reg + svm (obtained before the SVC seed was fixed):
# Test Set Accuracy: 0.9717171717171718

--> 0.9717171717171718

Ensemble Model:
Test Set Accuracy: 0.9717171717171718


In [12]:
# Same log_reg + svm soft-voting ensemble, this time trained and scored on the
# RFECV-selected columns (X_train_optimal / X_test_optimal). The printed label
# says "top features", but these are the RFECV-selected features, not the
# ten random-forest features in X_train_top.
ensemble = VotingClassifier(
    estimators=[('log_reg', log_reg), ('svm', svm)],
    voting='soft',  # probability-based voting
).fit(X_train_optimal, y_train)

print("top features -->", ensemble.score(X_test_optimal, y_test))

top features --> 0.9454545454545454
