In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

### Data Preparation

In [2]:
# Read both CSV exports; fields are separated by ';'.
measures = pd.read_csv('measures.csv', delimiter=';')
to_predict = pd.read_csv('to_predict.csv', delimiter=';')

# Columns that identify a row rather than describe a measurement.
NON_FEATURE_COLUMNS = ['subject', 'activity']


def _commas_to_dots(frame):
    """Return ``frame`` with ',' replaced by '.' in every string cell.

    Non-string cells are passed through unchanged.
    """
    return frame.map(lambda cell: cell.replace(',', '.') if isinstance(cell, str) else cell)


# The numbers use a decimal comma: normalise the text first, then cast every
# feature column (and the subject id of `measures`) to float.
measures = _commas_to_dots(measures)
feature_dtypes = {name: float for name in measures.columns if name not in NON_FEATURE_COLUMNS}
measures = measures.astype({**feature_dtypes, 'subject': float})

# `to_predict` is cast using the feature columns of `measures`.
to_predict = _commas_to_dots(to_predict).astype(feature_dtypes)

# Subject-wise hold-out: rows are assigned to a split by subject id.
# NOTE(review): subjects 2 and 4 appear in neither list — confirm this is intentional.
training_subjects = [1, 3, 5, *range(6, 27)]
test_subjects = list(range(27, 31))

subject_ids = measures['subject']
training_set = measures[subject_ids.isin(training_subjects)]
test_set = measures[subject_ids.isin(test_subjects)]

# Guard against a subject ending up in both splits.
assert not training_set['subject'].isin(test_subjects).any(), "Training and test sets overlap!"

# Features are everything except the identifier columns; the label is 'activity'.
X_train, y_train = training_set.drop(columns=NON_FEATURE_COLUMNS), training_set['activity']
X_test, y_test = test_set.drop(columns=NON_FEATURE_COLUMNS), test_set['activity']

# The subject id is not a model input.
to_predict = to_predict.drop(columns=['subject'])


### Feature Selection

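Results recorded from an earlier run with `LogisticRegression(max_iter=10000, C=38, dual=False)`:

- Optimal number of features: 138
- Test Set Accuracy with Optimal Features: 0.9683501683501684

Note: the code cell below sets `max_iter=1000`, not `10000`.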

In [4]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

# Estimator that RFECV refits on each candidate feature subset.
# NOTE(review): the results noted above this cell mention max_iter=10000,
# while this cell uses max_iter=1000 — confirm which setting is intended.
log_reg = LogisticRegression(max_iter=1000, C=38, dual=False)

# Recursive feature elimination, removing one feature per step and scoring
# each subset by accuracy over 5 stratified folds.
folds = StratifiedKFold(5)
rfecv = RFECV(
    estimator=log_reg,
    step=1,
    cv=folds,
    scoring='accuracy',
)
rfecv.fit(X_train, y_train)

# Size of the subset RFECV settled on.
optimal_num_features = rfecv.n_features_
print("Optimal number of features:", optimal_num_features)

# `support_` is a boolean mask over the training columns.
selected_features = X_train.columns[rfecv.support_]
print("Selected features:", selected_features)

# Restrict every dataset to the selected columns.
X_train_optimal_log_reg, X_test_optimal_log_reg, to_predict = (
    frame[selected_features] for frame in (X_train, X_test, to_predict)
)



Optimal number of features: 138
Selected features: Index(['tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-max()-X',
       'tBodyAcc-max()-Z', 'tBodyAcc-min()-X', 'tBodyAcc-min()-Y',
       'tBodyAcc-min()-Z', 'tBodyAcc-iqr()-Z', 'tBodyAcc-entropy()-X',
       'tBodyAcc-entropy()-Y',
       ...
       'fBodyBodyAccJerkMag-std()', 'fBodyBodyAccJerkMag-max()',
       'fBodyBodyAccJerkMag-entropy()', 'fBodyBodyAccJerkMag-skewness()',
       'fBodyBodyGyroMag-min()', 'fBodyBodyGyroMag-meanFreq()',
       'fBodyBodyGyroJerkMag-min()', 'fBodyBodyGyroJerkMag-skewness()',
       'angle(X,gravityMean)', 'angle(Y,gravityMean)'],
      dtype='object', length=138)


 ### SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Seed for the resampler so repeated runs produce the same resampled data.
SMOTE_SEED = 6

# Resample only the training split (already reduced to the selected features);
# the test split is left untouched.
smote = SMOTE(random_state=SMOTE_SEED)
resampled = smote.fit_resample(X_train_optimal_log_reg, y_train)
X_train_smote, y_train_smote = resampled

print("SMOTE finished")

### Training Model SVM

In [None]:
# RBF-kernel SVM trained on the SMOTE-resampled, feature-selected training data.
# probability=True is kept so the fitted model still exposes predict_proba;
# it is not needed for the predictions made in this cell.
svm_smote = SVC(C=40, gamma=0.055, kernel='rbf', probability=True)
svm_smote.fit(X_train_smote, y_train_smote)

# Evaluate on the held-out subjects.
y_pred_smote = svm_smote.predict(X_test_optimal_log_reg)
accuracy_smote = accuracy_score(y_test, y_pred_smote)

# Fix the row/column order of the matrix to the classes the model was trained
# on, so the heatmap ticks can show activity names rather than bare indices.
activity_labels = svm_smote.classes_
conf_matrix_smote = confusion_matrix(y_test, y_pred_smote, labels=activity_labels)

print("SMOTE SVM Test Set Accuracy:", accuracy_smote)

# Confusion matrix heatmap (rows: true activity, columns: predicted activity).
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    conf_matrix_smote,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=activity_labels,
    yticklabels=activity_labels,
    ax=ax,
)
ax.set_title('SMOTE SVM Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()


### Predicting and Saving

In [None]:
import pandas as pd

# The model was fitted on feature columns only, so strip any identifier
# column before predicting. errors='ignore' makes this a no-op when the
# column has already been removed earlier in the notebook.
to_predict_features = to_predict.drop(columns=['subject'], errors='ignore')

# Make predictions on the to_predict dataset
predictions = svm_smote.predict(to_predict_features)

# One row per input row, in the same order as `to_predict`.
predictions_df = pd.DataFrame(predictions, columns=['Predicted Activity'])

# Carry the subject id over when it is available. Values are attached by
# position (not by index label) so a non-default index on `to_predict`
# cannot misalign or blank out rows.
if 'subject' in to_predict.columns:
    predictions_df['subject'] = to_predict['subject'].to_numpy()

# Sanity check: exactly one prediction per input row.
assert len(predictions_df) == len(to_predict), "Prediction count does not match input rows"

# Save the predictions to a CSV file
predictions_df.to_csv('predicted_activities.csv', index=False)

print("Predictions saved to 'predicted_activities.csv'")
