In [4]:
import pandas as pd

# Load the data with ';' delimiter
measures = pd.read_csv('measures.csv', delimiter=';')
to_predict = pd.read_csv('to_predict.csv', delimiter=';')

# Convert comma decimal separator to dot and convert columns to appropriate types
measures = measures.map(lambda x: x.replace(',', '.') if isinstance(x, str) else x)
measures = measures.astype({col: float for col in measures.columns if col not in ['subject', 'activity']})
measures['subject'] = measures['subject'].astype(float)

to_predict = to_predict.map(lambda x: x.replace(',', '.') if isinstance(x, str) else x)
to_predict = to_predict.astype({col: float for col in measures.columns if col not in ['subject', 'activity']})

# Print the first 10 rows of the columns 'subject' and 'activity'
print("First 10 rows of 'subject' and 'activity' in measures.csv:")
print(measures[['subject', 'activity']].head(10))


First 10 rows of 'subject' and 'activity' in measures.csv:
   subject  activity
0      1.0  STANDING
1      1.0  STANDING
2      1.0  STANDING
3      1.0  STANDING
4      1.0  STANDING
5      1.0  STANDING
6      1.0  STANDING
7      1.0  STANDING
8      1.0  STANDING
9      1.0  STANDING


In [5]:
# Define the subjects for training and test sets
training_subjects = [1.0, 3.0, 5.0, 6.0]
test_subjects = [27.0, 28.0, 29.0, 30.0]

# Split the measures dataset into training and test sets
training_set = measures[measures['subject'].isin(training_subjects)]
test_set = measures[measures['subject'].isin(test_subjects)]

# Ensure there's no overlap
assert not training_set['subject'].isin(test_subjects).any(), "Training and test sets overlap!"

# Display the size of each dataset
print("\nTraining Set Size:", training_set.shape)
print("Test Set Size:", test_set.shape)



Training Set Size: (1315, 563)
Test Set Size: (1485, 563)


In [6]:
# Define features and labels
X_train = training_set.drop(columns=['subject', 'activity'])
y_train = training_set['activity']

X_test = test_set.drop(columns=['subject', 'activity'])
y_test = test_set['activity']

# Display the first few rows of the feature sets
print("\nTraining Features:")
print(X_train)

print("\nTest Features:")
print(X_test)



Training Features:
      tBodyAcc-mean()-X  tBodyAcc-mean()-Y  tBodyAcc-mean()-Z  \
0              0.288585          -0.020294          -0.132905   
1              0.278419          -0.016411          -0.123520   
2              0.279653          -0.019467          -0.113462   
3              0.279174          -0.026201          -0.123283   
4              0.276629          -0.016570          -0.115362   
...                 ...                ...                ...   
1310           0.298282           0.008336          -0.053625   
1311           0.141531          -0.095164          -0.181721   
1312           0.214323          -0.025805          -0.105071   
1313           0.317841           0.061345          -0.019359   
1314           0.270468          -0.011791          -0.086065   

      tBodyAcc-std()-X  tBodyAcc-std()-Y  tBodyAcc-std()-Z  tBodyAcc-mad()-X  \
0            -0.995279         -0.983111         -0.913526         -0.995112   
1            -0.998245         -0.97530

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Initialize the model
model = RandomForestClassifier(n_estimators=15, random_state=42)

# Perform 10-fold cross-validation
cv_scores = cross_val_score(model, X_train, y_train, cv=10)

print("\nCross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Score:", cv_scores.mean())



Cross-Validation Scores: [0.97727273 0.98484848 0.98484848 1.         0.96212121 0.9389313
 0.90839695 0.96946565 0.96946565 0.86259542]
Mean Cross-Validation Score: 0.9557945870922971


In [35]:
from sklearn.metrics import accuracy_score

# Train the final model on all available training data
final_model = RandomForestClassifier(n_estimators=100, random_state=42)
final_model.fit(X_train, y_train)

# Predict on the test set
y_pred = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate classification error
classification_error = 1 - accuracy

# Print the results
print("\nAccuracy:", accuracy)
print("Classification Error:", classification_error)

# Prepare the to_predict features, ensuring to match the training features
to_predict_features = to_predict[X_train.columns]

# Predict activities in to_predict.csv
predicted_activities = final_model.predict(to_predict_features)

# Add predictions to the to_predict DataFrame
to_predict['predicted_activity'] = predicted_activities

# Display the first few rows with predictions
#print("\nPredictions on to_predict.csv:")
#print(to_predict.head())



Accuracy: 0.9245791245791246
Classification Error: 0.07542087542087539


In [32]:
# Train the final model on all available training data
final_model = RandomForestClassifier(n_estimators=100, random_state=42)
final_model.fit(X_train, y_train)

# Predict activities in to_predict.csv
to_predict_features = to_predict.drop(columns=['subject'])
predicted_activities = final_model.predict(to_predict_features)

# Add predictions to the to_predict DataFrame
to_predict_with_predictions = pd.concat([to_predict, pd.Series(predicted_activities, name='predicted_activity')], axis=1)


# Display the first few rows with predictions
print("\nPredictions on to_predict.csv:")
print(to_predict.head())


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- predicted_activity



Accuracy: 0.9245791245791246
Classification Error: 0.07542087542087539

Predictions on to_predict.csv:
   tBodyAcc-mean()-X  tBodyAcc-mean()-Y  tBodyAcc-mean()-Z  tBodyAcc-std()-X  \
0           0.257178          -0.023285          -0.014654         -0.938404   
1           0.286027          -0.013163          -0.119083         -0.975415   
2           0.275485          -0.026050          -0.118152         -0.993819   
3           0.270298          -0.032614          -0.117520         -0.994743   
4           0.274833          -0.027848          -0.129527         -0.993852   

   tBodyAcc-std()-Y  tBodyAcc-std()-Z  tBodyAcc-mad()-X  tBodyAcc-mad()-Y  \
0         -0.920091         -0.667683         -0.952501         -0.925249   
1         -0.967458         -0.944958         -0.986799         -0.968401   
2         -0.969926         -0.962748         -0.994403         -0.970735   
3         -0.973268         -0.967091         -0.995274         -0.974471   
4         -0.967445         -0

In [7]:
from sklearn.metrics import accuracy_score

def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, to_predict):
    # Train the model on all available training data
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Calculate classification error
    classification_error = 1 - accuracy

    # Prepare the to_predict features, ensuring to match the training features
    to_predict_features = to_predict[X_train.columns]

    # Predict activities in to_predict.csv
    predicted_activities = model.predict(to_predict_features)

    # Add predictions to the to_predict DataFrame
    to_predict['predicted_activity'] = predicted_activities

    return accuracy, classification_error, to_predict

In [38]:
from sklearn.metrics import accuracy_score
# Example usage
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Call the function
accuracy, classification_error, to_predict_with_predictions = train_and_evaluate_model(
    rf_model, X_train, y_train, X_test, y_test, to_predict
)

# Print the results
print("\nAccuracy:", accuracy)
print("Classification Error:", classification_error)
#print("\nPredictions on to_predict.csv:")
#print(to_predict_with_predictions.head())



Accuracy: 0.9245791245791246
Classification Error: 0.07542087542087539


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

# List of classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=250),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Naive Bayes': GaussianNB(),
    'AdaBoost': AdaBoostClassifier(algorithm="SAMME"),
    'MLP': MLPClassifier()
}

# Iterate through classifiers and evaluate each
for name, model in classifiers.items():
    accuracy, classification_error, _ = train_and_evaluate_model(
        model, X_train, y_train, X_test, y_test, to_predict
    )
    print(f"\n{name}:")
    print("Accuracy:", accuracy)
    print("Classification Error:", classification_error)


  to_predict['predicted_activity'] = predicted_activities



Logistic Regression:
Accuracy: 0.9239057239057239
Classification Error: 0.0760942760942761

SVM:
Accuracy: 0.9138047138047138
Classification Error: 0.08619528619528616

KNN:
Accuracy: 0.8868686868686869
Classification Error: 0.11313131313131308

Decision Tree:
Accuracy: 0.7858585858585858
Classification Error: 0.2141414141414142

Random Forest:
Accuracy: 0.9272727272727272
Classification Error: 0.07272727272727275


KeyboardInterrupt: 

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# List of classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=500),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Naive Bayes': GaussianNB(),
    'AdaBoost': AdaBoostClassifier(algorithm="SAMME"),
    'MLP': MLPClassifier()
}

# Number of folds for cross-validation
n_folds = 10

# Iterate through classifiers and evaluate each with cross-validation
for name, model in classifiers.items():
    # Perform cross-validation
    cv_scores = cross_val_score(model, X_train, y_train, cv=n_folds)
    mean_accuracy = np.mean(cv_scores)
    classification_error = 1 - mean_accuracy
    
    # Print the results
    print(f"\n{name}:")
    print("Cross-Validation Mean Accuracy:", mean_accuracy)
    #print("Cross-Validation Classification Error:", classification_error)

    # Train the model on the full training set and evaluate on the test set
    accuracy, classification_error, _ = train_and_evaluate_model(
        model, X_train, y_train, X_test, y_test, to_predict
    )
    print("Test Set Accuracy:", accuracy)
    #print("Test Set Classification Error:", classification_error)



Logistic Regression:
Cross-Validation Mean Accuracy: 0.9755956511681703
Test Set Accuracy: 0.9239057239057239

SVM:
Cross-Validation Mean Accuracy: 0.955031228313671
Test Set Accuracy: 0.9138047138047138

KNN:
Cross-Validation Mean Accuracy: 0.9307483229238956
Test Set Accuracy: 0.8868686868686869

Decision Tree:
Cross-Validation Mean Accuracy: 0.9024751330094845
Test Set Accuracy: 0.826936026936027


KeyboardInterrupt: 

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
import numpy as np


In [ ]:


# Define parameter grids for each classifier
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100]
    },
    'SVM': {
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001],
        'kernel': ['linear', 'rbf']
    },
    'KNN': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance']
    },
    'Decision Tree': {
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10]
    },
    'Random Forest': {
        'n_estimators': [10, 50, 100, 200],
        'max_depth': [None, 10, 20, 30]
    },
    'Gradient Boosting': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5]
    },
    'Naive Bayes': {},
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    },
    'MLP': {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam'],
        'alpha': [0.0001, 0.001, 0.01]
    }
}

# List of classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=250),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Naive Bayes': GaussianNB(),
    'AdaBoost': AdaBoostClassifier(algorithm="SAMME"),
    'MLP': MLPClassifier()
}

# Iterate through classifiers and optimize each with GridSearchCV
for name, model in classifiers.items():
    print(f"\nOptimizing {name}...")
    
    # Get the parameter grid for the current classifier
    param_grid = param_grids[name]
    
    # Initialize GridSearchCV
    grid_search = GridSearchCV(model, param_grid, cv=10, n_jobs=-1)
    
    # Fit GridSearchCV
    grid_search.fit(X_train, y_train)
    
    # Get the best estimator
    best_model = grid_search.best_estimator_
    
    # Perform cross-validation with the best model
    cv_scores = cross_val_score(best_model, X_train, y_train, cv=10)
    mean_accuracy = np.mean(cv_scores)
    classification_error = 1 - mean_accuracy
    
    # Print the results
    print(f"Best Parameters: {grid_search.best_params_}")
    print("Cross-Validation Mean Accuracy:", mean_accuracy)
    #print("Cross-Validation Classification Error:", classification_error)

    # Train the model on the full training set and evaluate on the test set
    accuracy, classification_error, _ = train_and_evaluate_model(
        best_model, X_train, y_train, X_test, y_test, to_predict
    )
    print("Test Set Accuracy:", accuracy)
    #print("Test Set Classification Error:", classification_error)



Optimizing Logistic Regression...
Best Parameters: {'C': 10}
Cross-Validation Mean Accuracy: 0.9771223687254222
Test Set Accuracy: 0.9252525252525252

Optimizing SVM...
Best Parameters: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
Cross-Validation Mean Accuracy: 0.9763590099467961
Test Set Accuracy: 0.9272727272727272

Optimizing KNN...
Best Parameters: {'n_neighbors': 7, 'weights': 'distance'}
Cross-Validation Mean Accuracy: 0.9330383992597732
Test Set Accuracy: 0.8875420875420875

Optimizing Decision Tree...
Best Parameters: {'max_depth': 20, 'min_samples_split': 2}
Cross-Validation Mean Accuracy: 0.8933610918343742
Test Set Accuracy: 0.8363636363636363

Optimizing Random Forest...
Best Parameters: {'max_depth': 20, 'n_estimators': 200}
Cross-Validation Mean Accuracy: 0.9535102937774692
Test Set Accuracy: 0.9232323232323232

Optimizing Gradient Boosting...


In [77]:
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier

#xg boost sind gut
param_grid = {
    'penalty': ['l1', 'l2'],  # Regularization type
    'dual': [False, True],  # Dual formulation
    'tol': [1e-4, 1e-3, 1e-2],  # Tolerance for stopping criteria
    'C': [0.01, 0.1, 1, 10, 100],  # Inverse of regularization strength
    'fit_intercept': [True, False],  # Whether to include intercept
    'solver': ['liblinear', 'lbfgs', 'sag', 'saga'],  # Algorithm to use in the optimization problem
    'max_iter': [100, 500, 1000, 10000],  # Maximum number of iterations
    'multi_class': ['ovr', 'multinomial'],  # Multi-class setting
}

mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.001, learning_rate='adaptive', max_iter=1000)
xgb = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, subsample=0.8, colsample_bytree=0.8, use_label_encoder=False, eval_metric='mlogloss')



# Define the ensemble of classifiers
ensemble = VotingClassifier(estimators=[
    ('log_reg', LogisticRegression(multi_class="multinomial", penalty="l2" , fit_intercept=True ,tol=1e-4,dual=False, solver="lbfgs" ,max_iter=10000, C=38)),
    ('svm', SVC(C=40, gamma=0.0555, kernel='rbf', probability=True)),
    #('knn', KNeighborsClassifier(n_neighbors=7, weights='distance')),
    ('rf', RandomForestClassifier(max_depth=100, n_estimators=1000)),
    #('mlp', mlp),
    ('xgb', xgb)

], voting='soft')  # Use 'soft' for probability-based voting

# Evaluate the ensemble
ensemble.fit(X_train, y_train)
accuracy, classification_error, _ = train_and_evaluate_model(
    ensemble, X_train, y_train, X_test, y_test, to_predict
)
print("\nEnsemble Model:")
print("# Test Set Accuracy:", accuracy)
# LogReg Test Set Accuracy: 0.9346801346801347
# SVM Test Set Accuracy: 0.9259259259259259


# Test Set Accuracy: 0.9346801346801347

#XGB Test Set Accuracy: 0.898989898989899
#MLP Test Set Accuracy: 0.9057239057239057

# Test Set Accuracy: 0.9353535353535354


Ensemble Model:
# Test Set Accuracy: 0.94006734006734


In [64]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training data
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

ensemble.fit(X_train_resampled, y_train_resampled)
accuracy, _ , _ = train_and_evaluate_model(
    ensemble, X_train_resampled, y_train_resampled, X_test, y_test, to_predict
)
print("\nEnsemble Model:")
print("Test Set Accuracy:", accuracy)


Ensemble Model:
Test Set Accuracy: 0.9313131313131313


In [35]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

# Example pipeline for logistic regression with feature engineering
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Feature scaling
    ('poly', PolynomialFeatures(degree=2)),  # Interaction terms
    ('model', LogisticRegression(max_iter=10000, C=30))
])

# Use this pipeline in your GridSearchCV
param_grids = {
    'Logistic Regression': {
        'model__C': [0.01, 0.1, 1, 10, 100],
        'poly__degree': [1, 2, 3]  # Try different degrees of polynomial features
    }
    # Add other models' parameters here
}

# Initialize GridSearchCV with the pipeline
grid_search = GridSearchCV(pipeline, param_grids['Logistic Regression'], cv=30, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best estimator and evaluate
best_model = grid_search.best_estimator_
cv_scores = cross_val_score(best_model, X_train, y_train, cv=10)
mean_accuracy = np.mean(cv_scores)
classification_error = 1 - mean_accuracy

print("Best Parameters:", grid_search.best_params__)
print("Cross-Validation Mean Accuracy:", mean_accuracy)
print("Cross-Validation Classification Error:", classification_error)

accuracy, classification_error, _ = train_and_evaluate_model(
    best_model, X_train, y_train, X_test, y_test, to_predict
)
print("Test Set Accuracy:", accuracy)
print("Test Set Classification Error:", classification_error)


KeyboardInterrupt: 

In [ ]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import numpy as np

# Define the parameter grid for Logistic Regression
param_grid = {
    'penalty': ['l1', 'l2'],  # Regularization type
    'dual': [False, True],  # Dual formulation
    'tol': [1e-4, 1e-3, 1e-2],  # Tolerance for stopping criteria
    'C': [0.01, 0.1, 1, 10, 100],  # Inverse of regularization strength
    'fit_intercept': [True, False],  # Whether to include intercept
    'solver': ['liblinear', 'lbfgs', 'sag', 'saga'],  # Algorithm to use in the optimization problem
    'max_iter': [100, 500, 1000, 10000],  # Maximum number of iterations
    'multi_class': ['ovr', 'multinomial'],  # Multi-class setting
}

# Initialize GridSearchCV
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=10, n_jobs=-1)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best estimator
best_model = grid_search.best_estimator_

# Perform cross-validation with the best model
cv_scores = cross_val_score(best_model, X_train, y_train, cv=10)
mean_accuracy = np.mean(cv_scores)
classification_error = 1 - mean_accuracy

# Print the results
print(f"Best Parameters: {grid_search.best_params__}")
print("Cross-Validation Mean Accuracy:", mean_accuracy)

# Train the best model on the full training set and evaluate on the test set
accuracy, _, _ = train_and_evaluate_model(
    best_model, X_train, y_train, X_test, y_test, to_predict
)
print("Test Set Accuracy:", accuracy)
