In [2]:
import pandas as pd
from pathlib import Path
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance
from sklearn.multioutput import MultiOutputClassifier
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Load the 'train_stage3.csv' found in every sub-folder of the training
# directory and combine them into one frame. Only the train split is read for now.
train_dir = Path('train')

# One DataFrame per sub-folder of `train_dir`, in sorted folder order.
processed_data = []

# Path.iterdir() yields entries in arbitrary, filesystem-dependent order.
# Sorting keeps the concatenated row order -- and therefore any seeded
# train/test split performed on it later -- the same across machines and re-runs.
for folder in sorted(train_dir.iterdir()):
    if folder.is_dir():
        folder_df = pd.read_csv(folder / 'train_stage3.csv')
        processed_data.append(folder_df)

# pd.concat raises a terse "No objects to concatenate" ValueError on an empty
# list; raise the same exception type early with the searched location instead.
if not processed_data:
    raise ValueError(f"No sub-folders found under {train_dir.resolve()}")

# Stack all per-folder frames into a single frame with a fresh 0..n-1 index.
data = pd.concat(processed_data, ignore_index=True)

# Drop the 'start' and 'end' columns into a new frame; `data` keeps them.
processed_df = data.drop(columns=['start', 'end'])

In [4]:
# Disabled experiment: the code below is wrapped in a bare string literal, so it
# never executes (the cell only echoes the string as its output). If re-enabled,
# it would replace each distinct value of the 'room' column with a 1-based
# integer id, numbered in the order the values are returned by unique().
"""
room_mapping = {room: i+1 for i, room in enumerate(processed_df['room'].unique())}
processed_df['room'] = processed_df['room'].map(room_mapping)"""

"\nroom_mapping = {room: i+1 for i, room in enumerate(processed_df['room'].unique())}\nprocessed_df['room'] = processed_df['room'].map(room_mapping)"

In [5]:
# Count missing values per column; a zero in every row means nothing needs imputing.
missing_counts = processed_df.isna().sum()
missing_counts


location_idx                0
Kitchen_AP                  0
Lounge_AP                   0
Upstairs_AP                 0
Study_AP                    0
2d_movement_mean_x          0
2d_movement_mean_y          0
2d_movement_std_x           0
2d_movement_std_y           0
3d_movement_mean_x          0
3d_movement_mean_y          0
3d_movement_mean_z          0
3d_movement_std_x           0
3d_movement_std_y           0
3d_movement_std_z           0
2d_shape_mean_width         0
2d_shape_mean_height        0
2d_shape_std_width          0
2d_shape_std_height         0
3d_shape_mean_width         0
3d_shape_mean_height        0
3d_shape_std_width          0
3d_shape_std_height         0
x_mean                      0
y_mean                      0
z_mean                      0
x_average_jerk              0
x_average_absolute_value    0
x_median                    0
x_std_dev                   0
x_max_value                 0
x_min_value                 0
x_max_absolute_value        0
y_average_

In [6]:
# Split the dataset into labels (y) and features (X).
# The label column is expanded through a list into its own DataFrame.
y = pd.DataFrame(processed_df['target_vector'].tolist())
X = processed_df.drop(columns='target_vector')

# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Model choice: one random forest per output column.
# Alternatives previously tried in this cell (swap in for `clf` to compare):
#   SVC(kernel='linear', random_state=42)
#   KNeighborsClassifier(n_neighbors=3)
#   GradientBoostingClassifier(random_state=42)
#   DecisionTreeClassifier(random_state=42)
base_forest = RandomForestClassifier(n_estimators=100, random_state=42)
clf = MultiOutputClassifier(base_forest)

# Fit on the training split, then score predictions on the held-out split.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.5728


In [6]:
# Disabled cell: the whole snippet below is a bare string literal, so none of it
# runs and the cell only echoes the text as its output. If re-enabled, it would
# read `clf.feature_importances_`, pair the values with `X.columns`, sort them in
# descending order and draw a horizontal bar chart.
# NOTE(review): the training cell wraps the forest in MultiOutputClassifier; that
# wrapper is not expected to expose `feature_importances_` itself (the fitted
# per-output models should be in `clf.estimators_`) -- confirm before re-enabling.
"""
# Feature Engineering - method 1 - does not work with SVM

# get feature importances - from built in command
importances = clf.feature_importances_

# convert the importances into a DataFrame
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
})

# sort the DataFrame to find out the most important features
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# plot
plt.figure(figsize=(10, 8))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'], color='skyblue')
plt.xlabel('Relative Importance')
plt.title('Feature Importances')
plt.gca().invert_yaxis() 
plt.show()
"""

"\n# Feature Engineering - method 1 - does not work with SVM\n\n# get feature importances - from built in command\nimportances = clf.feature_importances_\n\n# convert the importances into a DataFrame\nfeature_importance_df = pd.DataFrame({\n    'Feature': X.columns,\n    'Importance': importances\n})\n\n# sort the DataFrame to find out the most important features\nfeature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)\n\n# plot\nplt.figure(figsize=(10, 8))\nplt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'], color='skyblue')\nplt.xlabel('Relative Importance')\nplt.title('Feature Importances')\nplt.gca().invert_yaxis() \nplt.show()\n"

In [7]:
# Feature Engineering - method 2: permutation importance

# Permute each feature of the held-out split 5 times (seeded, all cores) and
# collect the resulting importance scores for the fitted model.
results = permutation_importance(
    clf, X_test, y_test, n_repeats=5, random_state=42, n_jobs=-1
)

# Pair every feature name with its mean importance, most important first.
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': results.importances_mean}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(
    feature_importance_df['Feature'],
    feature_importance_df['Importance'],
    color='skyblue',
)
ax.set_xlabel('Relative Importance')
ax.set_title('Feature Importances')
ax.invert_yaxis()  # put the first (largest) row at the top of the chart
plt.show()


