<a href="https://colab.research.google.com/github/lexxai/goit_python_ds_hw_05/blob/main/goit_ds_hw_05_extra.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## LOAD DATASET

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, accuracy_score

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler


In [None]:
from pathlib import Path

# Keys of the four dataset variants, named f<flatten flag>s<stat-feature flag>.
DATASET_KEYS = ("f0s0", "f0s1", "f1s0", "f1s1")
# Directory in which the built datasets are cached as feather files.
SAVED_DATAFRAME_BASE = Path("/content/")
# Download link of the zipped source data.
URL = "https://drive.usercontent.google.com/download?id=1nzrtQpfaHL0OgJ_eXzA7VuEj7XotrSWO&export=download&authuser=0"
# Local path the archive is downloaded to.
OUTPUT = Path("/content/homework.zip")
# Directory whose sub-folders are listed as the activity classes.
CSV_DATA_PATH  = Path("/content/data")

In [None]:
if not OUTPUT.is_file():
  !wget -O $OUTPUT $URL

if OUTPUT.is_file() and not CSV_DATA_PATH.is_dir():
  !unzip -q -o $OUTPUT
  #!rm $OUTPUT

In [None]:
# Collect the entries found under the data directory. The position of each
# entry in this listing is later used as the numeric class id.
# NOTE(review): iterdir() yields entries in arbitrary order, so the ids depend
# on the file system; cached datasets must come from the same ordering.
if CSV_DATA_PATH.is_dir():
  class_path = [entry for entry in CSV_DATA_PATH.iterdir()]
  class_list = [folder.name for folder in class_path]
  print(class_list)

['idle', 'walking', 'running', 'stairs']


### PRELOAD SAVED DATASETs

In [None]:
def gen_saved_name(key):
  """Return the cache file path for the dataset variant `key`.

  The file lives directly under SAVED_DATAFRAME_BASE and is named
  `data-<key>.father` (the suffix is kept as-is so existing caches are found).
  """
  return SAVED_DATAFRAME_BASE / f"data-{key}.father"

In [None]:
# Per-variant containers, all keyed by the dataset key (e.g. "f0s1").
df_set = {}   # full DataFrame, features plus the trailing 'class' column
X_set = {}    # feature columns only (everything except the last column)
y_set = {}    # label column only (the last column)
reports = {}  # classification reports, filled in after model fitting

In [None]:
# Set to True to ignore cached feather files and rebuild the datasets from CSV.
skip_load = False

In [None]:
# Load every dataset variant that has a cached feather file and split it into
# features (all columns but the last) and labels (the last column). Variants
# without a usable cache get an empty placeholder frame so they are built later.
for key in DATASET_KEYS:
  filename = gen_saved_name(key)
  if skip_load or not filename.is_file():
    df_set[key] = pd.DataFrame()
    continue
  cached = pd.read_feather(filename)
  df_set[key] = cached
  X_set[key] = cached.iloc[:,:-1]
  y_set[key] = cached.iloc[:,-1]
  print(f"DATASET {key}. Loaded. shape: {cached.shape}")

DATASET f0s0. Loaded. shape: (193860, 4)
DATASET f0s1. Loaded. shape: (193860, 31)
DATASET f1s0. Loaded. shape: (6462, 91)
DATASET f1s1. Loaded. shape: (6462, 118)


## BUILD DataFrame

#### functions

In [None]:
def save_dataset(key,df):
  """Cache `df` as a feather file for variant `key`.

  Nothing is written when `df` is empty or when the cache file already exists.
  """
  if df.empty:
    return
  target = gen_saved_name(key)
  if target.is_file():
    return
  df.to_feather(target)

In [None]:
def flatten_frame(frame):
  """Collapse a whole frame into a single-row DataFrame.

  Values are laid out row by row, and each resulting column is named
  `<original column>_<row position>`.
  """
  names = []
  for row_pos in range(frame.shape[0]):
    for col_name in frame.columns:
      names.append(f"{col_name}_{row_pos}")
  single_row = frame.values.reshape(1, -1)
  return pd.DataFrame(single_row, columns=names)

In [None]:
def add_stat_feature_frame(frame, rows):
  """Build summary statistics for one frame, repeated on `rows` identical rows.

  For each of the first three columns of `frame`, these statistics are computed
  over the whole column: mean, max, min, interquartile range, index label of
  the minimum, mean absolute deviation, median, standard deviation and root
  mean square. Skewness is not computed.

  Args:
    frame: DataFrame whose first three columns hold the signals to summarise.
    rows: Number of rows in the result; every statistic is a single scalar
      that is repeated on each row.

  Returns:
    DataFrame with `rows` rows and nine columns per input column, named
    `<column>_<statistic>` in the order listed above.

  Raises:
    IndexError: If `frame` has fewer than three columns.
  """
  stats = {}
  for col_id in range(0, 3):
    col = frame.iloc[:, col_id]
    prefix = col.name
    centre = col.mean()
    stats[f'{prefix}_mean'] = centre
    stats[f'{prefix}_max'] = col.max()
    stats[f'{prefix}_min'] = col.min()
    stats[f'{prefix}_interquartile_range'] = col.quantile(0.75) - col.quantile(0.25)
    stats[f'{prefix}_index_of_minimum_value'] = col.idxmin()
    stats[f'{prefix}_mean_of_absolute_deviation'] = np.mean(np.abs(col - centre))
    stats[f'{prefix}_median'] = col.median()
    stats[f'{prefix}_standard_deviation'] = col.std()
    # Root mean square is sqrt(mean(x**2)): the values are squared BEFORE
    # averaging. Squaring the mean instead only yields |mean|, which duplicated
    # the `_mean` feature. Datasets cached with the old formula must be rebuilt
    # to pick up the corrected values.
    stats[f'{prefix}_root_mean_square_error'] = np.sqrt(np.mean(col ** 2))

  # One frame built in a single step; dict insertion order fixes column order.
  return pd.DataFrame({name: [value] * rows for name, value in stats.items()})

In [None]:
def prepare_dataset(class_path: list[Path], flatten = True, stat_feture = True, limit_frames = None):
  """Read every frame CSV of every class folder into one labelled DataFrame.

  Args:
    class_path: Class folders; the position of a folder in this list becomes
      its numeric class id.
    flatten: When truthy, each frame is collapsed to a single row with
      `flatten_frame`; otherwise the frame rows are kept as they are.
    stat_feture: When truthy, the columns produced by `add_stat_feature_frame`
      are appended to each frame.
    limit_frames: Optional positive number of frames to import per class.
      None, 0 or a negative value means no limit.

  Returns:
    DataFrame with all imported frames stacked row-wise, a fresh integer index
    and a trailing 'class' column holding the class id.

  Raises:
    ValueError: From `pd.concat` when no CSV frame was found at all.
  """
  dfws = []
  for class_id, work_class_path in enumerate(class_path):
    # Files are ordered by the integer after the last '-' in the file stem.
    list_files = sorted(work_class_path.glob('*.csv'), key=lambda path: int(path.stem.rsplit("-", 1)[1]))
    # Apply the limit up front so exactly `limit_frames` frames are imported.
    # The previous post-append `i > limit_frames` check let two extra through.
    if limit_frames and limit_frames > 0:
      list_files = list_files[:limit_frames]
    print(f"- Importing class '{work_class_path.name:7}' : {class_id}. Frames: {len(list_files)}")
    for filename in list_files:
      df_w: pd.DataFrame = pd.read_csv(filename)
      addon_features = [flatten_frame(df_w)] if flatten else [df_w]
      if stat_feture:
        # Statistics come from the raw frame but are repeated to match the row
        # count of the (possibly flattened) base features.
        addon_features.append(add_stat_feature_frame(df_w, addon_features[0].shape[0]))
      df_w = pd.concat(addon_features, axis=1)
      df_w['class'] = class_id
      dfws.append(df_w)
  df = pd.concat(dfws, axis=0, ignore_index=True)
  print(df.shape)
  return df

### DATASETs  (f0s0, f0s1, f1s0, f1s1)

In [None]:
# Optional cap on the number of frames imported per class (None = no cap).
limit_frames = None
# Build the four variants: f toggles flattening, s toggles statistical features.
for f in range(2):
  for s in range(2):
    key_set = f"f{f}s{s}"
    print(f"\nDATASET {key_set}. Where flatten={bool(f)}, stat_feture={bool(s)}")
    # A non-empty frame means the variant was already loaded from its cache.
    if df_set.get(key_set) is not None and (not df_set[key_set].empty):
      print("Alredy loaded, skipped")
      continue
    df_set[key_set] = prepare_dataset(class_path, flatten = f, stat_feture = s, limit_frames = limit_frames)
    save_dataset(key_set, df_set[key_set])
    # print(df_set[key_set].sample(4))
    # Features are all columns except the last; the last column is the label.
    X_set[key_set] = df_set[key_set].iloc[:,:-1]
    y_set[key_set] = df_set[key_set].iloc[:,-1]


DATASET f0s0. Where flatten=False, stat_feture=False
Alredy loaded, skipped

DATASET f0s1. Where flatten=False, stat_feture=True
Alredy loaded, skipped

DATASET f1s0. Where flatten=True, stat_feture=False
Alredy loaded, skipped

DATASET f1s1. Where flatten=True, stat_feture=True
Alredy loaded, skipped


## Незбалансованість класів

In [None]:
# Report, for the first dataset variant, how many rows each class has and what
# share of all rows that is.
key_set = list(df_set.keys())[0]
class_column = df_set[key_set]['class']
total_rows = df_set[key_set].shape[0]
for x in class_column.unique():
  count = class_column[class_column == x].count()
  prop = count / total_rows
  print(f"class: {x}, rows: {count:7}, {class_list[x]:7}, prop: {prop:.4}" )

class: 0, rows:   31170, idle   , prop: 0.1608
class: 1, rows:   55500, walking, prop: 0.2863
class: 2, rows:  102240, running, prop: 0.5274
class: 3, rows:    4950, stairs , prop: 0.02553


## COMPARE MODELS

In [None]:
# Model factories: each value is a zero-argument callable that returns a fresh,
# unfitted estimator, so every dataset is fitted on its own instance.
# NOTE(review): RandomForestClassifier is created without random_state, so its
# results are not reproducible between runs.
models = {"SVC": lambda: make_pipeline(StandardScaler(), SVC(gamma='auto')),
          "SVC_Linear": lambda: make_pipeline(StandardScaler(), SVC(gamma='auto', kernel="linear")),
          "RandomForestClassifier": lambda: RandomForestClassifier()}
# Re-running this cell resets the reports and forces every model to be refitted.
reports = {}
# Model name -> dataset keys on which that model is not trained.
skip_models = {
    "SVC_Linear": ["f0s0"]
}
# Dataset keys that are split as-is, without undersampling.
skip_resample = ["f0s0"]

In [None]:
def balance_log(y):
  """Print, for every distinct label in `y`, its row count, class name and share of all rows."""
  labels, label_counts = np.unique(y, return_counts=True)
  for position in range(len(labels)):
    x = labels[position]
    count = label_counts[position]
    prop = count / y.shape[0]
    print(f"class: {x}, rows: {count:7}, {class_list[x]:7}, prop: {prop:.4}" )

In [None]:
# Seed used for the train/test splits (the undersampler below uses its own 0).
SEED = 42
print("Models fit and prepare report")
# For every loaded dataset variant: optionally balance the classes, split 70/30
# with stratification, fit each model and store its classification report.
for key in X_set.keys():
  print("-"*80)
  print(f"DATASET {key}. shape: {df_set[key].shape}")
  X = X_set[key]
  y = y_set[key]
  balance_log(y)
  if key in skip_resample:
     print("SKIP RESAMPLE")
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED, stratify=y)
  else:
    # sm = SMOTE(random_state=0)
    # X_resampled, y_resampled = sm.fit_resample(X, y)
    # Undersampling happens BEFORE the split, so the test set is balanced too
    # and the reported metrics describe the resampled class distribution.
    rundersampler = RandomUnderSampler(random_state=0)
    X_resampled, y_resampled = rundersampler.fit_resample(X, y)
    print(f"{X.shape=}")
    print(f"{X_resampled.shape=}")
    print(f"{y_resampled.shape=}")
    balance_log(y_resampled)
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=SEED, stratify=y_resampled)
  # NOTE(review): in the non-flattened variants the rows of one frame are split
  # at random between train and test, and with stat features every row of a
  # frame carries identical statistics - this may leak frame identity into the
  # test set and inflate the scores; confirm whether a per-frame split is wanted.
  if reports.get(key) is None:
    reports[key] = {}
  for model, classification in models.items():
    print(f"\n- classification: {model}")
    # A stored report means this model was already fitted on this dataset.
    if reports[key].get(model):
      print("   alredy fit, skipped")
      continue
    if key in skip_models.get(model,[]):
      print("   skip this model")
      continue
    clf = classification()
    %time clf.fit(X_train, y_train)
    %time y_test_predict = clf.predict(X_test)
    reports[key][model] = classification_report(y_test, y_test_predict, digits=4, target_names=class_list)


Models fit and prepare report
--------------------------------------------------------------------------------
DATASET f0s0. shape: (193860, 4)
class: 0, rows:   31170, idle   , prop: 0.1608
class: 1, rows:   55500, walking, prop: 0.2863
class: 2, rows:  102240, running, prop: 0.5274
class: 3, rows:    4950, stairs , prop: 0.02553
SKIP RESAMPLE

- classification: SVC
CPU times: user 3min 10s, sys: 469 ms, total: 3min 11s
Wall time: 3min 20s
CPU times: user 1min 38s, sys: 106 ms, total: 1min 38s
Wall time: 1min 41s

- classification: SVC_Linear
   skip this model

- classification: RandomForestClassifier
CPU times: user 11 s, sys: 11 ms, total: 11 s
Wall time: 11 s
CPU times: user 778 ms, sys: 4 µs, total: 778 ms
Wall time: 775 ms
--------------------------------------------------------------------------------
DATASET f0s1. shape: (193860, 31)
class: 0, rows:   31170, idle   , prop: 0.1608
class: 1, rows:   55500, walking, prop: 0.2863
class: 2, rows:  102240, running, prop: 0.5274
clas

## Classification report

In [None]:
# Print every stored classification report, grouped by dataset and model.
print(f"{limit_frames=}")
for dset, per_model in reports.items():
  for model, report_text in per_model.items():
    print("-"*80)
    print(f"Data set: {dset}, shape: {df_set[dset].shape}, model: {model}")
    print(report_text)

limit_frames=None
--------------------------------------------------------------------------------
Data set: f0s0, shape: (193860, 4), model: SVC
              precision    recall  f1-score   support

        idle     0.9601    0.9839    0.9718      9351
     walking     0.8017    0.8979    0.8470     16650
     running     0.9279    0.9052    0.9164     30672
      stairs     1.0000    0.0054    0.0107      1485

    accuracy                         0.8928     58158
   macro avg     0.9224    0.6981    0.6865     58158
weighted avg     0.8988    0.8928    0.8823     58158

--------------------------------------------------------------------------------
Data set: f0s0, shape: (193860, 4), model: RandomForestClassifier
              precision    recall  f1-score   support

        idle     0.9997    1.0000    0.9998      9351
     walking     0.9995    0.9994    0.9994     16650
     running     0.9995    0.9999    0.9997     30672
      stairs     1.0000    0.9906    0.9953      1485



## RESULTS:

Four datasets were built from the source CSV files, which are stored in folders named after the corresponding activity. Each file holds 30 records of 3 accelerometer readings (X, Y and Z coordinates).


* Dataset f0s0: 4 columns (3 features plus the class label).
* Dataset f0s1: 31 columns (30 features plus the class label).
* Dataset f1s0: 91 columns (90 features plus the class label).
* Dataset f1s1: 118 columns (117 features plus the class label).

For the 'f0s0' dataset, the SVC-Linear model was skipped because training it takes too long, potentially more than one hour.

Additionally, the SVC model was augmented with StandardScaler(), resulting in a modest increase of 0.452% in accuracy.

The models used for analysis included SVC, SVC-Linear, and RandomForestClassifier.

* For dataset f0s0 (3 features), the RandomForestClassifier performs best.
* For dataset f0s1 (30 features), the RandomForestClassifier performs best.
* For dataset f1s0 (90 features), the RandomForestClassifier performs best.
* For dataset f1s1 (117 features), the RandomForestClassifier performs best.

The 'f0s1' dataset achieved a perfect accuracy of 1.0000, the highest among the datasets compared.

The 'f0s1' dataset is not flattened, but it does include the added statistical features.

The winner is dataset 'f0s1' (31 columns: 30 features plus the class label) with the RandomForestClassifier model.

## TEST на мінімумі даних акселерометра

In [None]:
# Take the first `rows` raw accelerometer records of the unflattened dataset:
# the first three columns are the features, the fourth is the class label.
rows = 10
df_one = df_set['f0s0'].iloc[:rows,:3]
df_one_class = df_set['f0s0'].iloc[:rows,3]
df_one_class

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: class, dtype: int64

In [None]:
# Compute the statistical features over the small sample, one copy per sample row.
df_one_stat = add_stat_feature_frame(df_one,df_one.shape[0])
df_one_stat

Unnamed: 0,accelerometer_X_mean,accelerometer_X_max,accelerometer_X_min,accelerometer_X_interquartile_range,accelerometer_X_index_of_minimum_value,accelerometer_X_mean_of_absolute_deviation,accelerometer_X_median,accelerometer_X_standard_deviation,accelerometer_X_root_mean_square_error,accelerometer_Y_mean,...,accelerometer_Y_root_mean_square_error,accelerometer_Z_mean,accelerometer_Z_max,accelerometer_Z_min,accelerometer_Z_interquartile_range,accelerometer_Z_index_of_minimum_value,accelerometer_Z_mean_of_absolute_deviation,accelerometer_Z_median,accelerometer_Z_standard_deviation,accelerometer_Z_root_mean_square_error
0,0.720176,5.09965,-0.909797,1.045069,2,1.140406,-0.059855,1.710087,0.720176,0.778594,...,0.778594,9.277053,9.787497,8.418014,1.039083,3,0.497994,9.507375,0.593041,9.277053
1,0.720176,5.09965,-0.909797,1.045069,2,1.140406,-0.059855,1.710087,0.720176,0.778594,...,0.778594,9.277053,9.787497,8.418014,1.039083,3,0.497994,9.507375,0.593041,9.277053
2,0.720176,5.09965,-0.909797,1.045069,2,1.140406,-0.059855,1.710087,0.720176,0.778594,...,0.778594,9.277053,9.787497,8.418014,1.039083,3,0.497994,9.507375,0.593041,9.277053
3,0.720176,5.09965,-0.909797,1.045069,2,1.140406,-0.059855,1.710087,0.720176,0.778594,...,0.778594,9.277053,9.787497,8.418014,1.039083,3,0.497994,9.507375,0.593041,9.277053
4,0.720176,5.09965,-0.909797,1.045069,2,1.140406,-0.059855,1.710087,0.720176,0.778594,...,0.778594,9.277053,9.787497,8.418014,1.039083,3,0.497994,9.507375,0.593041,9.277053
5,0.720176,5.09965,-0.909797,1.045069,2,1.140406,-0.059855,1.710087,0.720176,0.778594,...,0.778594,9.277053,9.787497,8.418014,1.039083,3,0.497994,9.507375,0.593041,9.277053
6,0.720176,5.09965,-0.909797,1.045069,2,1.140406,-0.059855,1.710087,0.720176,0.778594,...,0.778594,9.277053,9.787497,8.418014,1.039083,3,0.497994,9.507375,0.593041,9.277053
7,0.720176,5.09965,-0.909797,1.045069,2,1.140406,-0.059855,1.710087,0.720176,0.778594,...,0.778594,9.277053,9.787497,8.418014,1.039083,3,0.497994,9.507375,0.593041,9.277053
8,0.720176,5.09965,-0.909797,1.045069,2,1.140406,-0.059855,1.710087,0.720176,0.778594,...,0.778594,9.277053,9.787497,8.418014,1.039083,3,0.497994,9.507375,0.593041,9.277053
9,0.720176,5.09965,-0.909797,1.045069,2,1.140406,-0.059855,1.710087,0.720176,0.778594,...,0.778594,9.277053,9.787497,8.418014,1.039083,3,0.497994,9.507375,0.593041,9.277053


In [None]:
# Join the raw readings and their statistics column-wise, replacing any gaps
# with 0, then show the first row that will be fed to the classifier.
X_test_one = pd.concat([df_one, df_one_stat], axis=1).fillna(0)
X_test_one.iloc[:1]

Unnamed: 0,accelerometer_X,accelerometer_Y,accelerometer_Z,accelerometer_X_mean,accelerometer_X_max,accelerometer_X_min,accelerometer_X_interquartile_range,accelerometer_X_index_of_minimum_value,accelerometer_X_mean_of_absolute_deviation,accelerometer_X_median,...,accelerometer_Y_root_mean_square_error,accelerometer_Z_mean,accelerometer_Z_max,accelerometer_Z_min,accelerometer_Z_interquartile_range,accelerometer_Z_index_of_minimum_value,accelerometer_Z_mean_of_absolute_deviation,accelerometer_Z_median,accelerometer_Z_standard_deviation,accelerometer_Z_root_mean_square_error
0,1.000776,4.616021,8.576031,0.720176,5.09965,-0.909797,1.045069,2,1.140406,-0.059855,...,0.778594,9.277053,9.787497,8.418014,1.039083,3,0.497994,9.507375,0.593041,9.277053


In [None]:
# Train a random forest on the unflattened dataset with statistical features,
# using the same stratified 70/30 split parameters as the model comparison.
# NOTE(review): no random_state is passed to the forest, so the fitted model
# differs between runs.
key = 'f0s1'
X = X_set[key]
y = y_set[key]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED, stratify=y)
# clf = SVC(kernel="linear")
clf = RandomForestClassifier()
%time clf.fit(X_train, y_train)

CPU times: user 13.8 s, sys: 43.8 ms, total: 13.8 s
Wall time: 16.2 s


In [None]:
# Predict the class of the first row of the small sample only.
y_test_predict_one = clf.predict(X_test_one[:1])
y_test_predict_one

array([0])

In [None]:
# Compare the prediction with as many true labels as there are predictions.
print(classification_report(df_one_class[:y_test_predict_one.shape[0]], y_test_predict_one, digits=4))

              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000         1

    accuracy                         1.0000         1
   macro avg     1.0000    1.0000    1.0000         1
weighted avg     1.0000    1.0000    1.0000         1



Тести показали, що для формування статистики потрібно щонайменше 10 записів акселерометра,
 щоб прогноз сходився.