<a href="https://colab.research.google.com/github/lexxai/goit_python_ds_hw_05/blob/main/goit_ds_hw_05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## LOAD DATASET

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, accuracy_score


In [2]:
from pathlib import Path

DATASET_KEYS = ("f0s0", "f0s1", "f1s0", "f1s1")
SAVED_DATAFRAME_BASE = Path("/content/")
URL = "https://drive.usercontent.google.com/download?id=1nzrtQpfaHL0OgJ_eXzA7VuEj7XotrSWO&export=download&authuser=0"
OUTPUT = Path("/content/homework.zip")
CSV_DATA_PATH  = Path("/content/data")

In [3]:
if not OUTPUT.is_file():
  !wget -O $OUTPUT $URL

if OUTPUT.is_file() and not CSV_DATA_PATH.is_dir():
  !unzip -q -o $OUTPUT
  #!rm $OUTPUT

--2024-01-31 18:02:48--  https://drive.usercontent.google.com/download?id=1nzrtQpfaHL0OgJ_eXzA7VuEj7XotrSWO
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 173.194.216.132, 2607:f8b0:400c:c0c::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|173.194.216.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4211746 (4.0M) [application/octet-stream]
Saving to: ‘/content/homework.zip’


2024-01-31 18:03:18 (31.5 MB/s) - ‘/content/homework.zip’ saved [4211746/4211746]



In [4]:
if CSV_DATA_PATH.is_dir():
  class_path = list(CSV_DATA_PATH.iterdir())
  class_list = list(d.name for d in class_path)
  print(class_list)

['walking', 'stairs', 'idle', 'running']


### PRELOAD SAVED DATASETs

In [5]:
def gen_saved_name(key):
  return SAVED_DATAFRAME_BASE.joinpath(f"data-{key}.father")

In [6]:
df_set = {}
X_set = {}
y_set = {}
reports = {}

In [7]:
skip_load = False

In [8]:
for key in DATASET_KEYS:
  filename = gen_saved_name(key)
  if not skip_load and filename.is_file():
    df_set[key] = pd.read_feather(filename)
    X_set[key] = df_set[key].iloc[:,:-1]
    y_set[key] = df_set[key].iloc[:,-1]
    print(f"DATASET {key}. Loaded. shape: {df_set[key].shape}")
  else:
    df_set[key] = pd.DataFrame()

## BUILD DataFrame

#### functions

In [9]:
def save_dataset(key,df):
  filename = gen_saved_name(key)
  if not df.empty and not filename.is_file():
    df.to_feather(filename)

In [10]:
def flatten_frame(frame):
        columns=[f"{col}_{i}" for i in range(frame.shape[0]) for col in frame.columns]
        return pd.DataFrame(frame.values.reshape(1, -1), columns=columns)

In [11]:
def add_stat_feature_frame(frame, rows):
  features = []
  for col_id in range(0,3):
    col = frame.iloc[:,col_id]
    features.append(pd.DataFrame([col.mean()] * rows,columns=[f'{col.name}_mean']))
    features.append(pd.DataFrame([col.max()] * rows,columns=[f'{col.name}_max']))
    features.append(pd.DataFrame([col.min()] * rows,columns=[f'{col.name}_min']))
    features.append(pd.DataFrame([col.quantile(0.75)-col.quantile(0.25)] * rows,columns=[f'{col.name}_interquartile_range']))
    features.append(pd.DataFrame([col.idxmin()] * rows,columns=[f'{col.name}_index_of_minimum_value']))
    features.append(pd.DataFrame([np.mean(np.abs(col - col.mean()))] * rows,columns=[f'{col.name}_mean_of_absolute_deviation']))
    features.append(pd.DataFrame([col.median()] * rows,columns=[f'{col.name}_median']))
    # features.append(pd.DataFrame([df.skew(axis=0).iloc[:,col_id]] * rows,columns=[f'{col.name}_skewness']))
    features.append(pd.DataFrame([col.std()] * rows,columns=[f'{col.name}_standard_deviation']))
    features.append(pd.DataFrame([np.sqrt(np.mean(col)**2)] * rows,columns=[f'{col.name}_root_mean_square_error']))

  result = pd.concat(features, axis=1)
  # rint(f"New stats features: {result.columns}")
  return result

In [12]:
def prepare_dataset(class_path: list[Path], flatten = True, stat_feture = True, limit_frames = None):
  dfws = []
  for class_id, work_class_path in enumerate(class_path):
    list_files = list(sorted(work_class_path.glob('*.csv'), key=lambda path: int(path.stem.rsplit("-", 1)[1])))
    print(f"Importing class '{work_class_path.name:7}' : {class_id}. Frames: {len(list_files)}")
    for i, filename in enumerate(list_files):
      # READ FRAME CSV FILE
      df_w: pd.DataFrame = pd.read_csv(filename)
      addon_features = [df_w]
      if flatten:
        addon_features = [flatten_frame(df_w)]
      if stat_feture:
        addon_features.append(add_stat_feature_frame(df_w,addon_features[0].shape[0]))
      df_w = pd.concat(addon_features, axis=1)
      df_w['class'] = class_id
      dfws.append(df_w)
      if limit_frames and (i > limit_frames):
        break
  df = pd.concat(dfws, axis=0, ignore_index=True)
  # print(df.info())
  print(df.shape)
  return df

### DATASETs  (f0s0, f0s1, f1s0, f1s1)

In [13]:
limit_frames = None
for f in range(2):
  for s in range(2):
    key_set = f"f{f}s{s}"
    print(f"\nDATASET {key_set}. Where flatten={bool(f)}, stat_feture={bool(s)}")
    if df_set.get(key_set) is not None and (not df_set[key_set].empty):
      print("Alredy loaded, skipped")
      continue
    df_set[key_set] = prepare_dataset(class_path, flatten = f, stat_feture = s, limit_frames = limit_frames)
    save_dataset(key_set, df_set[key_set])
    # print(df_set[key_set].sample(4))
    X_set[key_set] = df_set[key_set].iloc[:,:-1]
    y_set[key_set] = df_set[key_set].iloc[:,-1]


DATASET f0s0. Where flatten=False, stat_feture=False
Importing class 'walking' : 0. Frames: 1850
Importing class 'stairs ' : 1. Frames: 165
Importing class 'idle   ' : 2. Frames: 1039
Importing class 'running' : 3. Frames: 3408
(193860, 4)

DATASET f0s1. Where flatten=False, stat_feture=True
Importing class 'walking' : 0. Frames: 1850
Importing class 'stairs ' : 1. Frames: 165
Importing class 'idle   ' : 2. Frames: 1039
Importing class 'running' : 3. Frames: 3408
(193860, 31)

DATASET f1s0. Where flatten=True, stat_feture=False
Importing class 'walking' : 0. Frames: 1850
Importing class 'stairs ' : 1. Frames: 165
Importing class 'idle   ' : 2. Frames: 1039
Importing class 'running' : 3. Frames: 3408
(6462, 91)

DATASET f1s1. Where flatten=True, stat_feture=True
Importing class 'walking' : 0. Frames: 1850
Importing class 'stairs ' : 1. Frames: 165
Importing class 'idle   ' : 2. Frames: 1039
Importing class 'running' : 3. Frames: 3408
(6462, 118)


In [14]:
key_set = list(df_set.keys())[0]
for x in df_set[key_set]['class'].unique():
  count = df_set[key_set]['class'][df_set[key_set]['class'] == x].count()
  prop = count / df_set[key_set].shape[0]
  print(f"class: {x}, rows: {count:7}, {class_list[x]:7}, prop: {prop:.4}" )

class: 0, rows:   55500, walking, prop: 0.2863
class: 1, rows:    4950, stairs , prop: 0.02553
class: 2, rows:   31170, idle   , prop: 0.1608
class: 3, rows:  102240, running, prop: 0.5274


## COMPARE MODELS

In [15]:
models = {"SVC": lambda: SVC(),
          "SVC_Linear": lambda: SVC(kernel="linear"),
          "RandomForestClassifier": lambda: RandomForestClassifier()}
reports = {}
skip_models = {
    "SVC_Linear": ["f0s0"]
}

In [16]:
SEED = 42
print("Models fit and prepare report")
for key in X_set.keys():
  print("-"*80)
  print(f"DATASET {key}. shape: {df_set[key].shape}")
  X = X_set[key]
  y = y_set[key]
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED, stratify=y)
  if reports.get(key) is None:
    reports[key] = {}
  for model, classification in models.items():
    print(f"\n- classification: {model}")
    if reports[key].get(model):
      print("   alredy fit, skipped")
      continue
    if key in skip_models.get(model,[]):
      print("   skip this model")
      continue
    clf = classification()
    %time clf.fit(X_train, y_train)
    %time y_test_predict = clf.predict(X_test)
    reports[key][model] = classification_report(y_test, y_test_predict, digits=4, target_names=class_list)


Models fit and prepare report
--------------------------------------------------------------------------------
DATASET f0s0. shape: (193860, 4)

- classification: SVC
CPU times: user 3min 57s, sys: 461 ms, total: 3min 58s
Wall time: 3min 58s
CPU times: user 1min 49s, sys: 99.9 ms, total: 1min 49s
Wall time: 1min 50s

- classification: SVC_Linear
   skip this model

- classification: RandomForestClassifier
CPU times: user 13.9 s, sys: 19 ms, total: 13.9 s
Wall time: 13.9 s
CPU times: user 963 ms, sys: 2.01 ms, total: 965 ms
Wall time: 963 ms
--------------------------------------------------------------------------------
DATASET f0s1. shape: (193860, 31)

- classification: SVC
CPU times: user 22.5 s, sys: 115 ms, total: 22.6 s
Wall time: 22.7 s
CPU times: user 17.5 s, sys: 18 ms, total: 17.5 s
Wall time: 17.6 s

- classification: SVC_Linear
CPU times: user 2min 2s, sys: 234 ms, total: 2min 2s
Wall time: 2min 2s
CPU times: user 7.26 s, sys: 6.98 ms, total: 7.26 s
Wall time: 7.3 s

- clas

## Classification report

In [17]:
print(f"{limit_frames=}")
for dset in reports.keys():
  for model in reports[dset].keys():
    print("-"*80)
    print(f"Data set: {dset}, shape: {df_set[dset].shape}, model: {model}")
    print(reports[dset][model])

limit_frames=None
--------------------------------------------------------------------------------
Data set: f0s0, shape: (193860, 4), model: SVC
              precision    recall  f1-score   support

     walking     0.8010    0.9013    0.8482     16650
      stairs     1.0000    0.0034    0.0067      1485
        idle     0.9585    0.9863    0.9722      9351
     running     0.9301    0.9035    0.9166     30672

    accuracy                         0.8932     58158
   macro avg     0.9224    0.6986    0.6859     58158
weighted avg     0.8995    0.8932    0.8827     58158

--------------------------------------------------------------------------------
Data set: f0s0, shape: (193860, 4), model: RandomForestClassifier
              precision    recall  f1-score   support

     walking     0.9994    0.9998    0.9996     16650
      stairs     1.0000    0.9865    0.9932      1485
        idle     1.0000    0.9997    0.9998      9351
     running     0.9994    1.0000    0.9997     30672



## RESULTS:

Different datasets were created from CSV source files downloaded from various folders, each named according to the related activity.

* Dataset f0s0: Contains data with 4 features.
* Dataset f0s1: Contains data with 31 features.
* Dataset f1s0: Contains data with 91 features.
* Dataset f1s1: Contains data with 118 features.

In the 'f0s0' dataset, the SVC-Linear model was skipped during log-time operation, potentially exceeding one hour.

The models used for analysis included SVC, SVC-Linear, and RandomForestClassifier.

* For the dataset with 4 features (f0s0), the RandomForestClassifier performs better.
* For the dataset with 31 features (f0s1), the RandomForestClassifier performs better.
* For the dataset with 91 features (f1s0), the RandomForestClassifier performs better.
* For the dataset with 118 features (f1s1), the RandomForestClassifier performs better.

This dataset 'f0s1' achieved a perfect accuracy of 1.0000, making it the highest among the provided datasets.

The dataset 'f0s1' has not been flattened and statistical features have been added.

Winner is datast 'f0s1' with with 31 features when using the RandomForestClassifier model.

## TEST на minimuм данних аккселерометра

In [18]:
rows = 10
df_one = df_set['f0s0'].iloc[:rows,:3]
df_one_class = df_set['f0s0'].iloc[:rows,3]
df_one_class

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: class, dtype: int64

In [19]:
df_one_stat = add_stat_feature_frame(df_one,df_one.shape[0])
df_one_stat

Unnamed: 0,accelerometer_X_mean,accelerometer_X_max,accelerometer_X_min,accelerometer_X_interquartile_range,accelerometer_X_index_of_minimum_value,accelerometer_X_mean_of_absolute_deviation,accelerometer_X_median,accelerometer_X_standard_deviation,accelerometer_X_root_mean_square_error,accelerometer_Y_mean,...,accelerometer_Y_root_mean_square_error,accelerometer_Z_mean,accelerometer_Z_max,accelerometer_Z_min,accelerometer_Z_interquartile_range,accelerometer_Z_index_of_minimum_value,accelerometer_Z_mean_of_absolute_deviation,accelerometer_Z_median,accelerometer_Z_standard_deviation,accelerometer_Z_root_mean_square_error
0,6.501215,11.219229,1.781286,6.412869,1,3.112941,6.80911,3.655132,6.501215,-7.608294,...,7.608294,1.600285,11.827356,-8.087613,4.721366,5,3.572532,1.261744,5.243697,1.600285
1,6.501215,11.219229,1.781286,6.412869,1,3.112941,6.80911,3.655132,6.501215,-7.608294,...,7.608294,1.600285,11.827356,-8.087613,4.721366,5,3.572532,1.261744,5.243697,1.600285
2,6.501215,11.219229,1.781286,6.412869,1,3.112941,6.80911,3.655132,6.501215,-7.608294,...,7.608294,1.600285,11.827356,-8.087613,4.721366,5,3.572532,1.261744,5.243697,1.600285
3,6.501215,11.219229,1.781286,6.412869,1,3.112941,6.80911,3.655132,6.501215,-7.608294,...,7.608294,1.600285,11.827356,-8.087613,4.721366,5,3.572532,1.261744,5.243697,1.600285
4,6.501215,11.219229,1.781286,6.412869,1,3.112941,6.80911,3.655132,6.501215,-7.608294,...,7.608294,1.600285,11.827356,-8.087613,4.721366,5,3.572532,1.261744,5.243697,1.600285
5,6.501215,11.219229,1.781286,6.412869,1,3.112941,6.80911,3.655132,6.501215,-7.608294,...,7.608294,1.600285,11.827356,-8.087613,4.721366,5,3.572532,1.261744,5.243697,1.600285
6,6.501215,11.219229,1.781286,6.412869,1,3.112941,6.80911,3.655132,6.501215,-7.608294,...,7.608294,1.600285,11.827356,-8.087613,4.721366,5,3.572532,1.261744,5.243697,1.600285
7,6.501215,11.219229,1.781286,6.412869,1,3.112941,6.80911,3.655132,6.501215,-7.608294,...,7.608294,1.600285,11.827356,-8.087613,4.721366,5,3.572532,1.261744,5.243697,1.600285
8,6.501215,11.219229,1.781286,6.412869,1,3.112941,6.80911,3.655132,6.501215,-7.608294,...,7.608294,1.600285,11.827356,-8.087613,4.721366,5,3.572532,1.261744,5.243697,1.600285
9,6.501215,11.219229,1.781286,6.412869,1,3.112941,6.80911,3.655132,6.501215,-7.608294,...,7.608294,1.600285,11.827356,-8.087613,4.721366,5,3.572532,1.261744,5.243697,1.600285


In [20]:
X_test_one = pd.concat([df_one, df_one_stat], axis=1)
X_test_one.fillna(0, inplace=True)
X_test_one[:1]

Unnamed: 0,accelerometer_X,accelerometer_Y,accelerometer_Z,accelerometer_X_mean,accelerometer_X_max,accelerometer_X_min,accelerometer_X_interquartile_range,accelerometer_X_index_of_minimum_value,accelerometer_X_mean_of_absolute_deviation,accelerometer_X_median,...,accelerometer_Y_root_mean_square_error,accelerometer_Z_mean,accelerometer_Z_max,accelerometer_Z_min,accelerometer_Z_interquartile_range,accelerometer_Z_index_of_minimum_value,accelerometer_Z_mean_of_absolute_deviation,accelerometer_Z_median,accelerometer_Z_standard_deviation,accelerometer_Z_root_mean_square_error
0,2.322376,3.643975,11.827356,6.501215,11.219229,1.781286,6.412869,1,3.112941,6.80911,...,7.608294,1.600285,11.827356,-8.087613,4.721366,5,3.572532,1.261744,5.243697,1.600285


In [21]:
key = 'f0s1'
X = X_set[key]
y = y_set[key]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED, stratify=y)
# clf = SVC(kernel="linear")
clf = RandomForestClassifier()
%time clf.fit(X_train, y_train)

CPU times: user 15.7 s, sys: 25 ms, total: 15.7 s
Wall time: 15.7 s


In [22]:
y_test_predict_one = clf.predict(X_test_one[:1])
y_test_predict_one

array([0])

In [23]:
print(classification_report(df_one_class[:y_test_predict_one.shape[0]], y_test_predict_one, digits=4))

              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000         1

    accuracy                         1.0000         1
   macro avg     1.0000    1.0000    1.0000         1
weighted avg     1.0000    1.0000    1.0000         1



тести показали мінімум 10 записів акселерометра треба для формування (статистики),
 щоб прогноз сходився