<a href="https://colab.research.google.com/github/lexxai/goit_python_ds_hw_05/blob/main/goit_ds_hw_05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## LOAD DATASET

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, accuracy_score


In [2]:
from pathlib import Path

DATASET_KEYS = ("f0s0", "f0s1", "f1s0", "f1s1")
SAVED_DATAFRAME_BASE = Path("/content/")
URL = "https://drive.usercontent.google.com/download?id=1nzrtQpfaHL0OgJ_eXzA7VuEj7XotrSWO&export=download&authuser=0"
OUTPUT = Path("/content/homework.zip")
CSV_DATA_PATH  = Path("/content/data")

In [3]:
if not OUTPUT.is_file():
  !wget -O $OUTPUT $URL

if OUTPUT.is_file() and not CSV_DATA_PATH.is_dir():
  !unzip -q -o $OUTPUT
  #!rm $OUTPUT

In [4]:
if CSV_DATA_PATH.is_dir():
  class_path = list(CSV_DATA_PATH.iterdir())
  class_list = list(d.name for d in class_path)
  print(class_list)

['running', 'idle', 'walking', 'stairs']


### PRELOAD SAVED DATASETs

In [5]:
def gen_saved_name(key):
  return SAVED_DATAFRAME_BASE.joinpath(f"data-{key}.father")

In [24]:
df_set = {}
X_set = {}
y_set = {}
reports = {}

In [25]:
skip_load = True

In [26]:
for key in DATASET_KEYS:
  filename = gen_saved_name(key)
  if not skip_load and filename.is_file():
    df_set[key] = pd.read_feather(filename)
    X_set[key] = df_set[key].iloc[:,:-1]
    y_set[key] = df_set[key].iloc[:,-1]
    print(f"DATASET {key}. Loaded. shape: {df_set[key].shape}")
  else:
    df_set[key] = pd.DataFrame()

## BUILD DataFrame

#### functions

In [8]:
def save_dataset(key,df):
  filename = gen_saved_name(key)
  if not df.empty and not filename.is_file():
    df.to_feather(filename)

In [9]:
def flatten_frame(frame):
        columns=[f"{col}_{i}" for i in range(frame.shape[0]) for col in frame.columns]
        return pd.DataFrame(frame.values.reshape(1, -1), columns=columns)

In [10]:
def add_stat_feature_frame(frame, rows):
  features = []
  for col_id in range(0,3):
    col = frame.iloc[:,col_id]
    features.append(pd.DataFrame([col.mean()] * rows,columns=[f'{col.name}_mean']))
    features.append(pd.DataFrame([col.max()] * rows,columns=[f'{col.name}_max']))
    features.append(pd.DataFrame([col.min()] * rows,columns=[f'{col.name}_min']))
    features.append(pd.DataFrame([col.quantile(0.75)-col.quantile(0.25)] * rows,columns=[f'{col.name}_interquartile_range']))
    features.append(pd.DataFrame([col.idxmin()] * rows,columns=[f'{col.name}_index_of_minimum_value']))
    features.append(pd.DataFrame([np.mean(np.abs(col - col.mean()))] * rows,columns=[f'{col.name}_mean_of_absolute_deviation']))
    features.append(pd.DataFrame([col.median()] * rows,columns=[f'{col.name}_median']))
    # features.append(pd.DataFrame([df.skew(axis=0).iloc[:,col_id]] * rows,columns=[f'{col.name}_skewness']))
    features.append(pd.DataFrame([col.std()] * rows,columns=[f'{col.name}_standard_deviation']))
    features.append(pd.DataFrame([np.sqrt(np.mean(col)**2)] * rows,columns=[f'{col.name}_root_mean_square_error']))

  result = pd.concat(features, axis=1)
  # rint(f"New stats features: {result.columns}")
  return result

In [11]:
def prepare_dataset(class_path: list[Path], flatten = True, stat_feture = True, limit_frames = None):
  dfws = []
  for class_id, work_class_path in enumerate(class_path):
    list_files = list(sorted(work_class_path.glob('*.csv'), key=lambda path: int(path.stem.rsplit("-", 1)[1])))
    print(f"Importing class '{work_class_path.name:7}' : {class_id}. Frames: {len(list_files)}")
    for i, filename in enumerate(list_files):
      # READ FRAME CSV FILE
      df_w: pd.DataFrame = pd.read_csv(filename)
      addon_features = [df_w]
      if flatten:
        addon_features = [flatten_frame(df_w)]
      if stat_feture:
        addon_features.append(add_stat_feature_frame(df_w,addon_features[0].shape[0]))
      df_w = pd.concat(addon_features, axis=1)
      df_w['class'] = class_id
      dfws.append(df_w)
      if limit_frames and (i > limit_frames):
        break
  df = pd.concat(dfws, axis=0, ignore_index=True)
  # print(df.info())
  print(df.shape)
  return df

### DATASETs  (f0s0, f0s1, f1s0, f1s1)

In [27]:
limit_frames = 300
for f in range(2):
  for s in range(2):
    key_set = f"f{f}s{s}"
    print(f"\nDATASET {key_set}. Where flatten={bool(f)}, stat_feture={bool(s)}")
    if df_set.get(key_set) is not None and (not df_set[key_set].empty):
      print("Alredy loaded, skipped")
      continue
    df_set[key_set] = prepare_dataset(class_path, flatten = f, stat_feture = s, limit_frames = limit_frames)
    save_dataset(key_set, df_set[key_set])
    print(df_set[key_set].sample(4))
    X_set[key_set] = df_set[key_set].iloc[:,:-1]
    y_set[key_set] = df_set[key_set].iloc[:,-1]


DATASET f0s0. Where flatten=False, stat_feture=False
Importing class 'running' : 0. Frames: 3408
Importing class 'idle   ' : 1. Frames: 1039
Importing class 'walking' : 2. Frames: 1850
Importing class 'stairs ' : 3. Frames: 165
(32130, 4)
       accelerometer_X  accelerometer_Y  accelerometer_Z  class
30999        -2.240973        -4.735731        -0.804452      3
6986          2.537854       -20.513520        -4.338293      0
28557         3.466804        -5.406107        -2.633622      3
5458          8.963891        17.496826         2.968810      0

DATASET f0s1. Where flatten=False, stat_feture=True
Importing class 'running' : 0. Frames: 3408
Importing class 'idle   ' : 1. Frames: 1039
Importing class 'walking' : 2. Frames: 1850
Importing class 'stairs ' : 3. Frames: 165
(32130, 31)
       accelerometer_X  accelerometer_Y  accelerometer_Z  \
29995         0.991200       -10.903194        -8.183381   
14280         0.742203         6.943185         6.770803   
30716         2.9305

In [28]:
key_set = list(df_set.keys())[0]
for x in df_set[key_set]['class'].unique():
  count = df_set[key_set]['class'][df_set[key_set]['class'] == x].count()
  prop = count / df_set[key_set].shape[0]
  print(f"class: {x}, rows: {count:7}, {class_list[x]:7}, prop: {prop:.4}" )

class: 0, rows:    9060, running, prop: 0.282
class: 1, rows:    9060, idle   , prop: 0.282
class: 2, rows:    9060, walking, prop: 0.282
class: 3, rows:    4950, stairs , prop: 0.1541


## COMPARE MODELS

In [29]:
models = {"SVC": lambda: SVC(),
          "SVC_Linear": lambda: SVC(kernel="linear"),
          "RandomForestClassifier": lambda: RandomForestClassifier()}
reports = {}

In [30]:
SEED = 42
print("Models fit and prepare report")
for key in X_set.keys():
  print("-"*80)
  print(f"DATASET {key}. shape: {df_set[key].shape}")
  X = X_set[key]
  y = y_set[key]
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED, stratify=y)
  if reports.get(key) is None:
    reports[key] = {}
  for model, classification in models.items():
    print(f"\n- classification: {model}")
    if reports[key].get(model):
      print("   alredy fit, skipped")
      continue
    clf = classification()
    %time clf.fit(X_train, y_train)
    %time y_test_predict = clf.predict(X_test)
    reports[key][model] = classification_report(y_test, y_test_predict, digits=4, target_names=class_list)


Models fit and prepare report
--------------------------------------------------------------------------------
DATASET f0s0. shape: (32130, 4)

- classification: SVC
CPU times: user 7.19 s, sys: 430 ms, total: 7.62 s
Wall time: 7.87 s
CPU times: user 4.03 s, sys: 1.98 ms, total: 4.04 s
Wall time: 4.03 s

- classification: SVC_Linear
CPU times: user 1min 22s, sys: 723 ms, total: 1min 23s
Wall time: 1min 23s
CPU times: user 2.64 s, sys: 2.97 ms, total: 2.64 s
Wall time: 2.64 s

- classification: RandomForestClassifier
CPU times: user 1.52 s, sys: 3.99 ms, total: 1.52 s
Wall time: 1.52 s
CPU times: user 147 ms, sys: 1.02 ms, total: 148 ms
Wall time: 148 ms
--------------------------------------------------------------------------------
DATASET f0s1. shape: (32130, 31)

- classification: SVC
CPU times: user 1.08 s, sys: 26 ms, total: 1.11 s
Wall time: 1.11 s
CPU times: user 914 ms, sys: 1.97 ms, total: 916 ms
Wall time: 918 ms

- classification: SVC_Linear
CPU times: user 5.38 s, sys: 9.64

## Classification report

In [32]:
print(f"{limit_frames=}")
for dset in reports.keys():
  for model in reports[dset].keys():
    print("-"*80)
    print(f"Data set: {dset}, shape: {df_set[dset].shape}, model: {model}")
    print(reports[dset][model])

limit_frames=300
--------------------------------------------------------------------------------
Data set: f0s0, shape: (32130, 4), model: SVC
              precision    recall  f1-score   support

     running     0.9030    0.8289    0.8644      2718
        idle     0.9688    0.9952    0.9819      2718
     walking     0.7509    0.9007    0.8190      2718
      stairs     0.6016    0.4424    0.5099      1485

    accuracy                         0.8365      9639
   macro avg     0.8061    0.7918    0.7938      9639
weighted avg     0.8323    0.8365    0.8301      9639

--------------------------------------------------------------------------------
Data set: f0s0, shape: (32130, 4), model: SVC_Linear
              precision    recall  f1-score   support

     running     0.8935    0.6825    0.7739      2718
        idle     0.9307    0.9787    0.9541      2718
     walking     0.7012    0.8558    0.7708      2718
      stairs     0.4971    0.4646    0.4803      1485

    accuracy   

## TEST на minimuм данних аккселерометра

In [125]:
rows = 10
df_one = df_set['f0s0'].iloc[:rows,:3]
df_one_class = df_set['f0s0'].iloc[:rows,3]
df_one_class

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: class, dtype: int64

In [116]:
df_one_stat = add_stat_feature_frame(df_one,df_one.shape[0])
df_one_stat

Unnamed: 0,accelerometer_X_mean,accelerometer_X_max,accelerometer_X_min,accelerometer_X_interquartile_range,accelerometer_X_index_of_minimum_value,accelerometer_X_mean_of_absolute_deviation,accelerometer_X_median,accelerometer_X_standard_deviation,accelerometer_X_root_mean_square_error,accelerometer_Y_mean,...,accelerometer_Y_root_mean_square_error,accelerometer_Z_mean,accelerometer_Z_max,accelerometer_Z_min,accelerometer_Z_interquartile_range,accelerometer_Z_index_of_minimum_value,accelerometer_Z_mean_of_absolute_deviation,accelerometer_Z_median,accelerometer_Z_standard_deviation,accelerometer_Z_root_mean_square_error
0,3.08038,18.780119,-4.941632,5.075708,6,4.491331,1.900996,6.652656,3.08038,7.096414,...,7.096414,8.330865,25.775976,-2.197877,4.757279,7,5.151939,7.220913,7.648074,8.330865
1,3.08038,18.780119,-4.941632,5.075708,6,4.491331,1.900996,6.652656,3.08038,7.096414,...,7.096414,8.330865,25.775976,-2.197877,4.757279,7,5.151939,7.220913,7.648074,8.330865
2,3.08038,18.780119,-4.941632,5.075708,6,4.491331,1.900996,6.652656,3.08038,7.096414,...,7.096414,8.330865,25.775976,-2.197877,4.757279,7,5.151939,7.220913,7.648074,8.330865
3,3.08038,18.780119,-4.941632,5.075708,6,4.491331,1.900996,6.652656,3.08038,7.096414,...,7.096414,8.330865,25.775976,-2.197877,4.757279,7,5.151939,7.220913,7.648074,8.330865
4,3.08038,18.780119,-4.941632,5.075708,6,4.491331,1.900996,6.652656,3.08038,7.096414,...,7.096414,8.330865,25.775976,-2.197877,4.757279,7,5.151939,7.220913,7.648074,8.330865
5,3.08038,18.780119,-4.941632,5.075708,6,4.491331,1.900996,6.652656,3.08038,7.096414,...,7.096414,8.330865,25.775976,-2.197877,4.757279,7,5.151939,7.220913,7.648074,8.330865
6,3.08038,18.780119,-4.941632,5.075708,6,4.491331,1.900996,6.652656,3.08038,7.096414,...,7.096414,8.330865,25.775976,-2.197877,4.757279,7,5.151939,7.220913,7.648074,8.330865
7,3.08038,18.780119,-4.941632,5.075708,6,4.491331,1.900996,6.652656,3.08038,7.096414,...,7.096414,8.330865,25.775976,-2.197877,4.757279,7,5.151939,7.220913,7.648074,8.330865
8,3.08038,18.780119,-4.941632,5.075708,6,4.491331,1.900996,6.652656,3.08038,7.096414,...,7.096414,8.330865,25.775976,-2.197877,4.757279,7,5.151939,7.220913,7.648074,8.330865
9,3.08038,18.780119,-4.941632,5.075708,6,4.491331,1.900996,6.652656,3.08038,7.096414,...,7.096414,8.330865,25.775976,-2.197877,4.757279,7,5.151939,7.220913,7.648074,8.330865


In [124]:
X_test_one = pd.concat([df_one, df_one_stat], axis=1)
X_test_one.fillna(0, inplace=True)
X_test_one[:1]

Unnamed: 0,accelerometer_X,accelerometer_Y,accelerometer_Z,accelerometer_X_mean,accelerometer_X_max,accelerometer_X_min,accelerometer_X_interquartile_range,accelerometer_X_index_of_minimum_value,accelerometer_X_mean_of_absolute_deviation,accelerometer_X_median,...,accelerometer_Y_root_mean_square_error,accelerometer_Z_mean,accelerometer_Z_max,accelerometer_Z_min,accelerometer_Z_interquartile_range,accelerometer_Z_index_of_minimum_value,accelerometer_Z_mean_of_absolute_deviation,accelerometer_Z_median,accelerometer_Z_standard_deviation,accelerometer_Z_root_mean_square_error
0,-4.151545,1.781286,2.173935,3.08038,18.780119,-4.941632,5.075708,6,4.491331,1.900996,...,7.096414,8.330865,25.775976,-2.197877,4.757279,7,5.151939,7.220913,7.648074,8.330865




In [121]:
key = 'f0s1'
X = X_set[key]
y = y_set[key]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED, stratify=y)
# clf = SVC(kernel="linear")
clf = RandomForestClassifier()
%time clf.fit(X_train, y_train)

CPU times: user 2.62 s, sys: 17.1 ms, total: 2.63 s
Wall time: 4.51 s


In [122]:
y_test_predict_one = clf.predict(X_test_one[:1])
y_test_predict_one

array([0])

In [123]:
print(classification_report(df_one_class[:y_test_predict_one.shape[0]], y_test_predict_one, digits=4))

              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000         1

    accuracy                         1.0000         1
   macro avg     1.0000    1.0000    1.0000         1
weighted avg     1.0000    1.0000    1.0000         1



тести показали мінімум 10 записів акселерометра треба для формування (статистики),
 щоб прогноз сходився