In [1]:
#ライブラリをインポート
import os #OSに依存する様々な機能を利用するためのモジュール(ファイルやディレクトリ操作など)
import re #正規表現を利用するためのモジュール
import csv  #csvファイルを扱うためのモジュール
import math #数学的計算のためのモジュール
import matplotlib.pyplot as plt #グラフ描画のためのモジュール
import numpy as np  #多次元配列計算のためのモジュール
import pandas as pd #データフレームを扱うためのモジュール
from sklearn.model_selection import train_test_split  #データをトレーニング用とテスト用に分けるためのモジュール
from sklearn.linear_model import LinearRegression #線型回帰
from sklearn.svm import SVC #サポートベクターマシン
from sklearn.ensemble import RandomForestClassifier #ランダムフォレスト
from sklearn.metrics import accuracy_score  #機械学習モデルの性能評価のためのモジュール

In [41]:
# Module-level constants shared by the functions below.
BINS = 12000  # number of histogram bins
EPSILON = .00001  # smoothing term added to every histogram bin count
UPPER_LIMIT = 1.1 # upper bound of the stationary band, as a multiple of the estimated gravity magnitude
LOWER_LIMIT = 0.9 # lower bound of the stationary band, as a multiple of the estimated gravity magnitude
STATIONARY_INTERVALS = 5  # number of consecutive in-band samples after which that run is removed
TRAIN_SIZE = 0.8  # fraction of the samples passed to train_test_split as train_size

In [3]:
# Collect [frequency, filename] pairs for the dataset files in a directory.
def get_Hz_and_filename(path: str) -> list[int, str]:
    """Return ``[frequency, filename]`` pairs for the entries of *path*.

    The frequency is the first run of decimal digits in the entry name,
    converted to ``int`` (``walk100Hz-0803-1229.csv`` gives ``100``).
    Entries whose name contains no digit are left out, so they cannot
    trigger an error on the missing match. Pairs appear in the order
    ``os.listdir`` reports the entries.
    """
    first_number = re.compile(r'\d+')
    candidates = ((first_number.search(entry), entry) for entry in os.listdir(path))
    return [[int(found.group(0)), entry] for found, entry in candidates if found]

In [4]:
# Split [frequency, filename] pairs into two parallel lists.
def divide_Hz_and_filename(Hz_and_filename: list[int, str]) -> tuple[list[int], list[str]]:
    """Return ``(frequencies, filenames)`` taken from the given pairs.

    Element 0 of every pair goes into the first list and element 1 into
    the second; both lists keep the order of the input.
    """
    pairs = list(Hz_and_filename)
    frequencies = [pair[0] for pair in pairs]
    names = [pair[1] for pair in pairs]
    return frequencies, names

In [5]:
# Read the three acceleration axes from an acceleration-log CSV file.
def get_acceleration(filename: str) -> tuple[list[float], list[float], list[float]]:
    """Return the X, Y and Z acceleration columns of a CSV file.

    The fields at index 2, 3 and 4 of every row are parsed as floats and
    collected into three lists, in file order. Blank lines are skipped.

    Raises:
        ValueError: if one of the three fields is not a valid float
            (for example when the file starts with a header row).
        IndexError: if a non-blank row has fewer than five fields.
    """
    AccX, AccY, AccZ = [], [], []
    # newline="" leaves line-ending handling to the csv module, as its
    # documentation requires for files passed to csv.reader.
    with open(filename, newline="") as f:
        for row in csv.reader(f):
            if not row:  # csv.reader yields [] for a blank line
                continue
            AccX.append(float(row[2]))
            AccY.append(float(row[3]))
            AccZ.append(float(row[4]))

    return AccX, AccY, AccZ

In [6]:
# Remove stationary stretches from a 3-axis acceleration recording.
def remove_stationary_intervals(AccX: list[float], AccY: list[float], AccZ: list[float]) -> list[float]:
    """Return the resultant acceleration with stationary runs removed.

    The gravity magnitude is estimated as the norm of the per-axis mean
    vector. A sample is "in band" when its resultant acceleration lies
    strictly between ``LOWER_LIMIT`` and ``UPPER_LIMIT`` times that
    estimate. Every time ``STATIONARY_INTERVALS`` consecutive in-band
    samples have been seen, exactly those samples are dropped and the
    counting restarts, so a longer run is removed in chunks of
    ``STATIONARY_INTERVALS`` and a shorter remainder is kept.

    Args:
        AccX, AccY, AccZ: per-axis acceleration samples of one recording.

    Returns:
        The per-sample resultant acceleration of the retained samples, in
        their original order. An empty list if any axis is empty.
    """
    # Nothing to average: avoid the division by zero below.
    if min(len(AccX), len(AccY), len(AccZ)) == 0:
        return []

    # Per-axis means; their norm is the gravity estimate.
    mean_x = sum(AccX) / len(AccX)
    mean_y = sum(AccY) / len(AccY)
    mean_z = sum(AccZ) / len(AccZ)
    gravity = math.sqrt(mean_x ** 2 + mean_y ** 2 + mean_z ** 2)

    lower = gravity * LOWER_LIMIT
    upper = gravity * UPPER_LIMIT

    kept = []   # resultant accelerations retained so far
    run = 0     # length of the current run of consecutive in-band samples
    for x, y, z in zip(AccX, AccY, AccZ):
        magnitude = math.sqrt(x ** 2 + y ** 2 + z ** 2)
        kept.append(magnitude)
        if lower < magnitude < upper:
            run += 1
            if run == STATIONARY_INTERVALS:
                # The last STATIONARY_INTERVALS entries of `kept` are exactly
                # the current run; trimming the tail is cheap, unlike deleting
                # a slice from the middle of a long list.
                del kept[-STATIONARY_INTERVALS:]
                run = 0
        else:
            run = 0

    return kept

In [7]:
# Absolute difference between each pair of consecutive samples.
def differences_of_acceleration(ResultantAcc: list[float]) -> list[float]:
    """Return ``|next - current|`` for every pair of neighbouring samples.

    Both samples are multiplied by 100000 before subtracting. The result
    has one element fewer than the input and is empty for inputs with
    fewer than two samples.
    """
    return [
        math.fabs(following * 100000 - current * 100000)
        for current, following in zip(ResultantAcc, ResultantAcc[1:])
    ]

In [8]:
# Kullback-Leibler divergence between the histograms of two sample sets.
def KL_divergence(a: list[float], b: list[float]) -> float:
    """Return KL(a || b) of the smoothed histograms of *a* and *b*.

    Both sample sets are binned into ``BINS`` equal-width bins over their
    common value range, ``EPSILON`` is added to every bin so that no bin
    is empty, and each histogram is normalised to sum to 1.

    Args:
        a, b: non-empty numeric samples. Callers in this file pass
            absolute differences, i.e. non-negative values.

    Raises:
        ValueError: if *a* or *b* is empty (raised by ``min``).
    """
    # Common range so that bin k means the same interval for both inputs.
    min_value = min(min(a), min(b))
    max_value = max(max(a), max(b))

    a_hist, _ = np.histogram(a, bins=BINS, range=(min_value, max_value))
    b_hist, _ = np.histogram(b, bins=BINS, range=(min_value, max_value))

    # Smooth first, then divide by the smoothed total: dividing by the raw
    # count total would leave the result summing to slightly more than 1,
    # by an amount that depends on the sample count.
    a_prob = a_hist + EPSILON
    a_prob = a_prob / a_prob.sum()
    b_prob = b_hist + EPSILON
    b_prob = b_prob / b_prob.sum()

    return np.sum(a_prob * np.log(a_prob / b_prob))

In [9]:
# Jensen-Shannon divergence between the histograms of two sample sets.
def JS_divergence(a: list[float], b: list[float]) -> float:
    """Return the JS divergence of the smoothed histograms of *a* and *b*.

    Both sample sets are binned into ``BINS`` equal-width bins over their
    common value range, ``EPSILON`` is added to every bin, and each
    histogram is normalised to sum to 1. The result is the average of the
    KL divergences of the two distributions from their mean distribution,
    so it is symmetric in *a* and *b*.

    Args:
        a, b: non-empty numeric samples. Callers in this file pass
            absolute differences, i.e. non-negative values.

    Raises:
        ValueError: if *a* or *b* is empty (raised by ``min``).
    """
    # Common range so that bin k means the same interval for both inputs.
    min_value = min(min(a), min(b))
    max_value = max(max(a), max(b))

    a_hist, _ = np.histogram(a, bins=BINS, range=(min_value, max_value))
    b_hist, _ = np.histogram(b, bins=BINS, range=(min_value, max_value))

    # Smooth first, then divide by the smoothed total so that each
    # distribution sums to exactly 1.
    a_prob = a_hist + EPSILON
    a_prob = a_prob / a_prob.sum()
    b_prob = b_hist + EPSILON
    b_prob = b_prob / b_prob.sum()

    # Mixture distribution both sides are compared against.
    mean_prob = (a_prob + b_prob) / 2.0

    kl_a = np.sum(a_prob * np.log(a_prob / mean_prob))
    kl_b = np.sum(b_prob * np.log(b_prob / mean_prob))

    return (kl_a + kl_b) / 2.0

In [10]:
# For every row, find the column that holds the second-smallest value.
def get_index_and_columns_of_second_smallest(df: pd.DataFrame) -> list[str, str]:
    """Return ``(row label, column label)`` pairs, one per row of *df*.

    Each row is sorted in ascending order and the label of the entry in
    second position is reported. The caller's intent is that the smallest
    entry is the 0.0 obtained by comparing a distribution with itself, so
    the second one is the nearest *other* distribution.
    """
    positions = range(len(df))
    return [
        (df.index[pos], df.iloc[pos].sort_values().index[1])
        for pos in positions
    ]

In [11]:
# Compute the estimation accuracy from (row label, column label) pairs.
def calculate_accuracy(index_and_columns_of_second_smallest: list[str, str]) -> tuple[float, list[int]]:
    """Return ``(accuracy in percent, positions of the mismatches)``.

    A pair counts as correct when its two labels are equal. Every
    mismatch is reported on standard output and its position in the input
    is collected.

    Raises:
        ZeroDivisionError: if the input is empty.
    """
    pairs = index_and_columns_of_second_smallest
    error_index_list = []
    for position, pair in enumerate(pairs):
        if pair[0] == pair[1]:
            continue  # labels agree: correct estimate
        error_index_list.append(position)
        print(f"間違ってるやつは{position}番目の{pair[0]}と{pair[1]}です")

    hits = len(pairs) - len(error_index_list)
    return (hits / len(pairs)) * 100, error_index_list  # accuracy as a percentage

In [12]:
# Build normalised histograms over one value range shared by all sequences.
def create_histogram(DifferenceAcc_list: list[list[float]]) -> np.ndarray:
    """Return one smoothed, normalised histogram per input sequence.

    All sequences are binned into ``BINS`` equal-width bins over the same
    range, from the smallest to the largest value found in any sequence,
    so that bin k covers the same interval in every row.

    Args:
        DifferenceAcc_list: non-empty list of non-empty numeric sequences.

    Returns:
        Array of shape ``(len(DifferenceAcc_list), BINS)``; every row has
        ``EPSILON`` added to each bin and sums to 1.
    """
    # Global bounds: the lower one must come from the per-sequence minima,
    # otherwise samples below it fall outside the range and are dropped.
    min_value = min(map(min, DifferenceAcc_list))
    max_value = max(map(max, DifferenceAcc_list))

    DifferenceAcc_hist = np.zeros((len(DifferenceAcc_list), BINS), dtype=float)

    for row, samples in enumerate(DifferenceAcc_list):
        counts, _ = np.histogram(samples, bins=BINS, range=(min_value, max_value))
        # Smooth first, then divide by the smoothed total so the row sums to 1.
        smoothed = counts + EPSILON
        DifferenceAcc_hist[row] = smoothed / smoothed.sum()
    return DifferenceAcc_hist

In [13]:
# Build normalised histograms, each over its own sequence's value range.
def create_histogram2(DifferenceAcc_list: list[list[float]]) -> np.ndarray:
    """Return one smoothed, normalised histogram per input sequence.

    Each sequence is binned into ``BINS`` equal-width bins spanning that
    sequence's own minimum and maximum, so bin k does not cover the same
    interval in different rows.

    Args:
        DifferenceAcc_list: list of non-empty numeric sequences.

    Returns:
        Array of shape ``(len(DifferenceAcc_list), BINS)``; every row has
        ``EPSILON`` added to each bin and sums to 1.
    """
    DifferenceAcc_hist = np.zeros((len(DifferenceAcc_list), BINS), dtype=float)

    for row, samples in enumerate(DifferenceAcc_list):
        min_value = min(samples)
        max_value = max(samples)
        counts, _ = np.histogram(samples, bins=BINS, range=(min_value, max_value))
        # Smooth first, then divide by the smoothed total so the row sums to 1.
        smoothed = counts + EPSILON
        DifferenceAcc_hist[row] = smoothed / smoothed.sum()
    return DifferenceAcc_hist

In [14]:
# End-to-end pipeline: load every recording, compute the KL and JS
# divergence matrices, and report the nearest-neighbour accuracy of each.
def KL_and_JS(path: str):
    """Evaluate frequency estimation by KL and JS divergence for a dataset.

    Every file found by ``get_Hz_and_filename`` in *path* is loaded,
    stationary intervals are removed and consecutive differences are
    taken. For each divergence the full pairwise matrix is displayed, and
    a recording counts as correctly estimated when the recording with the
    second-smallest divergence in its row has the same frequency label.
    Misclassified file names and the accuracy are printed.

    Args:
        path: directory containing the CSV recordings; a trailing
            separator is accepted but not required.
    """
    Hz_and_filename = get_Hz_and_filename(path)
    Hz_and_filename.sort(reverse=True)  # highest frequency first
    Hz, filename = divide_Hz_and_filename(Hz_and_filename)
    labels = [str(hz) + "Hz" for hz in Hz]  # row/column labels of the result tables

    # One difference sequence per recording, in the same order as `filename`.
    DifferenceAcc_list = []
    for name in filename:
        AccX, AccY, AccZ = get_acceleration(os.path.join(path, name))
        ResultantAcc = remove_stationary_intervals(AccX, AccY, AccZ)
        DifferenceAcc_list.append(differences_of_acceleration(ResultantAcc))

    for title, divergence in (("KL", KL_divergence), ("JS", JS_divergence)):
        # matrix[i][j] = divergence(recording i, recording j)
        matrix = [[divergence(p, q) for q in DifferenceAcc_list] for p in DifferenceAcc_list]
        df = pd.DataFrame(matrix, index=labels, columns=labels)
        display(df)  # notebook display hook provided by IPython
        accuracy, error_index_list = calculate_accuracy(get_index_and_columns_of_second_smallest(df))
        for position in error_index_list:
            print(filename[position])
        print(f"{title}ダイバージェンスによる推定精度は{accuracy}%です")

In [15]:
# End-to-end pipeline: build histogram features, train a random forest,
# and print its accuracy on the held-out split.
def random_forest(path: str):
    """Train and evaluate a random-forest frequency classifier.

    Every file found by ``get_Hz_and_filename`` in *path* is loaded,
    stationary intervals are removed, consecutive differences are taken
    and turned into per-recording histograms by ``create_histogram2``.
    The integer frequencies parsed from the file names are the class
    labels. The accuracy on the test split is printed.

    Args:
        path: directory containing the CSV recordings; a trailing
            separator is accepted but not required.
    """
    Hz_and_filename = get_Hz_and_filename(path)
    Hz_and_filename.sort(reverse=True)  # highest frequency first
    Hz, filename = divide_Hz_and_filename(Hz_and_filename)

    # One difference sequence per recording, in the same order as `Hz`.
    DifferenceAcc_list = []
    for name in filename:
        AccX, AccY, AccZ = get_acceleration(os.path.join(path, name))
        ResultantAcc = remove_stationary_intervals(AccX, AccY, AccZ)
        DifferenceAcc_list.append(differences_of_acceleration(ResultantAcc))

    DifferenceAcc_hist = create_histogram2(DifferenceAcc_list)
    # NOTE: the split is shuffled without a fixed seed, so the printed
    # accuracy varies between runs even though the classifier is seeded.
    x_train, x_test, y_train, y_test = train_test_split(DifferenceAcc_hist, Hz, train_size = TRAIN_SIZE, shuffle = True)

    # Fit on the training split and score on the held-out split.
    clf = RandomForestClassifier(random_state=1234)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("正解率 = ", accuracy_score(y_test, y_pred))

In [37]:
# Directory holding the dataset CSV files (relative to the working directory), written with a trailing slash.
path = "all_walk_data/"

In [42]:
# Run the KL/JS divergence evaluation on the dataset directory.
KL_and_JS(path)

Unnamed: 0,100Hz,100Hz.1,100Hz.2,100Hz.3,100Hz.4,100Hz.5,100Hz.6,100Hz.7,100Hz.8,100Hz.9,...,10Hz,10Hz.1,10Hz.2,10Hz.3,10Hz.4,10Hz.5,10Hz.6,10Hz.7,10Hz.8,10Hz.9
100Hz,0.0,0.137662,0.468516,0.315697,0.086111,0.159402,0.420559,0.422,0.531334,0.460314,...,0.190067,1.210957,1.061776,1.093067,1.570841,0.476202,0.26882,1.215696,1.344919,1.014631
100Hz,0.158207,0.0,0.232543,0.36862,0.186139,0.091416,0.220966,0.373944,0.504794,0.393501,...,0.193286,1.669043,1.507716,1.219781,2.443894,0.540942,0.240908,1.550616,2.648911,1.28505
100Hz,0.492905,0.212292,0.0,0.420418,0.538265,0.137868,0.03994,0.405933,0.442767,0.392434,...,0.312062,2.034803,1.809884,1.422314,2.435239,0.791448,0.322424,2.185854,2.200512,1.829045
100Hz,0.110984,0.116828,0.343589,0.0,0.239166,0.11767,0.290957,0.094054,0.112283,0.091845,...,0.30286,1.883911,1.550419,1.439438,2.217149,0.684924,0.400048,1.686319,2.304295,1.423433
100Hz,0.394015,0.503646,0.762211,1.198757,0.0,0.457121,0.816973,1.437637,1.550402,1.353621,...,0.227683,0.893211,0.828163,0.776823,1.131088,0.375444,0.264813,0.908685,1.042956,0.713265
100Hz,0.218033,0.112843,0.18288,0.299977,0.238942,0.0,0.169651,0.351803,0.405531,0.341332,...,0.186277,1.549629,1.379172,1.185302,1.912332,0.562672,0.232563,1.646929,1.716934,1.355916
100Hz,0.430995,0.17148,0.024447,0.346671,0.491242,0.1039,0.0,0.336154,0.367942,0.32876,...,0.282603,1.990594,1.745195,1.381061,2.276021,0.746683,0.294021,2.120477,2.095147,1.768451
100Hz,0.178104,0.112228,0.297017,0.070523,0.327897,0.132351,0.25389,0.0,0.165847,0.030396,...,0.338623,3.635286,1.978778,1.649521,2.724684,0.771504,0.468745,1.885989,6.63827,1.684233
100Hz,0.19414,0.140536,0.326463,0.078353,0.341653,0.151962,0.281367,0.069338,0.0,0.040515,...,0.366729,3.62459,2.002687,1.670158,2.722083,0.795421,0.487593,1.911919,6.256186,1.708724
100Hz,0.198822,0.137285,0.314118,0.067136,0.347177,0.145255,0.270025,0.034328,0.064386,0.0,...,0.363698,2.629131,2.031186,1.678935,2.745496,0.798707,0.487938,1.921638,3.97981,1.64803


間違ってるやつは0番目の100Hzと50Hzです
間違ってるやつは4番目の100Hzと50Hzです
間違ってるやつは5番目の100Hzと50Hzです
間違ってるやつは13番目の50Hzと100Hzです
間違ってるやつは17番目の50Hzと100Hzです
間違ってるやつは19番目の50Hzと100Hzです
間違ってるやつは20番目の50Hzと100Hzです
間違ってるやつは22番目の50Hzと100Hzです
間違ってるやつは23番目の50Hzと100Hzです
間違ってるやつは25番目の10Hzと50Hzです
間違ってるやつは26番目の10Hzと100Hzです
間違ってるやつは32番目の10Hzと100Hzです
walk100Hz-20230303-111623870.csv
walk100Hz-20230228-195310844.csv
walk100Hz-20230227-202328538.csv
walk50Hz-20230302-165451196.csv
walk50Hz-20230227-202335920.csv
walk50Hz-0930-0756.csv
walk50Hz-0809-1000.csv
walk50Hz-0803-1229.csv
walk50Hz-0803-1126.csv
walk10Hz-20230302-165445244.csv
walk10Hz-20230302-115939846.csv
walk10Hz-20230227-194422371.csv
KLダイバージェンスによる推定精度は66.66666666666666%です


Unnamed: 0,100Hz,100Hz.1,100Hz.2,100Hz.3,100Hz.4,100Hz.5,100Hz.6,100Hz.7,100Hz.8,100Hz.9,...,10Hz,10Hz.1,10Hz.2,10Hz.3,10Hz.4,10Hz.5,10Hz.6,10Hz.7,10Hz.8,10Hz.9
100Hz,0.0,0.025264,0.10836,0.030057,0.024517,0.035366,0.095572,0.050056,0.055866,0.057732,...,0.049083,0.292028,0.271695,0.278582,0.33853,0.127919,0.07058,0.289119,0.283771,0.257857
100Hz,0.025264,0.0,0.045633,0.02899,0.045232,0.010936,0.038569,0.030833,0.038514,0.037663,...,0.043743,0.332679,0.311845,0.29648,0.378324,0.141137,0.058353,0.331826,0.338241,0.296125
100Hz,0.10836,0.045633,0.0,0.081633,0.128765,0.030247,0.003253,0.073825,0.079678,0.076062,...,0.081804,0.403142,0.379397,0.343867,0.441478,0.204049,0.087514,0.416437,0.408568,0.378623
100Hz,0.030057,0.02899,0.081633,0.0,0.071953,0.029705,0.069588,0.014923,0.015482,0.013866,...,0.085979,0.38685,0.361336,0.355659,0.42826,0.192559,0.11117,0.395649,0.389214,0.354903
100Hz,0.024517,0.045232,0.128765,0.071953,0.0,0.055224,0.119165,0.094636,0.100408,0.102861,...,0.028622,0.201971,0.191462,0.19097,0.250379,0.074833,0.040976,0.193805,0.198499,0.165253
100Hz,0.035366,0.010936,0.030247,0.029705,0.055224,0.0,0.023711,0.035232,0.040493,0.039021,...,0.042919,0.336325,0.314045,0.29538,0.378655,0.148259,0.056262,0.345371,0.338181,0.307473
100Hz,0.095572,0.038569,0.003253,0.069588,0.119165,0.023711,0.0,0.064117,0.069544,0.066519,...,0.076792,0.399819,0.375688,0.341787,0.437948,0.198583,0.082938,0.413112,0.403289,0.374972
100Hz,0.050056,0.030833,0.073825,0.014923,0.094636,0.035232,0.064117,0.0,0.012147,0.005376,...,0.0957,0.430643,0.388975,0.375843,0.44993,0.211585,0.126817,0.420252,0.470002,0.384866
100Hz,0.055866,0.038514,0.079678,0.015482,0.100408,0.040493,0.069544,0.012147,0.0,0.006135,...,0.10416,0.440321,0.397647,0.384116,0.457124,0.219154,0.132966,0.430518,0.470347,0.396067
100Hz,0.057732,0.037663,0.076062,0.013866,0.102861,0.039021,0.066519,0.005376,0.006135,0.0,...,0.104093,0.42676,0.399574,0.385814,0.460643,0.222112,0.134155,0.433695,0.438411,0.393572


間違ってるやつは0番目の100Hzと50Hzです
間違ってるやつは4番目の100Hzと50Hzです
間違ってるやつは5番目の100Hzと50Hzです
間違ってるやつは13番目の50Hzと100Hzです
間違ってるやつは17番目の50Hzと100Hzです
間違ってるやつは22番目の50Hzと100Hzです
間違ってるやつは23番目の50Hzと100Hzです
間違ってるやつは25番目の10Hzと50Hzです
間違ってるやつは26番目の10Hzと50Hzです
間違ってるやつは31番目の10Hzと50Hzです
walk100Hz-20230303-111623870.csv
walk100Hz-20230228-195310844.csv
walk100Hz-20230227-202328538.csv
walk50Hz-20230302-165451196.csv
walk50Hz-20230227-202335920.csv
walk50Hz-0803-1229.csv
walk50Hz-0803-1126.csv
walk10Hz-20230302-165445244.csv
walk10Hz-20230302-115939846.csv
walk10Hz-20230227-202332703.csv
JSダイバージェンスによる推定精度は72.22222222222221%です


In [33]:
# Run the random-forest evaluation on the dataset directory.
random_forest(path)

正解率 =  0.5


In [32]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours variant of the random_forest() pipeline, run at
# notebook top level: same features and split, different classifier.
Hz_and_filename = get_Hz_and_filename(path)
Hz_and_filename.sort(reverse=True)  # highest frequency first
Hz, filename = divide_Hz_and_filename(Hz_and_filename)

# One difference sequence per recording, in the same order as `Hz`.
DifferenceAcc_list = []
for csv_name in filename:
    # os.path.join works whether or not `path` ends with a separator.
    AccX, AccY, AccZ = get_acceleration(os.path.join(path, csv_name))
    ResultantAcc = remove_stationary_intervals(AccX, AccY, AccZ)
    DifferenceAcc_list.append(differences_of_acceleration(ResultantAcc))

DifferenceAcc_hist = create_histogram2(DifferenceAcc_list)
# NOTE: the split is shuffled without a fixed seed, so the printed
# accuracy varies between runs.
x_train, x_test, y_train, y_test = train_test_split(DifferenceAcc_hist, Hz, train_size = TRAIN_SIZE, shuffle = True)
# Fit on the training split and score on the held-out split.
clf = KNeighborsClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print("正解率 = ", accuracy_score(y_test, y_pred))

正解率 =  0.375
