In [18]:
#ライブラリをインポート
import os #OSに依存する様々な機能を利用するためのモジュール(ファイルやディレクトリ操作など)
import re #正規表現を利用するためのモジュール
import csv  #csvファイルを扱うためのモジュール
import math #数学的計算のためのモジュール
import matplotlib.pyplot as plt #グラフ描画のためのモジュール
import numpy as np  #多次元配列計算のためのモジュール
import pandas as pd #データフレームを扱うためのモジュール
from sklearn.model_selection import train_test_split  #データをトレーニング用とテスト用に分けるためのモジュール
from sklearn.linear_model import LinearRegression #線型回帰
from sklearn.svm import SVC #サポートベクターマシン
from sklearn.ensemble import RandomForestClassifier #ランダムフォレスト
from sklearn.metrics import accuracy_score  #機械学習モデルの性能評価のためのモジュール

In [19]:
#定数を定義
BINS = 10000  #ヒストグラムのビンの数
EPSILON = .00001  #スムージングパラメータ
UPPER_LIMIT = 1.1 #静止区間の上限
LOWER_LIMIT = 0.9 #静止区間の加減
STATIONARY_INTERVALS = 5  #静止区間除去のサンプルの間隔(静止区間が何サンプル連続したら除去するか)
TRAIN_SIZE = 0.9  #ランダムフォレストのトレーニングデータの割合

In [20]:
#ディレクトリ内のデータセットのファイル名と周波数を取得する関数
def get_filename_and_Hz(path: str) -> list[int, str]:
    filename = os.listdir(path) #引数のパスのディレクトリの中のファイル名一覧を取得
    filename_and_Hz=[]  #ファイル名と周波数を格納するリストを宣言

    for file in filename:
        Hz = re.search(r'\d+', file)    #正規表現を用いてファイル名の中で一番最初に出てくる数字(周波数)を取得
        if Hz:  #数字の入っていないファイル名があるとエラーを吐くので、このif文でチェックする
            filename_and_Hz.append([int(Hz.group(0)), file])    #ファイル名と周波数を格納

    return filename_and_Hz

In [21]:
#ファイル名と周波数を分けて出力する関数
def divide_filename_and_Hz(filename_and_Hz: list[int, str]) -> tuple[list[int], list[str]]:
    Hz = []
    filename = []
    for row in filename_and_Hz:
      Hz.append(row[0])
      filename.append(row[1])

    return Hz, filename

In [22]:
#加速度データのCSVファイルから3軸加速度を取得する関数
def get_acceleration(filename: str) -> tuple[list[float], list[float], list[float]]:
    AccX, AccY, AccZ = [], [], []
    with open(filename) as f:
        reader = csv.reader(f)
        for row in reader:
            AccX.append(float(row[2]))
            AccY.append(float(row[3]))
            AccZ.append(float(row[4]))

    return AccX, AccY, AccZ

In [23]:
#静止区間を除去する関数
def remove_stationary_intervals(AccX: list[float], AccY: list[float], AccZ: list[float]) -> list[float]:
    #各軸の加速度の平均を求める
    AvgAccX = sum(AccX) / len(AccX)
    AvgAccY = sum(AccY) / len(AccY)
    AvgAccZ = sum(AccZ) / len(AccZ)

    AvgResultantAcc = math.sqrt(AvgAccX ** 2 + AvgAccY ** 2 + AvgAccZ ** 2) #重力加速度の推定値=合成加速度の平均を求める

    ResultantAcc = [math.sqrt(x ** 2 + y ** 2 + z ** 2) for x, y, z in zip(AccX, AccY, AccZ)]   #各時刻の合成加速度を求める

    #各時刻の合成加速度から静止区間(重力加速度の推定値に近い値が一定以上以上連続している区間)を除去する
    i=0 #ループ変数
    counter = 0 #静止区間がSTATIONARY_INTERVALS分続いているかをカウントする変数
    while i < len(ResultantAcc):
        if AvgResultantAcc * LOWER_LIMIT < ResultantAcc[i] < AvgResultantAcc * UPPER_LIMIT:   #平均のLOWER_LIMIT倍~UPPER_LIMIT倍の範囲を調べる
            counter += 1    #範囲内ならカウントを増やす
            if counter == STATIONARY_INTERVALS: #カウントがSTATIONARY_INTERVALSに達したらその区間を削除
                del ResultantAcc[i+1-STATIONARY_INTERVALS:i+1]    #スライスでは選択範囲の開始位置startと終了位置stopを[start:stop]のように書くとstart <= x < stopの範囲が選択される #start番目の値は含まれるがstop番目の値は含まれない
                counter = 0 #カウンターをリセット
                i -= STATIONARY_INTERVALS   #削除した分インデックスがズレるので補正する
        else:
            counter = 0 #カウンターをリセット
        i += 1

    return ResultantAcc  #静止区間を除去した後のリストを返す

In [24]:
#連続する2サンプルの差分を取る関数
def differences_of_acceleration(ResultantAcc: list[float]) -> list[float]:
    index = 0
    DifferenceAcc = []
    for dif in ResultantAcc[:-1]:
        DifferenceAcc.append(math.fabs(ResultantAcc[index + 1]*100000 - ResultantAcc[index]*100000))    #100000倍して誤差を取る
        index += 1

    return DifferenceAcc

In [25]:
#KLダイバージェンス関数 #引数として与える2つの分布は非負の値の集合でなければならないことに注意
def KL_divergence(a: list[float], b: list[float]) -> float:
    min_value = min(min(a), min(b)) #a,bの最小値の小さい方
    max_value = max(max(a), max(b)) #a,bの最大値の大きい方

    #a,bのヒストグラムを作成し、同じ数のビンで区切る
    a_hist, _ = np.histogram(a, bins=BINS, range=(min_value, max_value))
    b_hist, _ = np.histogram(b, bins=BINS, range=(min_value, max_value))

    #正規化する(確率分布に変換する、合計を1にする)ために全合計で割る
    a_hist = (a_hist + EPSILON) / a_hist.sum()
    b_hist = (b_hist + EPSILON) / b_hist.sum()

    #KLダイバージェンスの値を返す
    return np.sum([ai * np.log(ai / bi) for ai, bi in zip(a_hist, b_hist)])

In [26]:
#JSダイバージェンス関数 #引数として与える2つの分布は非負の値の集合でなければならないことに注意
def JS_divergence(a: list[float], b: list[float]) -> float:
    min_value = min(min(a), min(b)) #a,bの最小値の小さい方
    max_value = max(max(a), max(b)) #a,bの最大値の大きい方

    #a,bのヒストグラムを作成し、同じ数のビンで区切る
    a_hist, _ = np.histogram(a, bins=BINS, range=(min_value, max_value))
    b_hist, _ = np.histogram(b, bins=BINS, range=(min_value, max_value))

    #正規化する(確率分布に変換する、合計を1にする)ために全合計で割る
    a_hist = (a_hist + EPSILON) / a_hist.sum()
    b_hist = (b_hist + EPSILON) / b_hist.sum()

    #2つの分布の平均値を求める
    mean_hist = (a_hist + b_hist) / 2.0

    #平均とそれぞれの分布のKLダイバージェンスを算出
    kl_a = np.sum([ai * np.log(ai / bi) for ai, bi in zip(a_hist, mean_hist)])
    kl_b = np.sum([ai * np.log(ai / bi) for ai, bi in zip(b_hist, mean_hist)])

    #JSダイバージェンスの値を返す
    return (kl_a + kl_b) / 2.0

In [27]:
#データフレームの各行の中で2番目に小さい値が格納されている場所を調べる関数(最小値は同じ確率分布同士の0.0)
def get_index_and_columns_of_second_smallest(df: pd.DataFrame) -> list[str, str]:
    index_and_columns_of_second_smallest = []  #データフレームの中で2番目に小さい値が格納されている場所のインデックス名とカラム名を格納する変数
    for i in range(len(df)):
        sorted_row = df.iloc[i].sort_values()   #.ilocでデータフレームの要素を行、列の番号の添字で指定する    #各行の要素を昇順に並び替える
        second_smallest_columns = sorted_row.index[1] #各行の2番目に小さい値が格納されているカラム[1]の名前を取得
        #second_smallest_label = df.columns.get_loc(second_smallest_index)
        index_and_columns_of_second_smallest.append((df.index[i], second_smallest_columns))    #インデックスとカラムのラベル名の組を二次元配列に追加
    return index_and_columns_of_second_smallest

In [28]:
#推定精度を算出する巻数
def calculate_accuracy(index_and_columns_of_second_smallest: list[str, str]) -> float:
    counter = 0
    for i in range(len(index_and_columns_of_second_smallest)):
        #インデックスとカラムのラベル名が同じならばカウンターを1増やす
        if index_and_columns_of_second_smallest[i][0] == index_and_columns_of_second_smallest[i][1]:
            counter += 1
        else:
            print(f"間違ってるやつは{index_and_columns_of_second_smallest[i][0]}と{index_and_columns_of_second_smallest[i][1]}です")

    return (counter / len(index_and_columns_of_second_smallest)) * 100  #精度を100分率で返す

In [29]:
#入力された加速度の差分のリストからヒストグラムを作る関数
def create_histogram(DifferenceAcc_list: list[float]) -> np.histogram:
    min_value = min(map(lambda x:max(x), DifferenceAcc_list))   #入力されたリストの中で最も小さい数
    max_value = max(map(lambda x:max(x), DifferenceAcc_list))   #入力されたリストの中で最も大きい数

    DifferenceAcc_hist = np.zeros((len(DifferenceAcc_list), BINS), dtype=float)
    print(DifferenceAcc_hist[0])

    for i in range(len(DifferenceAcc_list)):
        DifferenceAcc_hist[i], _ = np.histogram(DifferenceAcc_list[i], bins=BINS, range=(min_value, max_value)) #ヒストグラムを作成し、同じ数のビンで区切る
        DifferenceAcc_hist[i] = (DifferenceAcc_hist[i] + EPSILON) / DifferenceAcc_hist[i].sum()     #正規化する(確率分布に変換する、合計を1にする)ために全合計で割る
    return DifferenceAcc_hist

In [30]:
#KLダイバージェンスとJSダイバージェンス算出の一連の流れを自動化した関数
def KL_and_JS(path: str):
    filename_and_Hz = get_filename_and_Hz(path)
    filename_and_Hz.sort(reverse=True)  #周波数の大きい順にソート
    Hz, filename = divide_filename_and_Hz(filename_and_Hz)
    Hz = [str(hz) + "Hz" for hz in Hz]  #周波数の値+"Hz"のリストを作りデータフレームのラベルに用いる

    #使う変数を宣言
    AccX, AccY, AccZ = [], [], []
    ResultantAcc = []
    DifferenceAcc_list = []
    resultKLD = [[0.0 for j in range(len(filename))] for i in range(len(filename))]  # resultKLDの要素を0.0で初期化
    resultJSD = [[0.0 for j in range(len(filename))] for i in range(len(filename))]  # resultKLDの要素を0.0で初期化


    #各データセットからデータを読み込み静止区間を除去したものを二次元配列に格納
    for i in filename:
        AccX, AccY, AccZ = get_acceleration(path+i)
        ResultantAcc = remove_stationary_intervals(AccX, AccY, AccZ)
        DifferenceAcc_list.append(differences_of_acceleration(ResultantAcc))

    #KLダイバージェンスの値を格納
    for i in range(len(filename)):
        for j in range(len(filename)):
            resultKLD[i][j] = KL_divergence(DifferenceAcc_list[i], DifferenceAcc_list[j])

    #JSダイバージェンスの値を格納
    for i in range(len(filename)):
        for j in range(len(filename)):
            resultJSD[i][j] = JS_divergence(DifferenceAcc_list[i], DifferenceAcc_list[j])

    #結果を出力
    df_KLD = pd.DataFrame(resultKLD, index=Hz, columns=Hz)
    display(df_KLD)
    print(f"KLダイバージェンスによる推定精度は{calculate_accuracy(get_index_and_columns_of_second_smallest(df_KLD))}%です")

    df_JSD = pd.DataFrame(resultJSD, index=Hz, columns=Hz)
    display(df_JSD)
    print(f"JSダイバージェンスによる推定精度は{calculate_accuracy(get_index_and_columns_of_second_smallest(df_JSD))}%です")

In [33]:
#ランダムフォレストによる機械学習モデル構築と性能評価までを自動化した関数
def randomforest(path: str):
    filename_and_Hz = get_filename_and_Hz(path)
    filename_and_Hz.sort(reverse=True)  #周波数の大きい順にソート
    Hz, filename = divide_filename_and_Hz(filename_and_Hz)
    Hz = [str(hz) + "Hz" for hz in Hz]  #周波数の値+"Hz"のリストを作りデータフレームのラベルに用いる

    #使う変数を宣言
    AccX, AccY, AccZ = [], [], []
    ResultantAcc = []
    DifferenceAcc_list = []

    #各データセットからデータを読み込み静止区間を除去したものを二次元配列に格納
    for i in filename:
        AccX, AccY, AccZ = get_acceleration(path+i)
        ResultantAcc = remove_stationary_intervals(AccX, AccY, AccZ)
        DifferenceAcc_list.append(differences_of_acceleration(ResultantAcc))

    DifferenceAcc_hist = create_histogram(DifferenceAcc_list)
    x_train, x_test, y_train, y_test = train_test_split(DifferenceAcc_hist, Hz, train_size = TRAIN_SIZE, shuffle = True)

    # 学習する
    clf = RandomForestClassifier(random_state=1234)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("正解率 = ", accuracy_score(y_test, y_pred))

In [39]:
path = "all_walk_data/"

In [41]:
KL_and_JS(path)

Unnamed: 0,100Hz,100Hz.1,100Hz.2,100Hz.3,100Hz.4,100Hz.5,100Hz.6,100Hz.7,50Hz,50Hz.1,...,50Hz.2,50Hz.3,10Hz,10Hz.1,10Hz.2,10Hz.3,10Hz.4,10Hz.5,10Hz.6,10Hz.7
100Hz,0.0,0.41728,0.797052,1.426204,1.50247,1.323248,0.800298,1.052054,0.164775,0.231257,...,0.489474,0.63159,0.778927,0.760971,1.083899,0.330847,0.232908,0.895164,0.947644,0.702087
100Hz,0.234053,0.0,0.165483,0.349133,0.391047,0.335301,0.178283,0.213432,0.479494,0.162186,...,0.282905,0.315165,1.341353,1.17463,1.796806,0.550216,0.225328,1.632737,1.593108,1.346018
100Hz,0.49078,0.101292,0.0,0.335277,0.362438,0.323632,0.239413,0.206996,0.813659,0.306338,...,0.503558,0.538758,1.738741,1.380534,2.247759,0.741282,0.291562,2.118427,2.022862,1.765751
100Hz,0.327321,0.131331,0.253307,0.0,0.142298,0.027826,0.063785,0.056752,0.589961,0.285203,...,0.254333,0.258173,1.787133,1.584989,2.499185,0.760177,0.465417,1.866039,6.22117,1.632231
100Hz,0.339301,0.149467,0.279472,0.060716,0.0,0.035654,0.086962,0.077647,0.603947,0.305933,...,0.283433,0.279341,1.811411,1.602716,2.47667,0.78345,0.483655,1.89032,5.816405,1.653716
100Hz,0.346362,0.144172,0.269129,0.031389,0.052569,0.0,0.080518,0.068941,0.617057,0.308851,...,0.283292,0.284119,1.799605,1.608224,2.516639,0.789007,0.484465,1.904839,3.469898,1.629254
100Hz,0.189747,0.063701,0.208967,0.120977,0.2021,0.146167,0.0,0.083106,0.408779,0.146979,...,0.147443,0.159362,1.531025,1.377196,2.207195,0.580913,0.300578,1.599983,2.443566,1.34868
100Hz,0.250033,0.06609,0.172677,0.105659,0.164988,0.116378,0.066816,0.0,0.500855,0.191941,...,0.223971,0.234436,1.619915,1.410532,2.299417,0.639878,0.328186,1.74589,3.054139,1.471423
50Hz,0.109486,0.690139,1.16165,2.184588,2.367506,1.944211,1.154656,1.551668,0.0,0.282777,...,0.61468,0.803278,0.577637,0.625127,0.805204,0.266464,0.277692,0.561575,0.619292,0.463948
50Hz,0.111146,0.251245,0.508458,0.940559,1.102883,0.938669,0.512539,0.712816,0.224041,0.0,...,0.310513,0.392716,0.978881,0.884118,1.382691,0.367208,0.184393,1.123058,1.244457,0.909697


間違ってるやつは100Hzと50Hzです
間違ってるやつは100Hzと50Hzです
間違ってるやつは100Hzと50Hzです
間違ってるやつは50Hzと100Hzです
間違ってるやつは50Hzと100Hzです
間違ってるやつは50Hzと100Hzです
間違ってるやつは50Hzと100Hzです
間違ってるやつは10Hzと100Hzです
KLダイバージェンスによる推定精度は66.66666666666666%です


Unnamed: 0,100Hz,100Hz.1,100Hz.2,100Hz.3,100Hz.4,100Hz.5,100Hz.6,100Hz.7,50Hz,50Hz.1,...,50Hz.2,50Hz.3,10Hz,10Hz.1,10Hz.2,10Hz.3,10Hz.4,10Hz.5,10Hz.6,10Hz.7
100Hz,0.0,0.054268,0.11895,0.094455,0.099791,0.102645,0.055061,0.070845,0.022329,0.017959,...,0.023692,0.028583,0.187196,0.188477,0.245477,0.070533,0.038376,0.192259,0.193221,0.163567
100Hz,0.054268,0.0,0.023402,0.034959,0.039794,0.038754,0.015319,0.015382,0.111701,0.032255,...,0.047952,0.051757,0.310688,0.293556,0.375917,0.145045,0.054821,0.343926,0.333888,0.305825
100Hz,0.11895,0.023402,0.0,0.063983,0.069266,0.066354,0.050463,0.040223,0.188762,0.077904,...,0.111981,0.118108,0.373967,0.341493,0.436061,0.197184,0.082355,0.412623,0.400164,0.374434
100Hz,0.094455,0.034959,0.063983,0.0,0.010637,0.005026,0.016503,0.01271,0.163779,0.079576,...,0.071436,0.071347,0.38498,0.374462,0.446642,0.210292,0.126113,0.41898,0.459662,0.382939
100Hz,0.099791,0.039794,0.069266,0.010637,0.0,0.005303,0.021798,0.017137,0.170726,0.086782,...,0.08105,0.078469,0.393134,0.38202,0.452996,0.217578,0.131961,0.428494,0.46187,0.393578
100Hz,0.102645,0.038754,0.066354,0.005026,0.005303,0.0,0.02058,0.015461,0.175638,0.088694,...,0.081709,0.080663,0.395531,0.384328,0.457323,0.220854,0.133469,0.432537,0.432942,0.391988
100Hz,0.055061,0.015319,0.050463,0.016503,0.021798,0.02058,0.0,0.008273,0.11233,0.040528,...,0.035289,0.036301,0.338843,0.330734,0.404671,0.165833,0.083928,0.367968,0.367073,0.329927
100Hz,0.070845,0.015382,0.040223,0.01271,0.017137,0.015461,0.008273,0.0,0.135007,0.052614,...,0.053902,0.05472,0.359108,0.345213,0.421411,0.18036,0.091085,0.390891,0.390101,0.351341
50Hz,0.022329,0.111701,0.188762,0.163779,0.170726,0.175638,0.11233,0.135007,0.0,0.041611,...,0.045047,0.05091,0.151322,0.165511,0.203789,0.062205,0.058811,0.12993,0.137167,0.117361
50Hz,0.017959,0.032255,0.077904,0.079576,0.086782,0.088694,0.040528,0.052614,0.041611,0.0,...,0.022535,0.028527,0.230978,0.224406,0.293317,0.090016,0.032591,0.238689,0.241423,0.211167


間違ってるやつは100Hzと50Hzです
間違ってるやつは100Hzと50Hzです
間違ってるやつは100Hzと50Hzです
間違ってるやつは50Hzと100Hzです
間違ってるやつは50Hzと100Hzです
間違ってるやつは50Hzと100Hzです
間違ってるやつは10Hzと50Hzです
間違ってるやつは10Hzと50Hzです
JSダイバージェンスによる推定精度は66.66666666666666%です


In [42]:
randomforest(path)

[0. 0. 0. ... 0. 0. 0.]
正解率 =  0.3333333333333333
