In [1]:
import pandas as pd
import numpy as np

# 学習データ読み込み
df1 = pd.read_csv("./csv/nod2-1_processed.csv", engine='python')
df2 = pd.read_csv("./csv/nod2-2_processed.csv", engine='python')
df3 = pd.read_csv("./csv/nod2-3_processed.csv", engine='python')
df4 = pd.read_csv("./csv/other2-1_processed.csv", engine='python')
df5 = pd.read_csv("./csv/other2-2_processed.csv", engine='python')
df6 = pd.read_csv("./csv/other2-3_processed.csv", engine='python')
# 統合したcsv
csv_path = "./csv/entire.csv"
df = pd.read_csv(csv_path, engine='python')

# テストデータ読み込み（複数人のスライディングウィンドウcsv）
input_1 = pd.read_csv("csv/multi_people_0_processed.csv")
input_2 = pd.read_csv("csv/multi_people_1_processed.csv")
input_3 = pd.read_csv("csv/multi_people_2_processed.csv")
input_4 = pd.read_csv("csv/multi_people_3_processed.csv")

# アノテーションデータ読み込み
anno_1 = pd.read_csv("pos/anno_1.csv")
anno_2 = pd.read_csv("pos/anno_2.csv")
anno_3 = pd.read_csv("pos/anno_3.csv")
anno_4 = pd.read_csv("pos/anno_4.csv")

In [2]:
###  学習

In [3]:
# 学習データ
df = df.replace([np.inf, -np.inf], np.nan)
print(f"欠損値:\n{df.isnull().sum()}")
data = df.dropna(axis=1, how="any")

X = data.iloc[:, :-5]
y = data["label"].astype(int)

欠損値:
mean_x           0
std_x            0
mad_x            0
max_x            0
min_x            0
energy_x         0
entropy_x        0
iqr_x            0
range_x          0
skewness_x       0
kurtosis_x       0
mean_y           0
std_y            0
mad_y            0
max_y            0
min_y            0
energy_y         0
entropy_y        0
iqr_y            0
range_y          0
skewness_y       0
kurtosis_y       0
mean_z           0
std_z            0
mad_z            0
max_z            0
min_z            0
energy_z         0
entropy_z        0
iqr_z            0
range_z          0
skewness_z       0
kurtosis_z       0
frame_in         0
frame_out        0
timestamp_in     0
timestamp_out    0
label            0
dtype: int64


In [4]:
from sklearn.ensemble import RandomForestRegressor
from boruta import BorutaPy

# RandomForestRegressorでBorutaを実行
rf = RandomForestRegressor(n_jobs=-1, max_depth=5)
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=0, random_state=1)
feat_selector.fit(X.values, y.values)

# 選択された特徴量を確認
selected = feat_selector.support_
print('選択された特徴量の数: %d' % np.sum(selected))
print(selected)
print(X.columns[selected])

X_selected = X.iloc[:,selected]

選択された特徴量の数: 6
[False False  True  True False False False  True  True False False False
 False False False  True False False False False False False False False
 False False False False False False False False  True]
Index(['mad_x', 'max_x', 'iqr_x', 'range_x', 'min_y', 'kurtosis_z'], dtype='object')


In [5]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

random_state = 0

# X_train, X_test = X_selected[142:], X_selected[0:142]
# y_train, y_test = y[142:], y[0:142]

print(df1.shape, df4.shape)

df_train, df_test = pd.concat([df2, df3, df5, df6]), pd.concat([df1, df4])
X_train, X_test = df_train.iloc[:, :-5], df_test.iloc[:, :-5]
X_train, X_test = X_train.iloc[:,selected], X_test.iloc[:,selected]
y_train, y_test = df_train["label"].astype(int), df_test["label"].astype(int)

# データの標準化処理
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# kernel SVMのインスタンスを生成
model = SVC(kernel='rbf', random_state=random_state)

# モデルの学習
model.fit(X_train_std, y_train)

# トレーニングデータに対する精度
pred_train = model.predict(X_train_std)
accuracy_train = accuracy_score(y_train, pred_train)
precision_train = precision_score(y_train, pred_train)
recall_train = recall_score(y_train, pred_train)
print('トレーニングデータに対する正解率： %.2f' % accuracy_train)
print('トレーニングデータに対する適合率： %.2f' % precision_train)
print('トレーニングデータに対する再現率： %.2f' % recall_train)
cm = confusion_matrix(y_train, pred_train)
print(cm)

# テストデータに対する精度
pred_test = model.predict(X_test_std)
accuracy_test = accuracy_score(y_test, pred_test)
precision_test = precision_score(y_test, pred_test)
recall_test = recall_score(y_test, pred_test)
print('テストデータに対する正解率： %.2f' % accuracy_test)
print('テストデータに対する適合率： %.2f' % precision_test)
print('テストデータに対する再現率： %.2f' % recall_test)

cm = confusion_matrix(y_test, pred_test)
print(cm)

(31, 38) (81, 38)
トレーニングデータに対する正解率： 0.97
トレーニングデータに対する適合率： 0.95
トレーニングデータに対する再現率： 0.98
[[136   6]
 [  2 113]]
テストデータに対する正解率： 0.97
テストデータに対する適合率： 0.91
テストデータに対する再現率： 1.00
[[78  3]
 [ 0 31]]


In [6]:
### 検証

In [12]:
def test(input_df, anno_df, selected, model):

    X_test = input_df.iloc[:, :-5]
    print(X_test.shape)
    print(len(selected))
    X_test = X_test.iloc[:,selected]
    y_test = anno_df["nod_0"].astype(int)

    pred_test = model.predict(X_test)
    
    return pred_test, y_test
    
pred_test1, y_test1 = test(input_1, anno_1, selected, model)
pred_test2, y_test2 = test(input_2, anno_2, selected, model)
pred_test3, y_test3 = test(input_3, anno_3, selected, model)
pred_test4, y_test4 = test(input_4, anno_4, selected, model)

(30, 33)
33
(27, 33)
33
(31, 33)
33


ValueError: invalid literal for int() with base 10: 'nod_0'

In [18]:
def output_csv(
    base_df: pd.core.frame.DataFrame,
    pred_result: np.ndarray,
    # columns: list=['frame_in', 'frame_out', 'label'],
    columns: list=['frame_in', 'frame_out'],
    output_path: str="output_pred.csv"
):
    """
    base_df: predに対応するDataFrame
    pred_result: 判別後の予測結果 (ndarray)
    columns: outputするカラム名のリスト
    """
    base_df = base_df[columns]
    base_df = base_df.assign(pred=pred_result)
    
    frame_in = base_df["frame_in"].iloc[0].astype(int)
    frame_out = base_df["frame_out"].iloc[-1].astype(int)
    cum_pred = np.zeros(frame_out - frame_in)

    # out_df = pd.DataFrame(columns=['frame', 'label', 'pred'])
    out_df = pd.DataFrame(columns=['frame', 'pred'])

    for i, (_, row) in enumerate(base_df.iterrows()):
        fin = row['frame_in'].astype(int)
        fout = row['frame_out'].astype(int)

        for frame in range(fin, fout + 1):
            if (out_df['frame'] == frame).sum() == 0:
                out_df = out_df.append({
                    'frame': frame,
#                     'label': row['label'],
                    'pred': row['pred'],
                }, ignore_index=True)
            else:
                if row['pred'] == 1:
                    out_df.loc[out_df['frame'] == frame, 'pred'] += 1
                    
    if output_path is not None:
        out_df.to_csv(output_path, index=False)

    return out_df

out_df1 = output_csv(input_1, pred_test1)
out_df1

Unnamed: 0,frame,pred
0,1.0,1.0
1,2.0,1.0
2,3.0,1.0
3,4.0,1.0
4,5.0,1.0
...,...,...
517,518.0,1.0
518,519.0,1.0
519,520.0,1.0
520,521.0,1.0


In [14]:
def nod_detection_report(
)

ValueError: Found input variables with inconsistent numbers of samples: [543, 30]