In [1]:
from sklearn.utils import resample
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from imblearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, cross_val_score, RepeatedStratifiedKFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay

import shap

import warnings
warnings.filterwarnings(action='ignore') 

In [2]:
# Reproducibility setup: a single seed shared by the environment flags and
# the two global random number generators used in this notebook.
SEED = 42

# Environment variables must be strings.
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting it
# here presumably only affects processes launched afterwards - confirm.
os.environ.update({
    'PYTHONHASHSEED': f'{SEED}',
    'TF_DETERMINISTIC_OPS': '1',
})

# Seed Python's `random` module and NumPy's legacy global generator.
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# Directory holding the extracted-feature CSV files. Kept as a string with a
# trailing slash because later cells build file names by plain concatenation.
csv_path = './datasets/csv/'

# Baseline feature table; later cells read its 'fileName' and 'label' columns
# and treat every other column as a feature.
audio_df = pd.read_csv(f'{csv_path}feature_extracting.csv')

In [4]:
# Class balance of the baseline table: number of rows per value of 'label'.
audio_df['label'].value_counts()

label
1    2611
0     373
Name: count, dtype: int64

In [5]:
# Handling the imbalanced data set: scale -> split into train/test -> under/over-sample
# the train set only -> fit models -> evaluate on the untouched test set.

# Min-max scale every column except 'fileName' and 'label'.
# NOTE(review): the scaler is fitted on the full table before the split, so the
# test rows contribute to the min/max used for scaling - confirm this is intended.
scaler = MinMaxScaler()
scaler.fit(audio_df.drop(['fileName', 'label'], axis=1))
scaled_X = scaler.transform(audio_df.drop(['fileName', 'label'], axis=1))

# 80/20 shuffled split with a fixed seed. No `stratify` is passed here, so the
# class ratio of the two parts is left to chance; a later cell repeats this split
# with stratify= and rebinds the same variable names.
X_train, X_test, y_train, y_test = train_test_split(scaled_X, audio_df['label'], random_state=42, shuffle=True, test_size=0.2)
print(y_train.value_counts())
print(y_test.value_counts())

label
1    2101
0     286
Name: count, dtype: int64
label
1    510
0     87
Name: count, dtype: int64


In [6]:
def applyScaler(df) :
    """Min-max scale the feature columns of ``df``.

    The 'fileName' and 'label' columns are dropped; every remaining column is
    treated as a feature. A fresh MinMaxScaler is fitted on those features and
    then used to transform them.

    Returns
    -------
    tuple
        ``(scaler, scaled_X)``: the fitted scaler and the transformed features.
    """
    features = df.drop(['fileName', 'label'], axis=1)
    fitted_scaler = MinMaxScaler()
    fitted_scaler.fit(features)
    return fitted_scaler, fitted_scaler.transform(features)

def runModel(modelName, X_train, y_train, X_test, y_test, cmView=False) :
    """Fit one classifier on the training split and score it on the test split.

    Parameters
    ----------
    modelName : str
        'lr' selects LogisticRegression, 'svc' selects SVC(probability=True) and
        'rfc' selects RandomForestClassifier. Any other value (including 'xgb')
        falls back to XGBClassifier. Every model is built with random_state=42.
    X_train, y_train
        Features and labels passed to ``model.fit``.
    X_test, y_test
        Features passed to ``model.predict`` and the labels the predictions are
        scored against. Labels are expected to be 0/1.
    cmView : bool, default False
        When true, print the 2x2 confusion matrix (rows: actual 0/1,
        columns: predicted 0/1).

    Returns
    -------
    tuple
        ``(model, eval_df)``: the fitted estimator and a one-row DataFrame with
        the columns accuracy, recall, precision, f1, rocauc and fpr.

    Notes
    -----
    * recall and precision are called with their default arguments, while f1 is
      computed with average='weighted'.
    * rocauc is computed from the hard 0/1 predictions, not from probability
      scores, so it describes a single operating point.
    * The confusion matrix is pinned to labels [0, 1] so it is always 2x2, even
      when the test labels and predictions contain a single class. fpr is NaN
      when there are no actual negatives; rocauc is NaN when either class is
      missing from ``y_test``.
    """
    # Lazily-evaluated constructors so only the requested model is instantiated.
    builders = {
        'lr' : lambda : LogisticRegression(random_state=42),
        'svc' : lambda : SVC(random_state=42, probability=True),
        'rfc' : lambda : RandomForestClassifier(random_state=42),
    }
    build = builders.get(modelName, lambda : XGBClassifier(random_state=42))
    model = build()

    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    # labels=[0, 1] keeps the matrix 2x2 so the four-way unpack cannot fail.
    cm = confusion_matrix(y_test, pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    negatives = fp + tn
    positives = fn + tp

    # Share of actual negatives (label 0) predicted as positive (label 1); lower is better.
    fpr = fp / negatives if negatives else float('nan')

    # ROC AUC is undefined unless both classes occur in y_test.
    if negatives and positives :
        rocauc = roc_auc_score(y_test, pred, average='weighted')
    else :
        rocauc = float('nan')

    eval_df = pd.DataFrame({
        'accuracy' : [accuracy_score(y_test, pred)],
        'recall' : [recall_score(y_test, pred)],
        'precision' : [precision_score(y_test, pred)],
        'f1' : [f1_score(y_test, pred, average='weighted')],
        'rocauc' : [rocauc],
        'fpr' : [fpr]
    })

    if (cmView) :
        print(cm)
    return model, eval_df

def getEvalDf(modelDesc, X_train, y_train, X_test, y_test, dataList, cmView=False) :
    """Score one model type on the original training data and on each resampled variant.

    Parameters
    ----------
    modelDesc : str
        Model key forwarded to ``runModel``.
    X_train, y_train
        Original (un-resampled) training data; always evaluated first.
    X_test, y_test
        Test data every fitted model is scored on.
    dataList : sequence
        Resampled training sets; element ``[0]`` is used as features and
        element ``[1]`` as labels. May be empty.
    cmView : bool, default False
        Forwarded to ``runModel``.

    Returns
    -------
    tuple
        ``(model, eval_df)``: the model fitted on the ORIGINAL training data
        (models fitted on resampled data are discarded) and a DataFrame with
        one row per fit, in the order original, dataList[0], dataList[1], ...
    """
    model, baseline_eval = runModel(modelDesc, X_train, y_train, X_test, y_test, cmView)
    if not dataList :
        return model, baseline_eval

    # Collect the one-row frames and concatenate once instead of re-copying the
    # accumulated frame on every iteration.
    eval_frames = [baseline_eval]
    for resampled in dataList :
        _, resampled_eval = runModel(modelDesc, resampled[0], resampled[1], X_test, y_test, cmView)
        eval_frames.append(resampled_eval)
    return model, pd.concat(eval_frames)

def resampler(samplerList, X_train, y_train, resmpale_columns) :
    """Apply every sampler to the training data and tabulate the class counts.

    Parameters
    ----------
    samplerList : sequence
        Objects exposing ``fit_resample(X, y)``.
    X_train, y_train
        Training features and labels handed to each sampler unchanged.
    resmpale_columns : sequence of str
        Column names for the count table: first the label column, then the
        original data, then one name per sampler in ``samplerList`` order.
        (The parameter name is misspelled; it is kept as-is for callers.)

    Returns
    -------
    tuple
        ``(dataList, result_cnt_df)``: ``dataList`` holds one ``[X, y]`` pair per
        sampler; ``result_cnt_df`` has one row per label (labels 0 and 1 are
        always present) and one count column per data set.
    """
    def _label_counts(labels) :
        # rename_axis + reset_index(name=...) produces the columns
        # ['label', 'count'] regardless of the Series name and of whether the
        # installed pandas names the value_counts() result 'count' or not.
        return pd.Series(labels).value_counts().rename_axis('label').reset_index(name='count')

    dataList = []
    # Seed the table with both labels so a class that a sampler removes entirely
    # still shows up as a row (with a missing count).
    result_cnt_df = pd.DataFrame({'label': [0, 1]})
    result_cnt_df = pd.merge(result_cnt_df, _label_counts(y_train), on='label', how='outer')

    for sampler in samplerList :
        s_X, s_y = sampler.fit_resample(X_train, y_train)
        dataList.append([s_X, s_y])
        result_cnt_df = pd.merge(result_cnt_df, _label_counts(s_y), on='label', how='outer')
        # Rename after every merge: this replaces the auto-suffixed count columns
        # with the caller's names before the next merge can collide with them.
        result_cnt_df.columns = resmpale_columns[0:len(result_cnt_df.columns)]
    return dataList, result_cnt_df

### Baseline 및 resampler 적용

In [7]:
# Handling the imbalanced data set: scale -> split into train/test -> under/over-sample
# the train set only -> fit models -> evaluate on the untouched test set.
# `scaler` fitted here is reused by later cells to transform the augmented CSV files.
# NOTE(review): applyScaler fits on the full table before the split, so the test
# rows contribute to the min/max used for scaling - confirm this is intended.
scaler, scaled_X = applyScaler(audio_df)

# 80/20 shuffled split with a fixed seed; stratify= keeps the label ratio the
# same in both parts. X_test / y_test from this split are the baseline test set.
X_train, X_test, y_train, y_test = train_test_split(scaled_X, audio_df['label'], random_state=42, shuffle=True, test_size=0.2, stratify=audio_df['label'])
print(y_train.value_counts())
print(y_test.value_counts())

label
1    2089
0     298
Name: count, dtype: int64
label
1    522
0     75
Name: count, dtype: int64


In [17]:
# Resampling benchmark: fit each model type on the original training data and on
# six resampled versions of it, always scoring on the same X_test / y_test.
# NOTE(review): this cell's recorded execution count is out of sequence with its
# neighbours; re-run the notebook top to bottom to confirm the outputs.

# Column names for the class-count table: 'label', the original data, then one
# name per sampler. The order must match samplerList below.
resampler_columns = ['label', 'origin', 'rus', 'tomek', 'ros', 'smote', 'border-smote', 'adasyn']
samplerList = [
    # under-sampling
    RandomUnderSampler(random_state=42),
    TomekLinks(),
    # over-sampling
    RandomOverSampler(random_state=42),
    SMOTE(random_state=42),
    BorderlineSMOTE(random_state=42),
    ADASYN(random_state=42)
]
dataList, resample_cnt = resampler(samplerList, X_train, y_train, resampler_columns)
display('resample_cnt', resample_cnt)

# Each call returns the model fitted on the ORIGINAL training data plus one
# metrics row per training set (original first, then dataList order).
# `svc` is reused by the later augmentation cells as the baseline model.
lr, lr_eval = getEvalDf('lr', X_train, y_train, X_test, y_test, dataList)
svc, svc_eval = getEvalDf('svc', X_train, y_train, X_test, y_test, dataList)
rfc, rfc_eval = getEvalDf('rfc', X_train, y_train, X_test, y_test, dataList)
xgb, xgb_eval = getEvalDf('xgb', X_train, y_train, X_test, y_test, dataList)

# Label the rows with the data-set names (everything after the 'label' column name).
lr_eval.index = resampler_columns[1:]
svc_eval.index = resampler_columns[1:]
rfc_eval.index = resampler_columns[1:]
xgb_eval.index = resampler_columns[1:]

# Side-by-side comparison of the four models on the original (un-resampled) data:
# row 0 of each table, one row per model.
origin_df = pd.concat([lr_eval.iloc[0], svc_eval.iloc[0], rfc_eval.iloc[0], xgb_eval.iloc[0]], axis=1).transpose()
origin_df.index = ['LogisticRegression', 'SVC', 'RandomForest', 'XGB']

display('origin dataset', origin_df)
display('LogisticRegression', lr_eval)
display('SVC', svc_eval)
display('RandomForest', rfc_eval)
display('XGB', xgb_eval)

'resample_cnt'

Unnamed: 0,label,origin,rus,tomek,ros,smote,border-smote,adasyn
0,0,298,298,298,2089,2089,2089,2082
1,1,2089,298,2088,2089,2089,2089,2089


'origin dataset'

Unnamed: 0,accuracy,recall,precision,f1,rocauc,fpr
LogisticRegression,0.921273,1.0,0.917399,0.905007,0.686667,0.626667
SVC,0.99665,1.0,0.996183,0.99663,0.986667,0.026667
RandomForest,0.9933,1.0,0.992395,0.993221,0.973333,0.053333
XGB,0.98995,0.998084,0.990494,0.989831,0.965709,0.066667


'LogisticRegression'

Unnamed: 0,accuracy,recall,precision,f1,rocauc,fpr
origin,0.921273,1.0,0.917399,0.905007,0.686667,0.626667
rus,0.81407,0.814176,0.968109,0.839157,0.813755,0.186667
tomek,0.921273,1.0,0.917399,0.905007,0.686667,0.626667
ros,0.842546,0.837165,0.979821,0.86284,0.858582,0.12
smote,0.854271,0.854406,0.97593,0.871462,0.85387,0.146667
border-smote,0.924623,0.955939,0.957774,0.924837,0.831303,0.293333
adasyn,0.919598,0.948276,0.959302,0.920912,0.834138,0.28


'SVC'

Unnamed: 0,accuracy,recall,precision,f1,rocauc,fpr
origin,0.99665,1.0,0.996183,0.99663,0.986667,0.026667
rus,0.979899,0.978927,0.998047,0.98043,0.982797,0.013333
tomek,0.99665,1.0,0.996183,0.99663,0.986667,0.026667
ros,0.994975,0.996169,0.998081,0.994989,0.991418,0.013333
smote,0.99665,0.998084,0.998084,0.99665,0.992375,0.013333
border-smote,0.99665,0.998084,0.998084,0.99665,0.992375,0.013333
adasyn,0.994975,0.996169,0.998081,0.994989,0.991418,0.013333


'RandomForest'

Unnamed: 0,accuracy,recall,precision,f1,rocauc,fpr
origin,0.9933,1.0,0.992395,0.993221,0.973333,0.053333
rus,0.973199,0.971264,0.998031,0.974161,0.978966,0.013333
tomek,0.9933,1.0,0.992395,0.993221,0.973333,0.053333
ros,0.991625,1.0,0.990512,0.9915,0.966667,0.066667
smote,0.994975,1.0,0.994286,0.994931,0.98,0.04
border-smote,0.988275,0.996169,0.990476,0.988172,0.964751,0.066667
adasyn,0.991625,0.998084,0.992381,0.991551,0.972375,0.053333


'XGB'

Unnamed: 0,accuracy,recall,precision,f1,rocauc,fpr
origin,0.98995,0.998084,0.990494,0.989831,0.965709,0.066667
rus,0.964824,0.961686,0.998012,0.966475,0.974176,0.013333
tomek,0.994975,1.0,0.994286,0.994931,0.98,0.04
ros,0.998325,1.0,0.998088,0.99832,0.993333,0.013333
smote,0.9933,0.994253,0.998077,0.993337,0.99046,0.013333
border-smote,0.988275,0.992337,0.994242,0.988308,0.976169,0.04
adasyn,0.994975,0.998084,0.996176,0.99496,0.985709,0.026667


##### 모델의 강건함을 확인하기 위한 방법 중 하나로, 특정인의 real data가 없을 때도 데이터 예측을 잘 하는지 확인

In [9]:
# Robustness check: for each speaker, drop the rows whose fileName starts with
# '<speaker>-original', then scale, split and fit an SVC on what remains.
# NOTE(review): the dropped rows are never scored - each test split is drawn from
# the remaining rows - so this measures training without that speaker's original
# clips rather than prediction on an unseen speaker. Confirm that is the intent.
speakers = ['biden', 'linus', 'margot', 'musk', 'obama', 'ryan', 'taylor', 'trump']

# One filtered frame per speaker, in `speakers` order.
ex_df_list = [audio_df[~audio_df['fileName'].str.startswith(f'{speaker}-original')] for speaker in speakers]

# Keep the individual frame names available for any cell that refers to them.
(audio_df_ex_biden, audio_df_ex_linus, audio_df_ex_margot, audio_df_ex_musk,
 audio_df_ex_obama, audio_df_ex_ryan, audio_df_ex_taylor, audio_df_ex_trump) = ex_df_list

# Collect the one-row metric frames and concatenate once after the loop.
ex_eval_frames = []
for df in ex_df_list :
    # A fresh scaler is fitted on each filtered frame.
    _, ex_scaled_X = applyScaler(df)
    ex_X_train, ex_X_test, ex_y_train, ex_y_test = train_test_split(ex_scaled_X, df['label'], random_state=42, shuffle=True, test_size=0.2, stratify=df['label'])
    # Empty dataList: only the un-resampled training data is used; True prints the confusion matrix.
    _, eval = getEvalDf('svc', ex_X_train, ex_y_train, ex_X_test, ex_y_test, [], True)
    ex_eval_frames.append(eval)

result_ex_df = pd.concat(ex_eval_frames, axis=0)
result_ex_df.index = ['ex_' + speaker for speaker in speakers]
display(result_ex_df)

[[ 60   3]
 [  0 522]]
[[ 60   3]
 [  0 523]]
[[ 71   2]
 [  0 523]]
[[ 61   2]
 [  0 522]]
[[ 62   1]
 [  0 522]]
[[ 72   1]
 [  0 522]]
[[ 61   2]
 [  1 521]]
[[ 61   2]
 [  0 522]]


Unnamed: 0,accuracy,recall,precision,f1,rocauc,fpr
ex_biden,0.994872,1.0,0.994286,0.994817,0.97619,0.047619
ex_linus,0.994881,1.0,0.994297,0.994825,0.97619,0.047619
ex_margot,0.996644,1.0,0.99619,0.996624,0.986301,0.027397
ex_musk,0.996581,1.0,0.996183,0.996557,0.984127,0.031746
ex_obama,0.998291,1.0,0.998088,0.998285,0.992063,0.015873
ex_ryan,0.998319,1.0,0.998088,0.998314,0.993151,0.013699
ex_taylor,0.994872,0.998084,0.996176,0.994854,0.983169,0.031746
ex_trump,0.996581,1.0,0.996183,0.996557,0.984127,0.031746


### Adversarial examples, data augmentation
- baseline은 전체 데이터로 테스트한 svc 모델을 사용
- perturbation이 추가된 dataset이 오분류되는 케이스에 대해 테스트
- Audio augmentation for speech recognition [Ko et al., 2015] : VTLP, tempo, speed perturbation을 통한 증강

In [10]:
def getAugmentEvalDf(model, X_test, y_test, cmView=False) :
    """Score an already-fitted model on a test set.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_test, y_test
        Features to predict on and the labels to score against. Labels are
        expected to be 0/1.
    cmView : bool, default False
        When true, print the 2x2 confusion matrix (rows: actual 0/1,
        columns: predicted 0/1).

    Returns
    -------
    pandas.DataFrame
        One row with the columns accuracy, recall, precision, f1, rocauc and fpr.

    Notes
    -----
    * recall and precision are called with their default arguments, while f1 is
      computed with average='weighted'.
    * rocauc is computed from the hard 0/1 predictions, not from probability
      scores, so it describes a single operating point.
    * The confusion matrix is pinned to labels [0, 1] so it is always 2x2, even
      when the test labels and predictions contain a single class. fpr is NaN
      when there are no actual negatives; rocauc is NaN when either class is
      missing from ``y_test``.
    """
    pred = model.predict(X_test)

    # labels=[0, 1] keeps the matrix 2x2 so the four-way unpack cannot fail.
    cm = confusion_matrix(y_test, pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    negatives = fp + tn
    positives = fn + tp

    # Share of actual negatives (label 0) predicted as positive (label 1); lower is better.
    fpr = fp / negatives if negatives else float('nan')

    # ROC AUC is undefined unless both classes occur in y_test.
    if negatives and positives :
        rocauc = roc_auc_score(y_test, pred, average='weighted')
    else :
        rocauc = float('nan')

    eval_df = pd.DataFrame({
        'accuracy' : [accuracy_score(y_test, pred)],
        'recall' : [recall_score(y_test, pred)],
        'precision' : [precision_score(y_test, pred)],
        'f1' : [f1_score(y_test, pred, average='weighted')],
        'rocauc' : [rocauc],
        'fpr' : [fpr]
    })
    if (cmView) :
        print(cm)

    return eval_df

def simulateSameIndex(df, scaled_X, y_test, model, cmView=False) :
    """Score ``model`` on the rows of another data set that share ``y_test``'s index.

    Only the index of ``y_test`` is used: it selects rows of ``scaled_X`` and
    the matching 'label' values of ``df``; the values of ``y_test`` are ignored.

    NOTE(review): ``scaled_X`` is indexed by position while ``df`` is indexed by
    label, so this presumably relies on ``df`` having a default 0..n-1 index and
    the same row order as the data ``y_test`` was split from - confirm.

    Returns the one-row metrics DataFrame produced by ``getAugmentEvalDf``.
    """
    rows = y_test.index
    same_rows_X = scaled_X[rows]
    same_rows_y = df.loc[rows, 'label']
    return getAugmentEvalDf(model, same_rows_X, same_rows_y, cmView)

def simulateRandomIndex(df, scaled_X, model, cmView=False) :
    """Score ``model`` on a stratified 20% hold-out drawn from another data set.

    ``scaled_X`` and ``df['label']`` are split with a fixed random_state of 42;
    only the test part is used - the model is not refitted. The label counts of
    the hold-out are printed before scoring.

    Returns the one-row metrics DataFrame produced by ``getAugmentEvalDf``.
    """
    parts = train_test_split(scaled_X, df['label'], random_state=42, shuffle=True, test_size=0.2, stratify=df['label'])
    # train_test_split returns (X_train, X_test, y_train, y_test).
    holdout_X, holdout_y = parts[1], parts[3]
    print(holdout_y.value_counts())
    return getAugmentEvalDf(model, holdout_X, holdout_y, cmView)

In [11]:
# VTLP (Vocal Tract Length Perturbation): a technique that warps pronunciation and
# resonance frequencies when generating speech, to produce more natural-sounding audio.
# The perturbation factor is drawn at random between 1 and 1.003.
audio_vtlp_df = pd.read_csv(csv_path+'feature_extracting_vtlp.csv')
# Scale with the scaler fitted on the baseline data so the features stay comparable.
scaled_vtlp_X = scaler.transform(audio_vtlp_df.drop(['fileName', 'label'], axis=1))

# Reference point: the baseline SVC on the unperturbed test set.
origin_svc_df = getAugmentEvalDf(svc, X_test, y_test, True)
display(origin_svc_df)

# Result on a random hold-out of the VTLP data set -> the share of real clips classified
# as fake rises (FPR 0.02 -> 0.21), and ROC and F1 drop overall.
display(simulateRandomIndex(audio_vtlp_df, scaled_vtlp_X, svc, True))

# Same rows as the baseline test set, taken from the VTLP-perturbed data
# (identical to the random hold-out result above).
display(simulateSameIndex(audio_vtlp_df, scaled_vtlp_X, y_test, svc, True))

[[ 73   2]
 [  0 522]]


Unnamed: 0,accuracy,recall,precision,f1,rocauc,fpr
0,0.99665,1.0,0.996183,0.99663,0.986667,0.026667


label
1    522
0     75
Name: count, dtype: int64
[[ 59  16]
 [  1 521]]


Unnamed: 0,accuracy,recall,precision,f1,rocauc,fpr
0,0.971524,0.998084,0.970205,0.970144,0.892375,0.213333


[[ 59  16]
 [  1 521]]


Unnamed: 0,accuracy,recall,precision,f1,rocauc,fpr
0,0.971524,0.998084,0.970205,0.970144,0.892375,0.213333


In [12]:
# White noise: evaluate the baseline SVC on features extracted from noise-added audio.
audio_w_noise_df = pd.read_csv(csv_path+'feature_extracting_w_noise.csv')
# Scale with the scaler fitted on the baseline data so the features stay comparable.
scaled_w_noise_X = scaler.transform(audio_w_noise_df.drop(['fileName', 'label'], axis=1))

# Reference point: the baseline SVC on the unperturbed test set.
origin_svc_df = getAugmentEvalDf(svc, X_test, y_test, True)
display(origin_svc_df)

# Result on a random hold-out of the white-noise data set -> the share of real clips
# classified as fake rises (FPR 0.02 -> 0.72), and ROC and F1 drop overall.
display(simulateRandomIndex(audio_w_noise_df, scaled_w_noise_X, svc, True))

# Same rows as the baseline test set, taken from the white-noise data (disabled).
# display(simulateSameIndex(audio_w_noise_df, scaled_w_noise_X, y_test, svc, True))

[[ 73   2]
 [  0 522]]


Unnamed: 0,accuracy,recall,precision,f1,rocauc,fpr
0,0.99665,1.0,0.996183,0.99663,0.986667,0.026667


label
1    522
0     75
Name: count, dtype: int64
[[ 21  54]
 [  0 522]]


Unnamed: 0,accuracy,recall,precision,f1,rocauc,fpr
0,0.909548,1.0,0.90625,0.886332,0.64,0.72


In [13]:
# Masking: evaluate the baseline SVC on features extracted from audio with a masked segment.
audio_mask_df = pd.read_csv(csv_path+'feature_extracting_mask.csv')
# Scale with the scaler fitted on the baseline data so the features stay comparable.
scaled_mask_X = scaler.transform(audio_mask_df.drop(['fileName', 'label'], axis=1))

# Reference point: the baseline SVC on the unperturbed test set.
origin_svc_df = getAugmentEvalDf(svc, X_test, y_test, True)
display(origin_svc_df)

# Random hold-out of the mask data set with a 0.2 s mask -> same result as the baseline.
display(simulateRandomIndex(audio_mask_df, scaled_mask_X, svc, True))

# 1-second mask -> FPR 0.02 -> 0.2.
# The two variables below are rebound to the 1-second data from here on.
audio_mask_df = pd.read_csv(csv_path+'feature_extracting_mask_1sec.csv')
scaled_mask_X = scaler.transform(audio_mask_df.drop(['fileName', 'label'], axis=1))
display(simulateRandomIndex(audio_mask_df, scaled_mask_X, svc, True))

[[ 73   2]
 [  0 522]]


Unnamed: 0,accuracy,recall,precision,f1,rocauc,fpr
0,0.99665,1.0,0.996183,0.99663,0.986667,0.026667


label
1    522
0     75
Name: count, dtype: int64
[[ 73   2]
 [  0 522]]


Unnamed: 0,accuracy,recall,precision,f1,rocauc,fpr
0,0.99665,1.0,0.996183,0.99663,0.986667,0.026667


label
1    522
0     75
Name: count, dtype: int64
[[ 60  15]
 [  0 522]]


Unnamed: 0,accuracy,recall,precision,f1,rocauc,fpr
0,0.974874,1.0,0.972067,0.973656,0.9,0.2


In [14]:
# Speed perturbation: evaluate the baseline SVC on sped-up and slowed-down audio.
# NOTE(review): 'feature_extracting_speed.csv' is loaded and scaled here but both
# variables are overwritten below before being evaluated - confirm whether an
# evaluation of this file is missing or the load can be removed.
audio_speed_df = pd.read_csv(csv_path+'feature_extracting_speed.csv')
scaled_speed_X = scaler.transform(audio_speed_df.drop(['fileName', 'label'], axis=1))

# Reference point: the baseline SVC on the unperturbed test set.
origin_svc_df = getAugmentEvalDf(svc, X_test, y_test, True)
display(origin_svc_df)

# Speed up (x1.1): FPR 0.02 -> 0.04; in the recorded output precision drops
# slightly (0.996 -> 0.994).
audio_speed_df = pd.read_csv(csv_path+'feature_extracting_speed_1.csv')
scaled_speed_X = scaler.transform(audio_speed_df.drop(['fileName', 'label'], axis=1))
display(simulateRandomIndex(audio_speed_df, scaled_speed_X, svc, True))

# Speed down (x0.9): FPR 0.02 -> 0.04 in the recorded output.
audio_speed_df = pd.read_csv(csv_path+'feature_extracting_speed_09.csv')
scaled_speed_X = scaler.transform(audio_speed_df.drop(['fileName', 'label'], axis=1))
display(simulateRandomIndex(audio_speed_df, scaled_speed_X, svc, True))

[[ 73   2]
 [  0 522]]


Unnamed: 0,accuracy,recall,precision,f1,rocauc,fpr
0,0.99665,1.0,0.996183,0.99663,0.986667,0.026667


label
1    522
0     75
Name: count, dtype: int64
[[ 72   3]
 [  3 519]]


Unnamed: 0,accuracy,recall,precision,f1,rocauc,fpr
0,0.98995,0.994253,0.994253,0.98995,0.977126,0.04


label
1    522
0     75
Name: count, dtype: int64
[[ 72   3]
 [  2 520]]


Unnamed: 0,accuracy,recall,precision,f1,rocauc,fpr
0,0.991625,0.996169,0.994264,0.991601,0.978084,0.04


In [15]:
# Normalization: evaluate the baseline SVC on features extracted from normalized audio.
audio_normalize_df = pd.read_csv(csv_path+'feature_extracting_normalize.csv')
# Scale with the scaler fitted on the baseline data so the features stay comparable.
scaled_normalize_X = scaler.transform(audio_normalize_df.drop(['fileName', 'label'], axis=1))

# Reference point: the baseline SVC on the unperturbed test set.
origin_svc_df = getAugmentEvalDf(svc, X_test, y_test, True)
display(origin_svc_df)

# The share of real clips classified as fake rises (FPR 0.02 -> 0.61), and ROC and
# F1 drop overall.
display(simulateRandomIndex(audio_normalize_df, scaled_normalize_X, svc, True))

[[ 73   2]
 [  0 522]]


Unnamed: 0,accuracy,recall,precision,f1,rocauc,fpr
0,0.99665,1.0,0.996183,0.99663,0.986667,0.026667


label
1    522
0     75
Name: count, dtype: int64
[[ 29  46]
 [  4 518]]


Unnamed: 0,accuracy,recall,precision,f1,rocauc,fpr
0,0.916248,0.992337,0.91844,0.901582,0.689502,0.613333


In [16]:
# Retrain and re-test after augmentation: pool every 'feature_extracting*.csv'
# file (the baseline table plus all perturbed variants) into one data set.
csv_path = './datasets/csv/'

# sorted(): os.listdir returns entries in arbitrary order; a fixed file order keeps
# the row order of total_df - and therefore the seeded split below - reproducible.
file_list = sorted(f for f in os.listdir(csv_path) if f.startswith('feature_extracting') and f.endswith('.csv'))
dfs = [pd.read_csv(os.path.join(csv_path, file)) for file in file_list]
total_df = pd.concat(dfs, ignore_index=True)

# A new scaler is fitted on the pooled data; the baseline `scaler` is left untouched.
scaler_aug, scaled_aug_X = applyScaler(total_df)

# NOTE(review): the split is row-wise, so perturbed copies of one source clip can
# presumably land in both the train and the test part - confirm whether a split
# grouped by source clip is needed for a fair estimate.
X_aug_train, X_aug_test, y_aug_train, y_aug_test = train_test_split(scaled_aug_X, total_df['label'], random_state=42, shuffle=True, test_size=0.2, stratify=total_df['label'])
print(y_aug_train.value_counts())
print(y_aug_test.value_counts())

# Empty dataList: no resampling, only the pooled training data; True prints the confusion matrix.
svc_aug, eval = getEvalDf('svc', X_aug_train, y_aug_train, X_aug_test, y_aug_test, [], True)
display(eval)

label
1    22993
0     3289
Name: count, dtype: int64
label
1    5749
0     822
Name: count, dtype: int64
[[ 792   30]
 [   2 5747]]


Unnamed: 0,accuracy,recall,precision,f1,rocauc,fpr
0,0.99513,0.999652,0.994807,0.995094,0.981578,0.036496
