In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os

import numpy as np
import pandas as pd
import cv2

from sklearn.decomposition import PCA

In [4]:
mapping_dict = {
    'n02058221': 'albatross',
    'n02130308': 'cheetah',
    'n01518878': 'ostrich',
    'n02056570': 'penguin',
    'n02391049': 'zebra'
}

# 1. 画像のリサイズ (128 × 128) にする

In [29]:
workdir_path = "./../../imagenet"
img_dir_names = ['n02056570', 'n01518878', 'n02058221', 'n02130308', 'n02391049']

for img_dir_name in img_dir_names:
    target_dir_name = img_dir_name
    target_dir_path = os.path.join(workdir_path, target_dir_name)
    file_names = os.listdir(target_dir_path)

    output_dir = "./../../imagenet_resized/"
    output_dir_path  = os.path.join(output_dir, target_dir_name)

    if not os.path.exists(output_dir_path):
        os.makedirs(output_dir_path)

    for file_name in file_names:
        image = cv2.imread(os.path.join(target_dir_path, file_name))
        resized_image = cv2.resize(image, (128, 128))
        
        output_file_path = os.path.join(output_dir_path, file_name)

        cv2.imwrite(output_file_path, resized_image, [int(cv2.IMWRITE_JPEG_QUALITY), 95])

# 2. PCA

In [5]:
workdir_path = "./../../imagenet_resized"
img_dir_names = ['n02056570', 'n01518878', 'n02058221', 'n02130308', 'n02391049']

output_dir = "./../../imagenet_reconstructed_2"

compressed_dict = {}

for img_dir_name in img_dir_names:
    target_dir_name = img_dir_name
    target_dir_path = os.path.join(workdir_path, target_dir_name)
    file_names = os.listdir(target_dir_path)

    output_dir_path = os.path.join(output_dir, target_dir_name)
    if not os.path.exists(output_dir_path):
        os.makedirs(output_dir_path)

    flat_images = []
    
    for file_name in file_names:
        image = cv2.imread(os.path.join(target_dir_path, file_name))
        flat_image = image.flatten()

        flat_images.append(flat_image)

    flat_images = np.array(flat_images)

    normalized_flat_images = flat_images / 255

    pca = PCA(n_components=200)
    compressed_images = pca.fit_transform(normalized_flat_images)

    compressed_dict[img_dir_name] = compressed_images

    reconstructed_images = pca.inverse_transform(compressed_images) * 255

    for file_name, reconstructed_image in zip(file_names, reconstructed_images):
        reconstructed_image = reconstructed_image.reshape((128, 128, 3))
        output_file_path = os.path.join(output_dir_path, file_name)
        cv2.imwrite(output_file_path, reconstructed_image)


In [6]:
compressed_dict

{'n02056570': array([[-2.26824002e+01,  8.74566847e+00, -9.39514483e+00, ...,
          5.40232483e-01, -1.47043610e+00, -4.06881755e-01],
        [ 7.84656418e+00, -2.05139262e+01, -7.58031151e+00, ...,
         -1.15239214e-01,  2.13302040e-01,  2.36721771e-01],
        [ 1.35986908e+00,  1.45490540e+01,  7.90974437e+00, ...,
         -6.30269375e-01, -3.01994530e-01, -5.63315817e-01],
        ...,
        [-1.47346730e+01, -2.70280051e+01, -3.26901110e-01, ...,
         -3.92618535e-01, -2.22508181e-02,  7.16504980e-02],
        [-1.16020744e+01,  5.80189916e+00, -4.73568503e+00, ...,
          9.65237695e-01, -1.49072470e+00, -6.70109038e-01],
        [-2.84018517e+01, -7.32680213e+00, -2.82159285e+01, ...,
         -1.32019267e+00, -3.69840977e-01, -6.08766889e-01]]),
 'n01518878': array([[ 1.05052787e+01, -2.45230692e+01, -1.17290311e+00, ...,
          1.04928847e+00,  1.71197236e+00, -1.45219391e+00],
        [ 9.54989575e-01, -1.55889151e+01, -5.64398977e+00, ...,
          1.

In [7]:
for img_dir_name in img_dir_names:
    print(compressed_dict[img_dir_name].shape)
    df_compressed_images = pd.DataFrame(compressed_dict[img_dir_name])
    
    animal_name = mapping_dict[img_dir_name]
    df_compressed_images.insert(0, 'animal', animal_name)

    save_path = os.path.join('./winston_pca_2', img_dir_name + '.csv')
    df_compressed_images.to_csv(save_path)

(1300, 200)
(1300, 200)
(1300, 200)
(1300, 200)
(1300, 200)


# 3. SVM

In [8]:
file_name_list = os.listdir("./winston_pca_2/")
file_path_list = [os.path.join("./winston_pca_2", file_name) for file_name in file_name_list]
file_path_list

['./winston_pca_2/n02130308.csv',
 './winston_pca_2/n02056570.csv',
 './winston_pca_2/n02391049.csv',
 './winston_pca_2/n02058221.csv',
 './winston_pca_2/n01518878.csv']

In [9]:
dfs = [pd.read_csv(file, index_col=0) for file in file_path_list]
df = pd.concat(dfs, ignore_index=True)

df

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,cheetah,-20.060238,-11.480357,15.623350,-6.305073,-6.696852,4.830261,-5.379857,2.432294,9.030500,...,0.125542,-0.080214,0.926857,-0.314778,-0.374476,-0.507503,0.318324,-0.128268,-0.045080,-0.206593
1,cheetah,-48.204088,18.591528,4.103486,2.440145,-10.566893,-14.992575,-2.351199,9.304754,-7.887125,...,-1.832162,-0.673048,1.983817,2.164188,0.928383,1.989706,0.480698,-1.720412,0.499422,-4.106438
2,cheetah,-13.188533,-23.096405,2.572728,-3.706356,2.869635,-3.104971,3.141484,1.542608,5.318094,...,-0.348495,-0.930289,0.061565,0.067395,-0.050295,-0.080876,0.121771,-0.493991,-0.457423,-0.195045
3,cheetah,-20.250505,-9.332849,-3.731214,-2.956475,-4.907211,-2.487805,-10.028531,1.898730,1.671651,...,-0.171502,-0.323352,0.199332,0.127779,-0.849878,-0.229659,-0.313232,0.213185,-1.336118,-0.310712
4,cheetah,-20.059971,-1.582472,-7.138032,-11.185771,18.417116,-1.680940,-6.145799,-0.107130,3.890427,...,1.681730,0.600582,2.152980,-2.025689,-0.199217,-0.165662,-0.718164,2.490273,0.523311,1.913032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6495,ostrich,31.291656,6.482612,-11.086591,-10.524747,4.290951,6.700974,21.181147,-8.942196,5.719353,...,-0.332219,-0.871387,-0.577859,-0.421761,0.225885,-0.944522,0.130902,-0.473095,0.188853,-0.326705
6496,ostrich,-6.391303,-17.068252,-1.496960,-14.264796,3.159807,-2.084721,8.642888,-2.086789,-8.758540,...,-2.444848,-1.743830,-1.234302,-1.398103,-3.151357,0.797208,-0.660319,0.189967,0.322385,-0.457151
6497,ostrich,-33.814540,10.456383,15.762074,6.935928,-1.560411,-3.322494,-3.718956,10.833294,6.694091,...,0.151620,0.986486,-1.042699,0.025971,-0.971935,-0.679953,0.292943,0.210233,0.598297,-0.516183
6498,ostrich,34.769526,21.752479,-3.132908,5.004080,3.544195,14.021153,-4.892887,1.884487,3.299574,...,-0.115667,-0.087014,0.304643,0.442750,0.269512,-0.045713,0.555787,-0.068632,0.019545,-0.239788


In [10]:
import pandas as pd
import numpy as np

def random_sample_by_label(df, label_column='animal', sample_size=10):
    # 各ラベル毎に指定の個数をランダムに抜き出す
    sampled_dataframes = []
    unique_labels = df[label_column].unique()

    for label in unique_labels:
        label_data = df[df[label_column] == label]
        sample_data = label_data.sample(n=sample_size, random_state=42) 
        sampled_dataframes.append(sample_data)

    # 抜き出したデータを結合
    result_df = pd.concat(sampled_dataframes, ignore_index=True)

    return result_df


original_df = df.copy()  
sampled_df = random_sample_by_label(original_df, sample_size=50) 

sampled_df

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,cheetah,15.402662,-3.807138,3.632292,32.764571,-5.664949,2.876672,3.040756,-8.643766,7.808112,...,-4.031143,0.502019,0.630617,0.088934,-0.005006,2.169094,-1.383973,-0.150240,-2.225031,-0.375203
1,cheetah,-50.593476,-7.922209,-11.033482,6.115892,2.230875,-2.827979,-8.589891,3.639380,-0.459745,...,0.152746,0.734783,0.300104,-0.220096,-0.999787,-0.102783,-0.280356,-0.086377,-0.098555,-0.002831
2,cheetah,-10.398780,49.083416,12.765650,3.406599,2.639825,-1.695449,-15.687951,-9.690763,-6.096559,...,-0.634467,-0.049114,-0.563111,0.451740,0.678812,0.386950,-0.803237,-0.082105,-0.050870,-0.782950
3,cheetah,-0.934629,-2.057008,-0.887415,-7.865467,-7.703866,-2.916751,1.805643,-4.821153,0.240899,...,-0.383386,0.378471,1.924444,1.428540,0.363502,0.191559,-1.472842,-2.003535,0.022197,-0.438671
4,cheetah,10.387551,2.727248,0.374913,3.122029,-1.818549,-7.941755,-2.613647,-8.021543,-0.653920,...,-1.009989,-1.500397,-0.375630,-1.125995,-0.887603,1.666718,-0.122017,-0.161629,0.449841,0.364122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,ostrich,-14.085187,6.946688,4.236949,1.523434,23.357961,0.750909,10.241343,-6.452612,-14.922543,...,-0.341839,-0.337619,-2.224325,3.161991,-0.406111,2.493280,0.786014,-0.002902,-1.173549,2.579695
246,ostrich,26.686927,10.615728,-5.212343,5.190202,-9.727586,-5.055615,7.891977,-13.078991,-1.556410,...,-0.876674,-0.418941,2.241611,0.286563,-0.643628,0.293206,0.389215,0.799776,0.129429,1.047317
247,ostrich,32.134951,33.172245,11.132469,15.298142,0.112820,13.789996,-13.055686,-19.346945,-7.756368,...,-0.039271,-0.729182,2.103718,0.082758,-1.642207,-1.164854,1.543173,0.948596,0.549975,-0.310706
248,ostrich,23.693990,-3.812301,-1.630135,-9.871452,16.190618,-9.354711,-5.674912,-4.960369,-2.180019,...,-0.743257,-1.045149,0.099882,1.351653,0.288197,-0.038924,0.322326,-0.830426,0.728247,-0.089177


## 各ラベルごとのデータ数が 50

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# データを読み込む
data = sampled_df.copy()

# 特徴量とラベルを分ける
X = data.drop('animal', axis=1)
y = data['animal']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVMモデルの作成と学習
svm_model = SVC(kernel='linear')  # カーネルは適宜変更可能
svm_model.fit(X_train, y_train)

# テストデータでの予測
y_pred = svm_model.predict(X_test)

# 分類性能の評価
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [12]:
accuracy

0.16

In [13]:
print(classification_rep)

              precision    recall  f1-score   support

   albatross       0.08      0.09      0.08        11
     cheetah       0.21      0.23      0.22        13
     ostrich       0.22      0.22      0.22         9
     penguin       0.00      0.00      0.00         7
       zebra       0.29      0.20      0.24        10

    accuracy                           0.16        50
   macro avg       0.16      0.15      0.15        50
weighted avg       0.17      0.16      0.16        50



## 各ラベルごとのデータ数が 100

In [14]:
original_df = df.copy()  
sampled_df = random_sample_by_label(original_df, sample_size=100) 

sampled_df

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,cheetah,15.402662,-3.807138,3.632292,32.764571,-5.664949,2.876672,3.040756,-8.643766,7.808112,...,-4.031143,0.502019,0.630617,0.088934,-0.005006,2.169094,-1.383973,-0.150240,-2.225031,-0.375203
1,cheetah,-50.593476,-7.922209,-11.033482,6.115892,2.230875,-2.827979,-8.589891,3.639380,-0.459745,...,0.152746,0.734783,0.300104,-0.220096,-0.999787,-0.102783,-0.280356,-0.086377,-0.098555,-0.002831
2,cheetah,-10.398780,49.083416,12.765650,3.406599,2.639825,-1.695449,-15.687951,-9.690763,-6.096559,...,-0.634467,-0.049114,-0.563111,0.451740,0.678812,0.386950,-0.803237,-0.082105,-0.050870,-0.782950
3,cheetah,-0.934629,-2.057008,-0.887415,-7.865467,-7.703866,-2.916751,1.805643,-4.821153,0.240899,...,-0.383386,0.378471,1.924444,1.428540,0.363502,0.191559,-1.472842,-2.003535,0.022197,-0.438671
4,cheetah,10.387551,2.727248,0.374913,3.122029,-1.818549,-7.941755,-2.613647,-8.021543,-0.653920,...,-1.009989,-1.500397,-0.375630,-1.125995,-0.887603,1.666718,-0.122017,-0.161629,0.449841,0.364122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,ostrich,-12.482797,-30.933762,-6.013157,9.861464,1.896594,4.754647,3.501164,-2.381094,10.091673,...,-0.906364,-2.225252,-0.987800,-1.144777,0.589565,0.723704,1.193361,-2.327586,0.086581,-1.036532
496,ostrich,-13.566370,-53.437579,-14.962884,6.394149,3.063816,5.868486,-4.599914,-0.177140,5.667886,...,0.103600,3.507079,0.674486,-0.560502,-0.615897,-0.449020,-2.117607,1.454144,-2.226385,-1.337848
497,ostrich,21.314886,-3.036497,-4.751903,7.962252,-0.980617,-23.659995,9.136151,8.320628,6.658086,...,0.149194,2.361382,-1.682807,1.600437,-0.517845,0.359536,0.098433,0.886150,-0.819348,-0.792955
498,ostrich,-12.458502,-7.971079,6.026540,-12.469348,-9.048829,10.572471,8.400410,6.724181,2.581764,...,2.769564,0.445095,0.435890,0.378302,-0.401645,1.270537,1.580416,-2.985054,-1.197222,-4.755871


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# データを読み込む
data = sampled_df.copy()

# 特徴量とラベルを分ける
X = data.drop('animal', axis=1)
y = data['animal']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVMモデルの作成と学習
svm_model = SVC(kernel='linear')  # カーネルは適宜変更可能
svm_model.fit(X_train, y_train)

# テストデータでの予測
y_pred = svm_model.predict(X_test)

# 分類性能の評価
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [16]:
print(accuracy)
print()
print(classification_rep)

0.18

              precision    recall  f1-score   support

   albatross       0.21      0.29      0.24        24
     cheetah       0.26      0.18      0.21        28
     ostrich       0.22      0.17      0.19        24
     penguin       0.00      0.00      0.00        14
       zebra       0.12      0.20      0.15        10

    accuracy                           0.18       100
   macro avg       0.16      0.17      0.16       100
weighted avg       0.19      0.18      0.18       100



### kernel='rbf'

In [17]:
original_df = df.copy()  
sampled_df = random_sample_by_label(original_df, sample_size=100) 

sampled_df

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,cheetah,15.402662,-3.807138,3.632292,32.764571,-5.664949,2.876672,3.040756,-8.643766,7.808112,...,-4.031143,0.502019,0.630617,0.088934,-0.005006,2.169094,-1.383973,-0.150240,-2.225031,-0.375203
1,cheetah,-50.593476,-7.922209,-11.033482,6.115892,2.230875,-2.827979,-8.589891,3.639380,-0.459745,...,0.152746,0.734783,0.300104,-0.220096,-0.999787,-0.102783,-0.280356,-0.086377,-0.098555,-0.002831
2,cheetah,-10.398780,49.083416,12.765650,3.406599,2.639825,-1.695449,-15.687951,-9.690763,-6.096559,...,-0.634467,-0.049114,-0.563111,0.451740,0.678812,0.386950,-0.803237,-0.082105,-0.050870,-0.782950
3,cheetah,-0.934629,-2.057008,-0.887415,-7.865467,-7.703866,-2.916751,1.805643,-4.821153,0.240899,...,-0.383386,0.378471,1.924444,1.428540,0.363502,0.191559,-1.472842,-2.003535,0.022197,-0.438671
4,cheetah,10.387551,2.727248,0.374913,3.122029,-1.818549,-7.941755,-2.613647,-8.021543,-0.653920,...,-1.009989,-1.500397,-0.375630,-1.125995,-0.887603,1.666718,-0.122017,-0.161629,0.449841,0.364122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,ostrich,-12.482797,-30.933762,-6.013157,9.861464,1.896594,4.754647,3.501164,-2.381094,10.091673,...,-0.906364,-2.225252,-0.987800,-1.144777,0.589565,0.723704,1.193361,-2.327586,0.086581,-1.036532
496,ostrich,-13.566370,-53.437579,-14.962884,6.394149,3.063816,5.868486,-4.599914,-0.177140,5.667886,...,0.103600,3.507079,0.674486,-0.560502,-0.615897,-0.449020,-2.117607,1.454144,-2.226385,-1.337848
497,ostrich,21.314886,-3.036497,-4.751903,7.962252,-0.980617,-23.659995,9.136151,8.320628,6.658086,...,0.149194,2.361382,-1.682807,1.600437,-0.517845,0.359536,0.098433,0.886150,-0.819348,-0.792955
498,ostrich,-12.458502,-7.971079,6.026540,-12.469348,-9.048829,10.572471,8.400410,6.724181,2.581764,...,2.769564,0.445095,0.435890,0.378302,-0.401645,1.270537,1.580416,-2.985054,-1.197222,-4.755871


In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# データを読み込む
data = sampled_df.copy()

# 特徴量とラベルを分ける
X = data.drop('animal', axis=1)
y = data['animal']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVMモデルの作成と学習
svm_model = SVC(kernel='rbf')  # カーネルは適宜変更可能
svm_model.fit(X_train, y_train)

# テストデータでの予測
y_pred = svm_model.predict(X_test)

# 分類性能の評価
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [19]:
print(accuracy)
print()
print(classification_rep)

0.25

              precision    recall  f1-score   support

   albatross       0.53      0.42      0.47        24
     cheetah       0.33      0.29      0.31        28
     ostrich       0.20      0.04      0.07        24
     penguin       0.18      0.29      0.22        14
       zebra       0.07      0.20      0.10        10

    accuracy                           0.25       100
   macro avg       0.26      0.25      0.23       100
weighted avg       0.30      0.25      0.26       100



## フルデータ (各ラベルごとのデータ数が 1300)

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# データを読み込む
data = df.copy()

# 特徴量とラベルを分ける
X = data.drop('animal', axis=1)
y = data['animal']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVMモデルの作成と学習
svm_model = SVC(kernel='linear')  # カーネルは適宜変更可能
svm_model.fit(X_train, y_train)

# テストデータでの予測
y_pred = svm_model.predict(X_test)

# 分類性能の評価
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [None]:
# 結果の表示
print(f'Accuracy: {accuracy}')

In [None]:
print('Classification Report:')
print(classification_rep)

In [None]:
mapping_dict

# 4. SVM (ラベル数を絞る)

In [20]:
file_name_list = os.listdir("./winston_pca_2/")
file_path_list = [os.path.join("./winston_pca_2", file_name) for file_name in file_name_list]
file_path_list  

dfs = [pd.read_csv(file, index_col=0) for file in file_path_list]
df = pd.concat(dfs, ignore_index=True)

df

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,cheetah,-20.060238,-11.480357,15.623350,-6.305073,-6.696852,4.830261,-5.379857,2.432294,9.030500,...,0.125542,-0.080214,0.926857,-0.314778,-0.374476,-0.507503,0.318324,-0.128268,-0.045080,-0.206593
1,cheetah,-48.204088,18.591528,4.103486,2.440145,-10.566893,-14.992575,-2.351199,9.304754,-7.887125,...,-1.832162,-0.673048,1.983817,2.164188,0.928383,1.989706,0.480698,-1.720412,0.499422,-4.106438
2,cheetah,-13.188533,-23.096405,2.572728,-3.706356,2.869635,-3.104971,3.141484,1.542608,5.318094,...,-0.348495,-0.930289,0.061565,0.067395,-0.050295,-0.080876,0.121771,-0.493991,-0.457423,-0.195045
3,cheetah,-20.250505,-9.332849,-3.731214,-2.956475,-4.907211,-2.487805,-10.028531,1.898730,1.671651,...,-0.171502,-0.323352,0.199332,0.127779,-0.849878,-0.229659,-0.313232,0.213185,-1.336118,-0.310712
4,cheetah,-20.059971,-1.582472,-7.138032,-11.185771,18.417116,-1.680940,-6.145799,-0.107130,3.890427,...,1.681730,0.600582,2.152980,-2.025689,-0.199217,-0.165662,-0.718164,2.490273,0.523311,1.913032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6495,ostrich,31.291656,6.482612,-11.086591,-10.524747,4.290951,6.700974,21.181147,-8.942196,5.719353,...,-0.332219,-0.871387,-0.577859,-0.421761,0.225885,-0.944522,0.130902,-0.473095,0.188853,-0.326705
6496,ostrich,-6.391303,-17.068252,-1.496960,-14.264796,3.159807,-2.084721,8.642888,-2.086789,-8.758540,...,-2.444848,-1.743830,-1.234302,-1.398103,-3.151357,0.797208,-0.660319,0.189967,0.322385,-0.457151
6497,ostrich,-33.814540,10.456383,15.762074,6.935928,-1.560411,-3.322494,-3.718956,10.833294,6.694091,...,0.151620,0.986486,-1.042699,0.025971,-0.971935,-0.679953,0.292943,0.210233,0.598297,-0.516183
6498,ostrich,34.769526,21.752479,-3.132908,5.004080,3.544195,14.021153,-4.892887,1.884487,3.299574,...,-0.115667,-0.087014,0.304643,0.442750,0.269512,-0.045713,0.555787,-0.068632,0.019545,-0.239788


In [21]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
count,6500.0,6500.0,6500.0,6500.0,6500.0,6500.0,6500.0,6500.0,6500.0,6500.0,...,6500.0,6500.0,6500.0,6500.0,6500.0,6500.0,6500.0,6500.0,6500.0,6500.0
mean,-3.069545e-14,-6.471405e-16,-4.057746e-15,-1.696557e-15,6.281198e-15,1.661577e-16,-6.104109e-15,-6.996113e-17,-6.777485e-17,-1.464811e-16,...,-3.825999e-16,2.355722e-16,3.640165e-16,2.672734e-16,-5.299009e-16,1.902068e-16,-4.607596e-16,-1.5304e-16,1.475743e-16,7.105427000000001e-17
std,26.38975,16.29448,11.29991,9.829684,9.306925,8.123177,7.388529,6.927331,6.613067,6.021658,...,1.21583,1.210626,1.204723,1.202446,1.197292,1.193258,1.18883,1.183978,1.178906,1.174573
min,-82.71644,-58.75053,-44.0295,-45.77037,-45.50755,-34.6003,-33.00064,-34.90782,-31.58901,-28.20822,...,-7.389536,-7.044987,-7.545233,-6.262724,-6.084869,-6.740171,-5.774086,-7.289922,-6.568592,-6.176038
25%,-17.2317,-9.783334,-7.055338,-5.216939,-5.62834,-5.000008,-4.34426,-4.028954,-3.890612,-3.52742,...,-0.6479385,-0.6387636,-0.6374848,-0.645697,-0.6447501,-0.6054189,-0.6224281,-0.6123617,-0.6254095,-0.6164674
50%,-0.7984879,-0.3193345,-0.393791,-0.05609811,-0.127178,-0.2519657,-0.02594949,-0.1151172,-0.04406106,-0.08636009,...,0.009687905,-0.003012141,-0.0135903,0.01099385,-0.008299957,-0.004620152,0.003979756,-0.00618112,-0.001165837,0.005245288
75%,16.39901,9.617442,6.793211,5.103128,5.555428,4.717798,4.134927,3.865382,3.749421,3.461514,...,0.6258065,0.643512,0.6398258,0.6372348,0.6266261,0.6184873,0.6122985,0.616731,0.5985634,0.603456
max,100.8641,68.50564,53.38301,55.38744,46.69292,41.77507,45.61751,41.37904,38.92227,30.73674,...,11.66729,7.184253,9.789181,7.01145,7.888077,7.043943,8.116235,9.418765,8.054129,8.682527


In [22]:
import pandas as pd
import numpy as np

def random_sample_by_label(df, label_column='animal', sample_size=10, selected_labels=None):
    # 選択されたラベルのみを対象にする
    if selected_labels is not None:
        df = df[df[label_column].isin(selected_labels)]

    # 各ラベル毎に指定の個数をランダムに抜き出す
    sampled_dataframes = []
    unique_labels = df[label_column].unique()

    for label in unique_labels:
        label_data = df[df[label_column] == label]
        sample_data = label_data.sample(n=sample_size, random_state=42)
        sampled_dataframes.append(sample_data)

    # 抜き出したデータを結合
    result_df = pd.concat(sampled_dataframes, ignore_index=True)

    return result_df



original_df = df.copy()
selected_labels = ['cheetah', 'albatross']  
sampled_df = random_sample_by_label(original_df, sample_size=100, selected_labels=selected_labels)

sampled_df

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,cheetah,15.402662,-3.807138,3.632292,32.764571,-5.664949,2.876672,3.040756,-8.643766,7.808112,...,-4.031143,0.502019,0.630617,0.088934,-0.005006,2.169094,-1.383973,-0.150240,-2.225031,-0.375203
1,cheetah,-50.593476,-7.922209,-11.033482,6.115892,2.230875,-2.827979,-8.589891,3.639380,-0.459745,...,0.152746,0.734783,0.300104,-0.220096,-0.999787,-0.102783,-0.280356,-0.086377,-0.098555,-0.002831
2,cheetah,-10.398780,49.083416,12.765650,3.406599,2.639825,-1.695449,-15.687951,-9.690763,-6.096559,...,-0.634467,-0.049114,-0.563111,0.451740,0.678812,0.386950,-0.803237,-0.082105,-0.050870,-0.782950
3,cheetah,-0.934629,-2.057008,-0.887415,-7.865467,-7.703866,-2.916751,1.805643,-4.821153,0.240899,...,-0.383386,0.378471,1.924444,1.428540,0.363502,0.191559,-1.472842,-2.003535,0.022197,-0.438671
4,cheetah,10.387551,2.727248,0.374913,3.122029,-1.818549,-7.941755,-2.613647,-8.021543,-0.653920,...,-1.009989,-1.500397,-0.375630,-1.125995,-0.887603,1.666718,-0.122017,-0.161629,0.449841,0.364122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,albatross,-10.554250,-10.180245,15.334719,1.620547,-4.822957,5.442124,2.169065,3.001244,-3.199404,...,0.026839,1.849687,-2.736205,0.890098,1.161233,-0.217144,-0.313529,-0.044662,0.352890,-0.387748
196,albatross,2.097137,31.900102,-2.181612,9.192555,0.074148,17.854490,-5.033813,-3.029893,-1.265763,...,0.172376,0.070216,-0.786730,-0.045865,0.129781,0.010598,-0.116206,0.678112,-0.207397,0.046922
197,albatross,-34.750480,5.682939,-2.522935,1.560119,4.492572,5.512418,-0.092059,13.299925,-8.100403,...,1.065749,2.863806,-0.142327,1.671186,-1.470173,-0.542646,0.696255,-0.044227,0.497618,-1.096086
198,albatross,-51.186763,-6.981031,-5.796971,-21.319474,7.340640,-0.598602,4.284274,-10.050741,-5.227979,...,1.667506,-0.502423,0.967173,1.614759,-0.238520,-0.303419,-0.560537,-0.460491,0.500064,-0.487871


## ラベル数 2 ，各ラベルごとのデータ数は 100

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# データを読み込む
data = sampled_df.copy()

# 特徴量とラベルを分ける
X = data.drop('animal', axis=1)
y = data['animal']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVMモデルの作成と学習
svm_model = SVC(kernel='linear')  # カーネルは適宜変更可能
svm_model.fit(X_train, y_train)

# テストデータでの予測
y_pred = svm_model.predict(X_test)

# 分類性能の評価
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [24]:
print(accuracy)
print()
print(classification_rep)

0.6

              precision    recall  f1-score   support

   albatross       0.56      0.79      0.65        19
     cheetah       0.69      0.43      0.53        21

    accuracy                           0.60        40
   macro avg       0.62      0.61      0.59        40
weighted avg       0.63      0.60      0.59        40



### kernel = 'rbf'

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# データを読み込む
data = sampled_df.copy()

# 特徴量とラベルを分ける
X = data.drop('animal', axis=1)
y = data['animal']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVMモデルの作成と学習
svm_model = SVC(kernel='rbf')  # カーネルは適宜変更可能
svm_model.fit(X_train, y_train)

# テストデータでの予測
y_pred = svm_model.predict(X_test)

# 分類性能の評価
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [26]:
print(accuracy)
print()
print(classification_rep)

0.6

              precision    recall  f1-score   support

   albatross       0.59      0.53      0.56        19
     cheetah       0.61      0.67      0.64        21

    accuracy                           0.60        40
   macro avg       0.60      0.60      0.60        40
weighted avg       0.60      0.60      0.60        40



### random forest

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# データを読み込む
data = sampled_df.copy()

# 特徴量とラベルを分ける
X = data.drop('animal', axis=1)
y = data['animal']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forestモデルの作成と学習
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)  # n_estimatorsは適宜変更可能
rf_model.fit(X_train, y_train)

# テストデータでの予測
y_pred = rf_model.predict(X_test)

# 分類性能の評価
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [28]:
print(accuracy)
print()
print(classification_rep)

0.6

              precision    recall  f1-score   support

   albatross       0.55      0.84      0.67        19
     cheetah       0.73      0.38      0.50        21

    accuracy                           0.60        40
   macro avg       0.64      0.61      0.58        40
weighted avg       0.64      0.60      0.58        40



In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
# from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score

# データを読み込む
data = sampled_df.copy()

# 特徴量とラベルを分ける
X = data.drop('animal', axis=1)
y = data['animal']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVM (linear)モデルの作成と学習
svm_linear_model = SVC(kernel='linear')
svm_linear_model.fit(X_train, y_train)
y_pred_svm_linear = svm_linear_model.predict(X_test)
accuracy_svm_linear = accuracy_score(y_test, y_pred_svm_linear)
classification_rep_svm_linear = classification_report(y_test, y_pred_svm_linear)

# SVM (rbf)モデルの作成と学習
svm_rbf_model = SVC(kernel='rbf')
svm_rbf_model.fit(X_train, y_train)
y_pred_svm_rbf = svm_rbf_model.predict(X_test)
accuracy_svm_rbf = accuracy_score(y_test, y_pred_svm_rbf)
classification_rep_svm_rbf = classification_report(y_test, y_pred_svm_rbf)

# Random Forestモデルの作成と学習
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
classification_rep_rf = classification_report(y_test, y_pred_rf)

# Logistic Regressionモデルの作成と学習
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)
y_pred_logreg = logreg_model.predict(X_test)
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
classification_rep_logreg = classification_report(y_test, y_pred_logreg)

# Decision Treeモデルの作成と学習
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
classification_rep_dt = classification_report(y_test, y_pred_dt)

# KNNモデルの作成と学習
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
classification_rep_knn = classification_report(y_test, y_pred_knn)

# # XGBoostモデルの作成と学習
# xgb_model = XGBClassifier(random_state=42)
# xgb_model.fit(X_train, y_train)
# y_pred_xgb = xgb_model.predict(X_test)
# accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
# classification_rep_xgb = classification_report(y_test, y_pred_xgb)

# Naive Bayesモデルの作成と学習
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
classification_rep_nb = classification_report(y_test, y_pred_nb)

# 結果の表示
print("SVM (Linear) Accuracy:\n", accuracy_svm_linear)
print("SVM (Linear) Classification Report:\n", classification_rep_svm_linear)

print("\nSVM (RBF) Accuracy:\n", accuracy_svm_rbf)
print("SVM (RBF) Classification Report:\n", classification_rep_svm_rbf)

print("\nRandom Forest Accuracy:\n", accuracy_rf)
print("Random Forest Classification Report:\n", classification_rep_rf)

print("\nLogistic Regression Accuracy:\n", accuracy_logreg)
print("Logistic Regression Classification Report:\n", classification_rep_logreg)

print("\nDecision Tree Accuracy:\n", accuracy_dt)
print("Decision Tree Classification Report:\n", classification_rep_dt)

print("\nK-Nearest Neighbors Accuracy:\n", accuracy_knn)
print("K-Nearest Neighbors Classification Report:\n", classification_rep_knn)

# print("\nXGBoost Accuracy:\n", accuracy_xgb)
# print("XGBoost Classification Report:\n", classification_rep_xgb)

print("\nNaive Bayes Accuracy:\n", accuracy_nb)
print("Naive Bayes Classification Report:\n", classification_rep_nb)


SVM (Linear) Accuracy:
 0.6
SVM (Linear) Classification Report:
               precision    recall  f1-score   support

   albatross       0.56      0.79      0.65        19
     cheetah       0.69      0.43      0.53        21

    accuracy                           0.60        40
   macro avg       0.62      0.61      0.59        40
weighted avg       0.63      0.60      0.59        40


SVM (RBF) Accuracy:
 0.6
SVM (RBF) Classification Report:
               precision    recall  f1-score   support

   albatross       0.59      0.53      0.56        19
     cheetah       0.61      0.67      0.64        21

    accuracy                           0.60        40
   macro avg       0.60      0.60      0.60        40
weighted avg       0.60      0.60      0.60        40


Random Forest Accuracy:
 0.6
Random Forest Classification Report:
               precision    recall  f1-score   support

   albatross       0.55      0.84      0.67        19
     cheetah       0.73      0.38      0.50 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## ラベル数 2 ，各ラベルごとのデータ数は 1000

In [6]:
original_df = df.copy()
selected_labels = ['cheetah', 'albatross']  
sampled_df = random_sample_by_label(original_df, sample_size=1000, selected_labels=selected_labels)

sampled_df

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,cheetah,3927.678876,-970.820148,926.234376,8354.965704,-1444.562035,733.551254,775.392766,-2204.160418,1991.068556,...,-383.314987,836.321862,-167.737536,155.635763,28.396193,-326.276349,317.260028,261.071826,-48.154635,-266.364229
1,cheetah,-12901.336469,-2020.163362,-2813.537940,1559.552536,568.873003,-721.134764,-2190.422034,928.041977,-117.235123,...,-239.839525,-18.464322,204.930955,86.645856,47.325411,1.298196,-135.692596,174.109791,-236.751453,-113.355846
2,cheetah,-2651.689022,12516.271205,3255.240645,868.682800,673.155328,-432.339572,-4000.427500,-2471.144523,-1554.622771,...,11.757454,192.660995,-122.615055,-201.713039,32.190923,-15.392368,126.347606,-111.187874,-242.953080,275.618284
3,cheetah,-238.330422,-524.536914,-226.290882,-2005.694037,-1964.485841,-743.771503,460.439080,-1229.393784,61.429087,...,-217.110972,255.426007,-95.527197,-144.122943,231.937847,-349.420038,-244.240717,-47.456660,46.924611,575.171720
4,cheetah,2648.825409,695.448219,95.602707,796.117338,-463.729982,-2025.147529,-666.479952,-2045.493594,-166.749532,...,-158.614595,55.097136,384.585690,-5.425583,276.438089,243.310228,557.055430,-105.910057,160.153754,-272.175591
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,albatross,844.437065,378.200162,2685.682448,2153.760181,1497.156581,114.868815,76.992372,624.496587,-1210.334120,...,61.701614,-181.831461,63.133138,199.690703,-32.992910,134.411144,59.888706,32.572970,277.962175,-371.431583
1996,albatross,-3853.239617,678.028919,-3345.748331,212.217250,-1033.715043,-1436.311163,8.651006,505.583481,-1188.697371,...,-39.737225,-302.806979,468.391057,296.714824,-48.545503,232.462549,112.748991,325.951665,600.464918,-108.089411
1997,albatross,-2374.164091,4226.097385,-923.220861,-390.596550,1122.594942,-248.027486,2192.531925,912.705360,-213.632803,...,402.753224,-48.596443,-250.830180,82.581943,-43.480590,181.771868,44.248727,3.497554,-240.610385,-20.611566
1998,albatross,-15618.763318,2153.342297,3195.055594,1569.037667,-560.089472,-3941.548803,2524.292696,173.406761,-683.616902,...,-351.484770,64.342979,197.750594,103.805486,-431.560110,327.798326,-250.527510,-281.746001,-279.590561,272.329562


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# データを読み込む
data = sampled_df.copy()

# 特徴量とラベルを分ける
X = data.drop('animal', axis=1)
y = data['animal']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVMモデルの作成と学習
svm_model = SVC(kernel='linear')  # カーネルは適宜変更可能
svm_model.fit(X_train, y_train)

# テストデータでの予測
y_pred = svm_model.predict(X_test)

# 分類性能の評価
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [None]:
print(accuracy)
print()
print(classification_rep)