In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

import numpy as np
import pandas as pd
import cv2

from sklearn.decomposition import PCA

In [3]:
mapping_dict = {
    'n02058221': 'albatross',
    'n02130308': 'cheetah',
    'n01518878': 'ostrich',
    'n02056570': 'penguin',
    'n02391049': 'zebra'
}

# 1. 画像のリサイズ (128 × 128) にする

In [29]:
workdir_path = "./../../imagenet"
img_dir_names = ['n02056570', 'n01518878', 'n02058221', 'n02130308', 'n02391049']

for img_dir_name in img_dir_names:
    target_dir_name = img_dir_name
    target_dir_path = os.path.join(workdir_path, target_dir_name)
    file_names = os.listdir(target_dir_path)

    output_dir = "./../../imagenet_resized/"
    output_dir_path  = os.path.join(output_dir, target_dir_name)

    if not os.path.exists(output_dir_path):
        os.makedirs(output_dir_path)

    for file_name in file_names:
        image = cv2.imread(os.path.join(target_dir_path, file_name))
        resized_image = cv2.resize(image, (128, 128))
        
        output_file_path = os.path.join(output_dir_path, file_name)

        cv2.imwrite(output_file_path, resized_image, [int(cv2.IMWRITE_JPEG_QUALITY), 95])

# 2. PCA

In [54]:
workdir_path = "./../../imagenet_resized"
img_dir_names = ['n02056570', 'n01518878', 'n02058221', 'n02130308', 'n02391049']

output_dir = "./../../imagenet_reconstructed"

compressed_dict = {}

for img_dir_name in img_dir_names:
    target_dir_name = img_dir_name
    target_dir_path = os.path.join(workdir_path, target_dir_name)
    file_names = os.listdir(target_dir_path)

    output_dir_path = os.path.join(output_dir, target_dir_name)
    if not os.path.exists(output_dir_path):
        os.makedirs(output_dir_path)

    flat_images = []
    
    for file_name in file_names:
        image = cv2.imread(os.path.join(target_dir_path, file_name))
        flat_image = image.flatten()

        flat_images.append(flat_image)

    flat_images = np.array(flat_images)

    pca = PCA(n_components=200)
    compressed_images = pca.fit_transform(flat_images)

    compressed_dict[img_dir_name] = compressed_images

    reconstructed_images = pca.inverse_transform(compressed_images)

    for file_name, reconstructed_image in zip(file_names, reconstructed_images):
        reconstructed_image = reconstructed_image.reshape((128, 128, 3))
        output_file_path = os.path.join(output_dir_path, file_name)
        cv2.imwrite(output_file_path, reconstructed_image)


In [55]:
compressed_dict

{'n02056570': array([[-5784.01205221,  2230.14546068, -2395.76193224, ...,
           256.26720732,  -289.39870555,   157.50250171],
        [ 2000.87386562, -5231.05117222, -1932.97943357, ...,
          -400.06383331,  -142.88021631,   147.83957477],
        [  346.76661657,  3710.00877281,  2016.98481548, ...,
           201.68578268,   345.69171156,  -327.36021454],
        ...,
        [-3757.34160229, -6892.14129161,   -83.35978184, ...,
          -557.87167448,   198.62832144,   208.42038771],
        [-2958.52898393,  1479.48428542, -1207.59968082, ...,
          -271.42998278,   417.82355968,   156.68421443],
        [-7242.47219211, -1868.33454248, -7195.06177228, ...,
           360.58858933,   116.89815703,   218.21703257]]),
 'n01518878': array([[ 2.67884606e+03, -6.25338263e+03, -2.99090294e+02, ...,
         -1.06498297e+00, -1.38308185e+02,  9.38785526e+01],
        [ 2.43522342e+02, -3.97517336e+03, -1.43921739e+03, ...,
          4.13064557e+02,  3.16966916e+02,  5.26

In [68]:
for img_dir_name in img_dir_names:
    print(compressed_dict[img_dir_name].shape)
    df_compressed_images = pd.DataFrame(compressed_dict[img_dir_name])
    
    animal_name = mapping_dict[img_dir_name]
    df_compressed_images.insert(0, 'animal', animal_name)

    save_path = os.path.join('./winston_pca_1', img_dir_name + '.csv')
    df_compressed_images.to_csv(save_path)

(1300, 200)
(1300, 200)
(1300, 200)
(1300, 200)
(1300, 200)


# 3. SVM

In [4]:
file_name_list = os.listdir("./winston_pca_1/")
file_path_list = [os.path.join("./winston_pca_1", file_name) for file_name in file_name_list]
file_path_list

['./winston_pca/n02130308.csv',
 './winston_pca/n02056570.csv',
 './winston_pca/n02391049.csv',
 './winston_pca/n02058221.csv',
 './winston_pca/n01518878.csv']

In [5]:
dfs = [pd.read_csv(file, index_col=0) for file in file_path_list]
df = pd.concat(dfs, ignore_index=True)

df

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,cheetah,-5115.360633,-2927.491060,3983.954139,-1607.793565,-1707.697273,1231.716649,-1371.863639,620.235029,2302.777402,...,267.520077,-317.075512,-459.106383,-92.574474,-74.246461,372.197135,183.263019,-40.867809,41.995494,-292.902317
1,cheetah,-12292.042379,4740.839604,1046.388904,622.236931,-2694.557778,-3823.106702,-599.555827,2372.712356,-2011.216821,...,-832.601755,-80.981283,1443.672504,247.833193,-159.965861,-140.464022,-228.427258,361.792851,1131.269994,718.242180
2,cheetah,-3363.075950,-5889.583207,656.045701,-945.120750,731.756823,-791.767700,801.078591,393.364901,1356.113945,...,35.712819,-90.973905,45.616888,-12.432887,-80.691670,143.734972,116.544528,184.049601,195.105505,-134.299170
3,cheetah,-5163.878690,-2379.876547,-951.459624,-753.901112,-1251.338801,-634.390268,-2557.275545,484.175961,426.271124,...,-124.012225,115.598409,31.967310,-298.090207,-15.675591,21.348297,-103.782651,153.353827,64.761970,-143.816475
4,cheetah,-5115.292560,-403.530281,-1820.198062,-2852.371602,4696.364647,-428.639660,-1567.178713,-27.318138,992.058634,...,765.573161,599.910359,-267.457886,107.572501,-16.891712,-399.381829,-106.067050,-351.647419,-887.101674,125.240305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6495,ostrich,7979.372296,1653.066150,-2827.080818,-2683.810541,1094.192487,1708.748383,5401.192591,-2280.259863,1458.434964,...,-19.534184,61.627409,6.559735,150.518938,-52.109450,26.304132,72.550957,120.986373,83.038723,-11.839194
6496,ostrich,-1629.782357,-4352.404305,-381.724749,-3637.522918,805.750783,-531.603751,2203.936574,-532.131204,-2233.427722,...,347.544082,19.164663,-592.219340,277.308670,-51.325638,-83.757541,-148.734691,116.376113,37.296529,873.898038
6497,ostrich,-8622.707655,2666.377568,4019.328827,1768.661635,-397.904678,-847.235851,-948.333874,2762.489919,1706.993131,...,-47.199998,-90.762251,122.820261,177.954703,320.543994,295.480383,-293.603980,126.984810,-8.585088,81.371401
6498,ostrich,8866.229087,5546.882180,-798.891627,1276.040346,903.769834,3575.394117,-1247.686277,480.544157,841.391269,...,48.420065,53.192464,141.564325,-83.715151,285.670406,13.375321,-2.541073,-10.252679,-19.837557,190.620610


In [6]:
print(df)

        label             0            1            2            3  \
0     cheetah  -5115.360633 -2927.491060  3983.954139 -1607.793565   
1     cheetah -12292.042379  4740.839604  1046.388904   622.236931   
2     cheetah  -3363.075950 -5889.583207   656.045701  -945.120750   
3     cheetah  -5163.878690 -2379.876547  -951.459624  -753.901112   
4     cheetah  -5115.292560  -403.530281 -1820.198062 -2852.371602   
...       ...           ...          ...          ...          ...   
6495  ostrich   7979.372296  1653.066150 -2827.080818 -2683.810541   
6496  ostrich  -1629.782357 -4352.404305  -381.724749 -3637.522918   
6497  ostrich  -8622.707655  2666.377568  4019.328827  1768.661635   
6498  ostrich   8866.229087  5546.882180  -798.891627  1276.040346   
6499  ostrich  10458.226672    41.258462  -592.656394  1637.169626   

                4            5            6            7            8  ...  \
0    -1707.697273  1231.716649 -1371.863639   620.235029  2302.777402  ...   
1  

In [7]:
import pandas as pd
import numpy as np

def random_sample_by_label(df, label_column='animal', sample_size=10):
    # 各ラベル毎に指定の個数をランダムに抜き出す
    sampled_dataframes = []
    unique_labels = df[label_column].unique()

    for label in unique_labels:
        label_data = df[df[label_column] == label]
        sample_data = label_data.sample(n=sample_size, random_state=42) 
        sampled_dataframes.append(sample_data)

    # 抜き出したデータを結合
    result_df = pd.concat(sampled_dataframes, ignore_index=True)

    return result_df


original_df = df.copy()  
sampled_df = random_sample_by_label(original_df, sample_size=50) 

sampled_df

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,cheetah,3927.678876,-970.820148,926.234376,8354.965704,-1444.562035,733.551254,775.392766,-2204.160418,1991.068556,...,-383.314987,836.321862,-167.737536,155.635763,28.396193,-326.276349,317.260028,261.071826,-48.154635,-266.364229
1,cheetah,-12901.336469,-2020.163362,-2813.537940,1559.552536,568.873003,-721.134764,-2190.422034,928.041977,-117.235123,...,-239.839525,-18.464322,204.930955,86.645856,47.325411,1.298196,-135.692596,174.109791,-236.751453,-113.355846
2,cheetah,-2651.689022,12516.271205,3255.240645,868.682800,673.155328,-432.339572,-4000.427500,-2471.144523,-1554.622771,...,11.757454,192.660995,-122.615055,-201.713039,32.190923,-15.392368,126.347606,-111.187874,-242.953080,275.618284
3,cheetah,-238.330422,-524.536914,-226.290882,-2005.694037,-1964.485841,-743.771503,460.439080,-1229.393784,61.429087,...,-217.110972,255.426007,-95.527197,-144.122943,231.937847,-349.420038,-244.240717,-47.456660,46.924611,575.171720
4,cheetah,2648.825409,695.448219,95.602707,796.117338,-463.729982,-2025.147529,-666.479952,-2045.493594,-166.749532,...,-158.614595,55.097136,384.585690,-5.425583,276.438089,243.310228,557.055430,-105.910057,160.153754,-272.175591
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,ostrich,-3591.722641,1771.405395,1080.422068,388.475783,5956.280181,191.481915,2611.542489,-1645.416123,-3805.248693,...,-29.327092,116.505751,-305.863097,274.695382,540.681801,-175.797632,607.766641,-259.527286,-100.097752,457.944212
246,ostrich,6805.166451,2707.010616,-1329.147368,1323.501595,-2480.534381,-1289.181822,2012.454164,-3335.142648,-396.884666,...,472.649574,-3.912229,-150.135714,68.267911,-501.459386,228.638376,-121.733125,285.619535,191.414748,-238.642824
247,ostrich,8194.412450,8458.922442,2838.779620,3901.026123,28.769219,3516.449001,-3329.199883,-4933.471020,-1977.873914,...,-1.730846,110.447340,-406.003294,-159.952932,65.250458,-29.546898,-112.572513,-130.602874,13.349578,-161.204308
248,ostrich,6041.967472,-972.136787,-415.684395,-2517.220207,4128.607644,-2385.451429,-1447.102460,-1264.894130,-555.904817,...,-150.131950,75.282524,-404.033836,-15.991802,312.247488,11.293940,-89.482431,171.053554,155.099368,-115.282418


## 各ラベルごとのデータ数が 50

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# データを読み込む
data = sampled_df.copy()

# 特徴量とラベルを分ける
X = data.drop('animal', axis=1)
y = data['animal']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVMモデルの作成と学習
svm_model = SVC(kernel='linear')  # カーネルは適宜変更可能
svm_model.fit(X_train, y_train)

# テストデータでの予測
y_pred = svm_model.predict(X_test)

# 分類性能の評価
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [9]:
accuracy

0.14

In [11]:
print(classification_rep)

              precision    recall  f1-score   support

   albatross       0.00      0.00      0.00        11
     cheetah       0.21      0.23      0.22        13
     ostrich       0.29      0.22      0.25         9
     penguin       0.00      0.00      0.00         7
       zebra       0.20      0.20      0.20        10

    accuracy                           0.14        50
   macro avg       0.14      0.13      0.13        50
weighted avg       0.15      0.14      0.14        50



## 各ラベルごとのデータ数が 100

In [12]:
original_df = df.copy()  
sampled_df = random_sample_by_label(original_df, sample_size=100) 

sampled_df

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,cheetah,3927.678876,-970.820148,926.234376,8354.965704,-1444.562035,733.551254,775.392766,-2204.160418,1991.068556,...,-383.314987,836.321862,-167.737536,155.635763,28.396193,-326.276349,317.260028,261.071826,-48.154635,-266.364229
1,cheetah,-12901.336469,-2020.163362,-2813.537940,1559.552536,568.873003,-721.134764,-2190.422034,928.041977,-117.235123,...,-239.839525,-18.464322,204.930955,86.645856,47.325411,1.298196,-135.692596,174.109791,-236.751453,-113.355846
2,cheetah,-2651.689022,12516.271205,3255.240645,868.682800,673.155328,-432.339572,-4000.427500,-2471.144523,-1554.622771,...,11.757454,192.660995,-122.615055,-201.713039,32.190923,-15.392368,126.347606,-111.187874,-242.953080,275.618284
3,cheetah,-238.330422,-524.536914,-226.290882,-2005.694037,-1964.485841,-743.771503,460.439080,-1229.393784,61.429087,...,-217.110972,255.426007,-95.527197,-144.122943,231.937847,-349.420038,-244.240717,-47.456660,46.924611,575.171720
4,cheetah,2648.825409,695.448219,95.602707,796.117338,-463.729982,-2025.147529,-666.479952,-2045.493594,-166.749532,...,-158.614595,55.097136,384.585690,-5.425583,276.438089,243.310228,557.055430,-105.910057,160.153754,-272.175591
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,ostrich,-3183.113305,-7888.109401,-1533.354914,2514.673421,483.631346,1212.434937,892.796844,-607.178874,2573.376778,...,-235.154581,298.103951,-680.919872,-312.761759,-678.145786,-363.508911,239.910044,614.981588,-314.138304,-611.696789
496,ostrich,-3459.424401,-13626.582578,-3815.535481,1630.507905,781.273010,1496.463993,-1172.978082,-45.170755,1445.311068,...,-492.042276,-548.133118,-627.080243,-464.524204,-301.096892,47.517561,701.057616,-567.071027,47.620276,408.644949
497,ostrich,5435.295905,-774.306842,-1211.735241,2030.374298,-250.057327,-6033.298641,2329.718506,2121.760074,1697.811944,...,-188.759427,-83.460442,138.125811,-309.854254,250.037057,591.102298,317.420525,427.297637,695.521592,263.317848
498,ostrich,-3176.917985,-2032.625250,1536.767708,-3179.683673,-2307.451268,2695.980016,2142.104436,1714.666121,658.349573,...,-268.084925,-849.688586,-175.276022,768.389649,-495.313600,-406.878715,456.720156,-528.568374,14.830527,-593.001730


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# データを読み込む
data = sampled_df.copy()

# 特徴量とラベルを分ける
X = data.drop('animal', axis=1)
y = data['animal']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVMモデルの作成と学習
svm_model = SVC(kernel='linear')  # カーネルは適宜変更可能
svm_model.fit(X_train, y_train)

# テストデータでの予測
y_pred = svm_model.predict(X_test)

# 分類性能の評価
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [14]:
print(accuracy)
print()
print(classification_rep)

0.18

              precision    recall  f1-score   support

   albatross       0.19      0.25      0.21        24
     cheetah       0.28      0.25      0.26        28
     ostrich       0.25      0.12      0.17        24
     penguin       0.13      0.14      0.14        14
       zebra       0.00      0.00      0.00        10

    accuracy                           0.18       100
   macro avg       0.17      0.15      0.16       100
weighted avg       0.20      0.18      0.18       100



### kernel='rbf'

In [8]:
original_df = df.copy()  
sampled_df = random_sample_by_label(original_df, sample_size=100) 

sampled_df

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,cheetah,3927.678876,-970.820148,926.234376,8354.965704,-1444.562035,733.551254,775.392766,-2204.160418,1991.068556,...,-383.314987,836.321862,-167.737536,155.635763,28.396193,-326.276349,317.260028,261.071826,-48.154635,-266.364229
1,cheetah,-12901.336469,-2020.163362,-2813.537940,1559.552536,568.873003,-721.134764,-2190.422034,928.041977,-117.235123,...,-239.839525,-18.464322,204.930955,86.645856,47.325411,1.298196,-135.692596,174.109791,-236.751453,-113.355846
2,cheetah,-2651.689022,12516.271205,3255.240645,868.682800,673.155328,-432.339572,-4000.427500,-2471.144523,-1554.622771,...,11.757454,192.660995,-122.615055,-201.713039,32.190923,-15.392368,126.347606,-111.187874,-242.953080,275.618284
3,cheetah,-238.330422,-524.536914,-226.290882,-2005.694037,-1964.485841,-743.771503,460.439080,-1229.393784,61.429087,...,-217.110972,255.426007,-95.527197,-144.122943,231.937847,-349.420038,-244.240717,-47.456660,46.924611,575.171720
4,cheetah,2648.825409,695.448219,95.602707,796.117338,-463.729982,-2025.147529,-666.479952,-2045.493594,-166.749532,...,-158.614595,55.097136,384.585690,-5.425583,276.438089,243.310228,557.055430,-105.910057,160.153754,-272.175591
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,ostrich,-3183.113305,-7888.109401,-1533.354914,2514.673421,483.631346,1212.434937,892.796844,-607.178874,2573.376778,...,-235.154581,298.103951,-680.919872,-312.761759,-678.145786,-363.508911,239.910044,614.981588,-314.138304,-611.696789
496,ostrich,-3459.424401,-13626.582578,-3815.535481,1630.507905,781.273010,1496.463993,-1172.978082,-45.170755,1445.311068,...,-492.042276,-548.133118,-627.080243,-464.524204,-301.096892,47.517561,701.057616,-567.071027,47.620276,408.644949
497,ostrich,5435.295905,-774.306842,-1211.735241,2030.374298,-250.057327,-6033.298641,2329.718506,2121.760074,1697.811944,...,-188.759427,-83.460442,138.125811,-309.854254,250.037057,591.102298,317.420525,427.297637,695.521592,263.317848
498,ostrich,-3176.917985,-2032.625250,1536.767708,-3179.683673,-2307.451268,2695.980016,2142.104436,1714.666121,658.349573,...,-268.084925,-849.688586,-175.276022,768.389649,-495.313600,-406.878715,456.720156,-528.568374,14.830527,-593.001730


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# データを読み込む
data = sampled_df.copy()

# 特徴量とラベルを分ける
X = data.drop('animal', axis=1)
y = data['animal']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVMモデルの作成と学習
svm_model = SVC(kernel='rbf')  # カーネルは適宜変更可能
svm_model.fit(X_train, y_train)

# テストデータでの予測
y_pred = svm_model.predict(X_test)

# 分類性能の評価
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [10]:
print(accuracy)
print()
print(classification_rep)

0.24

              precision    recall  f1-score   support

   albatross       0.50      0.38      0.43        24
     cheetah       0.31      0.29      0.30        28
     ostrich       0.20      0.04      0.07        24
     penguin       0.17      0.29      0.21        14
       zebra       0.07      0.20      0.11        10

    accuracy                           0.24       100
   macro avg       0.25      0.24      0.22       100
weighted avg       0.28      0.24      0.24       100



## フルデータ (各ラベルごとのデータ数が 1300)

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# データを読み込む
data = df.copy()

# 特徴量とラベルを分ける
X = data.drop('animal', axis=1)
y = data['animal']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVMモデルの作成と学習
svm_model = SVC(kernel='linear')  # カーネルは適宜変更可能
svm_model.fit(X_train, y_train)

# テストデータでの予測
y_pred = svm_model.predict(X_test)

# 分類性能の評価
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [None]:
# 結果の表示
print(f'Accuracy: {accuracy}')

In [None]:
print('Classification Report:')
print(classification_rep)

In [None]:
mapping_dict

# 4. SVM (ラベル数を絞る)

In [4]:
file_name_list = os.listdir("./winston_pca_1/")
file_path_list = [os.path.join("./winston_pca_1", file_name) for file_name in file_name_list]
file_path_list  

dfs = [pd.read_csv(file, index_col=0) for file in file_path_list]
df = pd.concat(dfs, ignore_index=True)

df

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,cheetah,-5115.360633,-2927.491060,3983.954139,-1607.793565,-1707.697273,1231.716649,-1371.863639,620.235029,2302.777402,...,267.520077,-317.075512,-459.106383,-92.574474,-74.246461,372.197135,183.263019,-40.867809,41.995494,-292.902317
1,cheetah,-12292.042379,4740.839604,1046.388904,622.236931,-2694.557778,-3823.106702,-599.555827,2372.712356,-2011.216821,...,-832.601755,-80.981283,1443.672504,247.833193,-159.965861,-140.464022,-228.427258,361.792851,1131.269994,718.242180
2,cheetah,-3363.075950,-5889.583207,656.045701,-945.120750,731.756823,-791.767700,801.078591,393.364901,1356.113945,...,35.712819,-90.973905,45.616888,-12.432887,-80.691670,143.734972,116.544528,184.049601,195.105505,-134.299170
3,cheetah,-5163.878690,-2379.876547,-951.459624,-753.901112,-1251.338801,-634.390268,-2557.275545,484.175961,426.271124,...,-124.012225,115.598409,31.967310,-298.090207,-15.675591,21.348297,-103.782651,153.353827,64.761970,-143.816475
4,cheetah,-5115.292560,-403.530281,-1820.198062,-2852.371602,4696.364647,-428.639660,-1567.178713,-27.318138,992.058634,...,765.573161,599.910359,-267.457886,107.572501,-16.891712,-399.381829,-106.067050,-351.647419,-887.101674,125.240305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6495,ostrich,7979.372296,1653.066150,-2827.080818,-2683.810541,1094.192487,1708.748383,5401.192591,-2280.259863,1458.434964,...,-19.534184,61.627409,6.559735,150.518938,-52.109450,26.304132,72.550957,120.986373,83.038723,-11.839194
6496,ostrich,-1629.782357,-4352.404305,-381.724749,-3637.522918,805.750783,-531.603751,2203.936574,-532.131204,-2233.427722,...,347.544082,19.164663,-592.219340,277.308670,-51.325638,-83.757541,-148.734691,116.376113,37.296529,873.898038
6497,ostrich,-8622.707655,2666.377568,4019.328827,1768.661635,-397.904678,-847.235851,-948.333874,2762.489919,1706.993131,...,-47.199998,-90.762251,122.820261,177.954703,320.543994,295.480383,-293.603980,126.984810,-8.585088,81.371401
6498,ostrich,8866.229087,5546.882180,-798.891627,1276.040346,903.769834,3575.394117,-1247.686277,480.544157,841.391269,...,48.420065,53.192464,141.564325,-83.715151,285.670406,13.375321,-2.541073,-10.252679,-19.837557,190.620610


In [4]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
count,6500.0,6500.0,6500.0,6500.0,6500.0,6500.0,6500.0,6500.0,6500.0,6500.0,...,6500.0,6500.0,6500.0,6500.0,6500.0,6500.0,6500.0,6500.0,6500.0,6500.0
mean,-2.014881e-14,-4.869295e-14,5.59689e-16,1.119378e-14,-1.287285e-14,-3.917823e-14,2.182787e-14,3.358134e-14,-7.275958e-15,3.358134e-15,...,-1.119378e-15,-2.938367e-15,2.798445e-16,2.798445e-16,-4.33759e-15,3.917823e-15,-3.07829e-15,4.757357e-15,9.094947e-16,-6.996113e-16
std,6729.387,4155.093,2881.477,2506.569,2373.266,2071.41,1884.075,1766.469,1686.332,1535.523,...,309.6199,308.4385,307.4147,305.9639,305.1223,303.8837,303.0639,302.274,300.2282,299.2817
min,-21092.69,-14981.39,-11227.52,-11671.44,-11604.43,-8823.078,-8415.163,-8901.493,-8055.197,-7193.096,...,-1974.585,-1721.207,-1486.726,-1724.071,-1490.577,-1874.882,-1777.292,-1698.259,-1568.659,-1604.531
25%,-4394.083,-2494.75,-1799.111,-1330.32,-1435.227,-1275.002,-1107.786,-1027.384,-992.1062,-899.4924,...,-165.3142,-165.4893,-166.376,-158.5413,-159.5761,-162.9961,-159.8728,-159.8084,-161.1168,-159.7955
50%,-203.6144,-81.4303,-100.4167,-14.30502,-32.43038,-64.25123,-6.617048,-29.35519,-11.23433,-22.02192,...,-3.084637,1.692115,-5.725093,-4.74424,-5.344272,1.344597,-2.53541,1.154375,-1.043821,-0.02996375
75%,4181.748,2452.448,1732.269,1301.298,1416.634,1203.038,1054.406,985.6726,956.102,882.6872,...,159.4457,163.7962,167.3199,156.0138,167.474,163.457,159.6676,158.7704,156.9612,162.1476
max,25720.35,17468.94,13612.67,14123.8,11906.7,10652.64,11632.47,10551.66,9925.178,7837.869,...,2097.546,2464.798,1758.805,2374.503,2315.463,2002.168,2759.453,1834.75,1591.818,1825.117


In [11]:
import pandas as pd
import numpy as np

def random_sample_by_label(df, label_column='animal', sample_size=10, selected_labels=None):
    # 選択されたラベルのみを対象にする
    if selected_labels is not None:
        df = df[df[label_column].isin(selected_labels)]

    # 各ラベル毎に指定の個数をランダムに抜き出す
    sampled_dataframes = []
    unique_labels = df[label_column].unique()

    for label in unique_labels:
        label_data = df[df[label_column] == label]
        sample_data = label_data.sample(n=sample_size, random_state=42)
        sampled_dataframes.append(sample_data)

    # 抜き出したデータを結合
    result_df = pd.concat(sampled_dataframes, ignore_index=True)

    return result_df



original_df = df.copy()
selected_labels = ['cheetah', 'albatross']  
sampled_df = random_sample_by_label(original_df, sample_size=100, selected_labels=selected_labels)

sampled_df

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,cheetah,3927.678876,-970.820148,926.234376,8354.965704,-1444.562035,733.551254,775.392766,-2204.160418,1991.068556,...,-383.314987,836.321862,-167.737536,155.635763,28.396193,-326.276349,317.260028,261.071826,-48.154635,-266.364229
1,cheetah,-12901.336469,-2020.163362,-2813.537940,1559.552536,568.873003,-721.134764,-2190.422034,928.041977,-117.235123,...,-239.839525,-18.464322,204.930955,86.645856,47.325411,1.298196,-135.692596,174.109791,-236.751453,-113.355846
2,cheetah,-2651.689022,12516.271205,3255.240645,868.682800,673.155328,-432.339572,-4000.427500,-2471.144523,-1554.622771,...,11.757454,192.660995,-122.615055,-201.713039,32.190923,-15.392368,126.347606,-111.187874,-242.953080,275.618284
3,cheetah,-238.330422,-524.536914,-226.290882,-2005.694037,-1964.485841,-743.771503,460.439080,-1229.393784,61.429087,...,-217.110972,255.426007,-95.527197,-144.122943,231.937847,-349.420038,-244.240717,-47.456660,46.924611,575.171720
4,cheetah,2648.825409,695.448219,95.602707,796.117338,-463.729982,-2025.147529,-666.479952,-2045.493594,-166.749532,...,-158.614595,55.097136,384.585690,-5.425583,276.438089,243.310228,557.055430,-105.910057,160.153754,-272.175591
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,albatross,-2691.333835,-2595.962516,3910.353243,413.239433,-1229.854033,1387.741501,553.111635,765.317225,-815.848029,...,467.018883,152.560094,282.348083,668.507604,-119.369784,-88.791866,249.526005,762.519345,-190.153521,-816.794116
196,albatross,534.769974,8134.526003,-556.311056,2344.101426,18.907841,4552.894962,-1283.622345,-772.622593,-322.769675,...,-124.163285,-121.777809,8.800509,5.045501,135.550989,-212.385459,-147.237703,180.788477,207.895640,37.608725
197,albatross,-8861.372388,1449.149357,-643.348442,397.830262,1145.605963,1405.666487,-23.475198,3391.480827,-2065.602777,...,-214.180353,41.828336,557.506630,-146.912380,589.363123,-14.607097,332.088609,454.582528,-155.196430,74.760552
198,albatross,-13052.624666,-1780.163013,-1478.227545,-5436.465960,1871.863322,-152.643628,1092.489817,-2562.938836,-1333.134564,...,115.377656,-4.491137,139.228756,160.033193,223.114608,-103.386035,161.953820,-38.242462,-211.292487,-217.447453


## ラベル数 2 ，各ラベルごとのデータ数は 100

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# データを読み込む
data = sampled_df.copy()

# 特徴量とラベルを分ける
X = data.drop('animal', axis=1)
y = data['animal']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVMモデルの作成と学習
svm_model = SVC(kernel='linear')  # カーネルは適宜変更可能
svm_model.fit(X_train, y_train)

# テストデータでの予測
y_pred = svm_model.predict(X_test)

# 分類性能の評価
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [7]:
print(accuracy)
print()
print(classification_rep)

0.5

              precision    recall  f1-score   support

   albatross       0.48      0.68      0.57        19
     cheetah       0.54      0.33      0.41        21

    accuracy                           0.50        40
   macro avg       0.51      0.51      0.49        40
weighted avg       0.51      0.50      0.48        40



### kernel = 'rbf'

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# データを読み込む
data = sampled_df.copy()

# 特徴量とラベルを分ける
X = data.drop('animal', axis=1)
y = data['animal']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVMモデルの作成と学習
svm_model = SVC(kernel='rbf')  # カーネルは適宜変更可能
svm_model.fit(X_train, y_train)

# テストデータでの予測
y_pred = svm_model.predict(X_test)

# 分類性能の評価
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [13]:
print(accuracy)
print()
print(classification_rep)

0.625

              precision    recall  f1-score   support

   albatross       0.61      0.58      0.59        19
     cheetah       0.64      0.67      0.65        21

    accuracy                           0.62        40
   macro avg       0.62      0.62      0.62        40
weighted avg       0.62      0.62      0.62        40



### random forest

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# データを読み込む
data = sampled_df.copy()

# 特徴量とラベルを分ける
X = data.drop('animal', axis=1)
y = data['animal']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forestモデルの作成と学習
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)  # n_estimatorsは適宜変更可能
rf_model.fit(X_train, y_train)

# テストデータでの予測
y_pred = rf_model.predict(X_test)

# 分類性能の評価
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [15]:
print(accuracy)
print()
print(classification_rep)

0.5

              precision    recall  f1-score   support

   albatross       0.48      0.68      0.57        19
     cheetah       0.54      0.33      0.41        21

    accuracy                           0.50        40
   macro avg       0.51      0.51      0.49        40
weighted avg       0.51      0.50      0.48        40



In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
# from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score

# データを読み込む
data = sampled_df.copy()

# 特徴量とラベルを分ける
X = data.drop('animal', axis=1)
y = data['animal']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVM (linear)モデルの作成と学習
svm_linear_model = SVC(kernel='linear')
svm_linear_model.fit(X_train, y_train)
y_pred_svm_linear = svm_linear_model.predict(X_test)
accuracy_svm_linear = accuracy_score(y_test, y_pred_svm_linear)
classification_rep_svm_linear = classification_report(y_test, y_pred_svm_linear)

# SVM (rbf)モデルの作成と学習
svm_rbf_model = SVC(kernel='rbf')
svm_rbf_model.fit(X_train, y_train)
y_pred_svm_rbf = svm_rbf_model.predict(X_test)
accuracy_svm_rbf = accuracy_score(y_test, y_pred_svm_rbf)
classification_rep_svm_rbf = classification_report(y_test, y_pred_svm_rbf)

# Random Forestモデルの作成と学習
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
classification_rep_rf = classification_report(y_test, y_pred_rf)

# Logistic Regressionモデルの作成と学習
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)
y_pred_logreg = logreg_model.predict(X_test)
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
classification_rep_logreg = classification_report(y_test, y_pred_logreg)

# Decision Treeモデルの作成と学習
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
classification_rep_dt = classification_report(y_test, y_pred_dt)

# KNNモデルの作成と学習
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
classification_rep_knn = classification_report(y_test, y_pred_knn)

# # XGBoostモデルの作成と学習
# xgb_model = XGBClassifier(random_state=42)
# xgb_model.fit(X_train, y_train)
# y_pred_xgb = xgb_model.predict(X_test)
# accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
# classification_rep_xgb = classification_report(y_test, y_pred_xgb)

# Naive Bayesモデルの作成と学習
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
classification_rep_nb = classification_report(y_test, y_pred_nb)

# 結果の表示
print("SVM (Linear) Accuracy:\n", accuracy_svm_linear)
print("SVM (Linear) Classification Report:\n", classification_rep_svm_linear)

print("\nSVM (RBF) Accuracy:\n", accuracy_svm_rbf)
print("SVM (RBF) Classification Report:\n", classification_rep_svm_rbf)

print("\nRandom Forest Accuracy:\n", accuracy_rf)
print("Random Forest Classification Report:\n", classification_rep_rf)

print("\nLogistic Regression Accuracy:\n", accuracy_logreg)
print("Logistic Regression Classification Report:\n", classification_rep_logreg)

print("\nDecision Tree Accuracy:\n", accuracy_dt)
print("Decision Tree Classification Report:\n", classification_rep_dt)

print("\nK-Nearest Neighbors Accuracy:\n", accuracy_knn)
print("K-Nearest Neighbors Classification Report:\n", classification_rep_knn)

# print("\nXGBoost Accuracy:\n", accuracy_xgb)
# print("XGBoost Classification Report:\n", classification_rep_xgb)

print("\nNaive Bayes Accuracy:\n", accuracy_nb)
print("Naive Bayes Classification Report:\n", classification_rep_nb)


SVM (Linear) Accuracy:
 0.5
SVM (Linear) Classification Report:
               precision    recall  f1-score   support

   albatross       0.48      0.68      0.57        19
     cheetah       0.54      0.33      0.41        21

    accuracy                           0.50        40
   macro avg       0.51      0.51      0.49        40
weighted avg       0.51      0.50      0.48        40


SVM (RBF) Accuracy:
 0.625
SVM (RBF) Classification Report:
               precision    recall  f1-score   support

   albatross       0.61      0.58      0.59        19
     cheetah       0.64      0.67      0.65        21

    accuracy                           0.62        40
   macro avg       0.62      0.62      0.62        40
weighted avg       0.62      0.62      0.62        40


Random Forest Accuracy:
 0.5
Random Forest Classification Report:
               precision    recall  f1-score   support

   albatross       0.48      0.68      0.57        19
     cheetah       0.54      0.33      0.4

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## ラベル数 2 ，各ラベルごとのデータ数は 1000

In [6]:
original_df = df.copy()
selected_labels = ['cheetah', 'albatross']  
sampled_df = random_sample_by_label(original_df, sample_size=1000, selected_labels=selected_labels)

sampled_df

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,cheetah,3927.678876,-970.820148,926.234376,8354.965704,-1444.562035,733.551254,775.392766,-2204.160418,1991.068556,...,-383.314987,836.321862,-167.737536,155.635763,28.396193,-326.276349,317.260028,261.071826,-48.154635,-266.364229
1,cheetah,-12901.336469,-2020.163362,-2813.537940,1559.552536,568.873003,-721.134764,-2190.422034,928.041977,-117.235123,...,-239.839525,-18.464322,204.930955,86.645856,47.325411,1.298196,-135.692596,174.109791,-236.751453,-113.355846
2,cheetah,-2651.689022,12516.271205,3255.240645,868.682800,673.155328,-432.339572,-4000.427500,-2471.144523,-1554.622771,...,11.757454,192.660995,-122.615055,-201.713039,32.190923,-15.392368,126.347606,-111.187874,-242.953080,275.618284
3,cheetah,-238.330422,-524.536914,-226.290882,-2005.694037,-1964.485841,-743.771503,460.439080,-1229.393784,61.429087,...,-217.110972,255.426007,-95.527197,-144.122943,231.937847,-349.420038,-244.240717,-47.456660,46.924611,575.171720
4,cheetah,2648.825409,695.448219,95.602707,796.117338,-463.729982,-2025.147529,-666.479952,-2045.493594,-166.749532,...,-158.614595,55.097136,384.585690,-5.425583,276.438089,243.310228,557.055430,-105.910057,160.153754,-272.175591
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,albatross,844.437065,378.200162,2685.682448,2153.760181,1497.156581,114.868815,76.992372,624.496587,-1210.334120,...,61.701614,-181.831461,63.133138,199.690703,-32.992910,134.411144,59.888706,32.572970,277.962175,-371.431583
1996,albatross,-3853.239617,678.028919,-3345.748331,212.217250,-1033.715043,-1436.311163,8.651006,505.583481,-1188.697371,...,-39.737225,-302.806979,468.391057,296.714824,-48.545503,232.462549,112.748991,325.951665,600.464918,-108.089411
1997,albatross,-2374.164091,4226.097385,-923.220861,-390.596550,1122.594942,-248.027486,2192.531925,912.705360,-213.632803,...,402.753224,-48.596443,-250.830180,82.581943,-43.480590,181.771868,44.248727,3.497554,-240.610385,-20.611566
1998,albatross,-15618.763318,2153.342297,3195.055594,1569.037667,-560.089472,-3941.548803,2524.292696,173.406761,-683.616902,...,-351.484770,64.342979,197.750594,103.805486,-431.560110,327.798326,-250.527510,-281.746001,-279.590561,272.329562


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# データを読み込む
data = sampled_df.copy()

# 特徴量とラベルを分ける
X = data.drop('animal', axis=1)
y = data['animal']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVMモデルの作成と学習
svm_model = SVC(kernel='linear')  # カーネルは適宜変更可能
svm_model.fit(X_train, y_train)

# テストデータでの予測
y_pred = svm_model.predict(X_test)

# 分類性能の評価
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [None]:
print(accuracy)
print()
print(classification_rep)