In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
# Preprocessing plan for the raw data:
#   1. Replace the NaN values in the original data with the column mean.
#   2. Fill the missing entries of the 'sex' column at random.
#   3. Re-create the result as penguins_size_filled.csv.

# Location of the raw CSV file.
file_path = "penguins_size.csv"

# Load the raw CSV into a pandas DataFrame.
df = pd.read_csv(file_path)

In [3]:
# Fill missing numeric measurements with the column mean, then round each
# column: 1 decimal for the millimetre measurements, 0 decimals for body mass.
ROUND_DECIMALS = {
    'culmen_length_mm': 1,
    'culmen_depth_mm': 1,
    'flipper_length_mm': 1,
    'body_mass_g': 0,
}
for column, decimals in ROUND_DECIMALS.items():
    df[column] = df[column].fillna(df[column].mean()).round(decimals)

# Assign a random value to every missing entry of the 'sex' column.
# A dedicated, seeded generator is used so the imputed values are identical on
# every Restart & Run All; the global `random` state is left untouched.
sex_values = ['Male', 'Female']
missing_sex_indices = df['sex'].isnull()
sex_rng = random.Random(42)
random_sex = [sex_rng.choice(sex_values) for _ in range(int(missing_sex_indices.sum()))]
df.loc[missing_sex_indices, 'sex'] = random_sex

# Normalise the labels to upper case.
df['sex'] = df['sex'].str.upper()

In [4]:
# Print the DataFrame after the missing-value handling to inspect the result.
print(df)

    species     island  culmen_length_mm  culmen_depth_mm  flipper_length_mm  \
0    Adelie  Torgersen              39.1             18.7              181.0   
1    Adelie  Torgersen              39.5             17.4              186.0   
2    Adelie  Torgersen              40.3             18.0              195.0   
3    Adelie  Torgersen              43.9             17.2              200.9   
4    Adelie  Torgersen              36.7             19.3              193.0   
..      ...        ...               ...              ...                ...   
339  Gentoo     Biscoe              43.9             17.2              200.9   
340  Gentoo     Biscoe              46.8             14.3              215.0   
341  Gentoo     Biscoe              50.4             15.7              222.0   
342  Gentoo     Biscoe              45.2             14.8              212.0   
343  Gentoo     Biscoe              49.9             16.1              213.0   

     body_mass_g     sex  
0         37

In [5]:
# One-hot encoding of the categorical columns.
#
# dtype is pinned to uint8 because pandas >= 2.0 returns boolean dummy columns
# by default; forcing 0/1 integers keeps the exported CSV and the numeric
# feature matrix built from it the same on every pandas version.

# 1) species
one_hot_encoded_species = pd.get_dummies(df['species'], prefix='species', dtype='uint8')

# 2) island
one_hot_encoded_island = pd.get_dummies(df['island'], prefix='island', dtype='uint8')

# 3) sex
one_hot_encoded_sex = pd.get_dummies(df['sex'], prefix='sex', dtype='uint8')

# Append the indicator columns to the original frame in one pass
# (column order: original columns, island, species, sex).
df_final = pd.concat(
    [df, one_hot_encoded_island, one_hot_encoded_species, one_hot_encoded_sex],
    axis=1,
)

In [6]:
# Check: print the frame with the one-hot columns appended.
print(df_final)

# Re-create the output file.
# A later cell reloads "penguins_size_filled.csv", so the export must actually
# run; otherwise the notebook fails on a fresh kernel / clean checkout.
df_final.to_csv("penguins_size_filled.csv", index=False)

    species     island  culmen_length_mm  culmen_depth_mm  flipper_length_mm  \
0    Adelie  Torgersen              39.1             18.7              181.0   
1    Adelie  Torgersen              39.5             17.4              186.0   
2    Adelie  Torgersen              40.3             18.0              195.0   
3    Adelie  Torgersen              43.9             17.2              200.9   
4    Adelie  Torgersen              36.7             19.3              193.0   
..      ...        ...               ...              ...                ...   
339  Gentoo     Biscoe              43.9             17.2              200.9   
340  Gentoo     Biscoe              46.8             14.3              215.0   
341  Gentoo     Biscoe              50.4             15.7              222.0   
342  Gentoo     Biscoe              45.2             14.8              212.0   
343  Gentoo     Biscoe              49.9             16.1              213.0   

     body_mass_g     sex  island_Biscoe

In [7]:
# Independent variables (features)
# English summary of the note string below:
#   species             - string naming the penguin species
#   island              - name of the island near Palmer Station where the samples were collected
#   Culmen Length (mm)  - horizontal length of the bill, seen in profile
#   Culmen Depth (mm)   - vertical depth of the bill, seen in profile
#   Flipper Length (mm) - length of the penguin's flipper (wing)
# NOTE(review): 'sex' is not listed here, although the feature matrix built
# later includes the sex_FEMALE / sex_MALE columns — confirm whether it belongs in this list.
'''
species : 펭귄의 종을 나타내는 문자열
island : 샘플들이 수집된 Palmer Station 근처 섬 이름
Culmen Length (mm) : 펭귄 옆모습 기준 부리의 가로 길이
Culmen Depth (mm) : 펭귄 옆모습 기준 부리의 세로 길이
Flipper Length (mm) : 펭귄의 팔(날개) 길이
'''
# Dependent variable (target)
# English summary of the note string below:
#   Body Mass - number giving the penguin's body mass (g)
# This string is the cell's last expression, so its value is echoed as the cell output.
'''
Body Mass : 펭귄의 몸무게를 나타내는 숫자 (g)
'''

'\nBody Mass : 펭귄의 몸무게를 나타내는 숫자 (g)\n'

In [8]:
import pandas as pd
import numpy as np
import random

# Path of the preprocessed CSV file.
# Note: this rebinds `file_path`, which previously pointed at the raw CSV.
file_path = "penguins_size_filled.csv"

# Read the CSV file into a pandas DataFrame.
# Note: this rebinds `df`; from here on it holds the data loaded from the filled file.
df = pd.read_csv(file_path)

In [9]:
# Print the first 10 rows of the reloaded data.
print(df.head(10))

  species     island  culmen_length_mm  culmen_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen              39.1             18.7              181.0   
1  Adelie  Torgersen              39.5             17.4              186.0   
2  Adelie  Torgersen              40.3             18.0              195.0   
3  Adelie  Torgersen              43.9             17.2              200.9   
4  Adelie  Torgersen              36.7             19.3              193.0   
5  Adelie  Torgersen              39.3             20.6              190.0   
6  Adelie  Torgersen              38.9             17.8              181.0   
7  Adelie  Torgersen              39.2             19.6              195.0   
8  Adelie  Torgersen              34.1             18.1              193.0   
9  Adelie  Torgersen              42.0             20.2              190.0   

   body_mass_g     sex  island_Biscoe  island_Dream  island_Torgersen  ...  \
0       3750.0    MALE              0             0            

In [10]:
# Pull individual columns out of the DataFrame as NumPy arrays.
species, island, sex = (
    np.array(df[name]) for name in ('species', 'island', 'sex')
)
culmen_length_mm, culmen_depth_mm, flipper_length_mm, body_mass_g = (
    np.array(df[name])
    for name in ('culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g')
)

In [11]:
# Data preparation
# Feature matrix: the three numeric measurements followed by the one-hot
# indicator columns for island, species and sex.
measurement_columns = ['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm']
indicator_columns = [
    'island_Biscoe', 'island_Dream', 'island_Torgersen',
    'species_Adelie', 'species_Chinstrap', 'species_Gentoo',
    'sex_FEMALE', 'sex_MALE',
]
pg_input = df[measurement_columns + indicator_columns].to_numpy()

# Preview the first five rows.
pg_input[:5]

array([[ 39.1,  18.7, 181. ,   0. ,   0. ,   1. ,   1. ,   0. ,   0. ,
          0. ,   1. ],
       [ 39.5,  17.4, 186. ,   0. ,   0. ,   1. ,   1. ,   0. ,   0. ,
          1. ,   0. ],
       [ 40.3,  18. , 195. ,   0. ,   0. ,   1. ,   1. ,   0. ,   0. ,
          1. ,   0. ],
       [ 43.9,  17.2, 200.9,   0. ,   0. ,   1. ,   1. ,   0. ,   0. ,
          1. ,   0. ],
       [ 36.7,  19.3, 193. ,   0. ,   0. ,   1. ,   1. ,   0. ,   0. ,
          1. ,   0. ]])

In [12]:
# Target vector: the 'body_mass_g' column as a NumPy array.
pg_target = df['body_mass_g'].to_numpy()
# Preview the first five target values.
pg_target[:5]

array([3750., 3800., 3250., 4202., 3450.])

In [27]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split

# No test_size is passed, so scikit-learn's default split ratio applies;
# random_state=42 fixes the shuffle so the split is the same on every run.
train_input, test_input, train_target, test_target = train_test_split(
    pg_input, pg_target, random_state=42)

In [28]:
# Feature scaling

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both the training and the test split.
ss = StandardScaler().fit(train_input)
train_scaled, test_scaled = ss.transform(train_input), ss.transform(test_input)

In [29]:
# Without scaling
# k-nearest-neighbours classification model
#
# NOTE(review): the target is body_mass_g, a numeric quantity, so the classifier
# treats every distinct mass value as its own class and the accuracy printed
# below is very low. A regressor would be the natural model for this target,
# but the following cells rely on classifier-only attributes
# (classes_, predict_proba) — revisit them together.

from sklearn.neighbors import KNeighborsClassifier

# Use 10 neighbours; fit() returns the fitted estimator itself.
kn = KNeighborsClassifier(n_neighbors=10).fit(train_input, train_target)

# Accuracy on the training split, then on the test split.
for features, labels in ((train_input, train_target), (test_input, test_target)):
    print(kn.score(features, labels))

0.18992248062015504
0.046511627906976744


In [30]:
# Print the class labels the fitted classifier knows about.
print(kn.classes_)

[2700. 2850. 2900. 2975. 3000. 3050. 3075. 3150. 3175. 3200. 3250. 3275.
 3300. 3325. 3350. 3400. 3425. 3450. 3475. 3500. 3525. 3550. 3575. 3600.
 3625. 3650. 3675. 3700. 3725. 3750. 3775. 3800. 3850. 3875. 3900. 3950.
 4000. 4050. 4075. 4100. 4150. 4200. 4202. 4250. 4300. 4350. 4375. 4400.
 4450. 4475. 4500. 4550. 4575. 4600. 4650. 4700. 4725. 4750. 4800. 4850.
 4875. 4900. 4925. 4950. 4975. 5000. 5050. 5100. 5150. 5200. 5250. 5300.
 5350. 5400. 5500. 5550. 5650. 5700. 5800. 5850. 5950. 6000. 6050. 6300.]


In [31]:
# Predict the class for the first five (unscaled) test samples.
print(kn.predict(test_input[:5]))

[3300. 3300. 4200. 3525. 4050.]


In [32]:
# Per-class probabilities for the first five (unscaled) test samples,
# rounded to four decimals.
proba = kn.predict_proba(test_input[:5])
print(proba.round(4))

# Nearest neighbours of the test sample at index 3: print the training targets
# found at the returned neighbour indexes.
distances, indexes = kn.kneighbors(test_input[3:4])
print(train_target[indexes])

[[0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.1 0.  0.  0.  0.  0.1
  0.  0.  0.  0.  0.  0.1 0.  0.  0.1 0.1 0.1 0.1 0.1 0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.1 0.  0.  0.  0.  0.  0.  0.1 0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.1 0.  0.  0.  0.  0.
  0.  0.1 0.  0.  0.  0.  0.  0.1 0.  0.  0.  0.  0.  0.  0.1 0.  0.1 0.
  0.  0.  0.1 0.  0.1 0.  0.1 0.  0.  0.  0.  0.1 0.  0.  0.  0.  0.  0.
  0.  0.1 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.2 0.  0.  0.1 0.  0.  0.1 0.1 0.  0.  0.  0.  0.2
  0.  0.  0.  0.  0.2 0.1 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.

In [33]:
# With scaling
# k-nearest-neighbours classification model

from sklearn.neighbors import KNeighborsClassifier

# Use 10 neighbours (same setting as the unscaled run, so the two are comparable);
# fit() returns the fitted estimator itself.
kn = KNeighborsClassifier(n_neighbors=10).fit(train_scaled, train_target)

# Accuracy on the scaled training split, then on the scaled test split.
for features, labels in ((train_scaled, train_target), (test_scaled, test_target)):
    print(kn.score(features, labels))

0.21705426356589147
0.046511627906976744


In [34]:
# Predict the class for the first five scaled test samples.
print(kn.predict(test_scaled[:5]))

[3300. 3300. 4200. 3350. 4200.]


In [35]:
# Per-class probabilities for the first five scaled test samples,
# rounded to four decimals.
proba = kn.predict_proba(test_scaled[:5])
print(proba.round(4))

# Nearest neighbours of the scaled test sample at index 3: print the training
# targets found at the returned neighbour indexes.
distances, indexes = kn.kneighbors(test_scaled[3:4])
print(train_target[indexes])

[[0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.1 0.  0.  0.  0.  0.1
  0.  0.  0.  0.  0.  0.1 0.  0.1 0.  0.1 0.  0.1 0.  0.1 0.  0.  0.  0.1
  0.  0.  0.  0.  0.1 0.  0.  0.  0.  0.  0.  0.  0.1 0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.1 0.  0.  0.1 0.  0.1
  0.  0.  0.  0.  0.1 0.1 0.  0.1 0.  0.  0.  0.  0.  0.1 0.1 0.  0.1 0.
  0.  0.  0.  0.  0.1 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.1 0.  0.  0.1 0.  0.  0.1 0.  0.  0.  0.  0.  0.1
  0.1 0.  0.  0.1 0.  0.1 0.1 0.  0.1 0.  0.  0.  0.  0.1 0.  0.  0.  0.
  0.  0.  0.  0.  