In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress every warning for the rest of the session so cell outputs stay clean.
# NOTE(review): this is a blanket filter, so it also hides any pandas/scikit-learn
# warnings raised by later cells.
warnings.filterwarnings('ignore')

In [2]:
# Read the Pokemon data set directly from its raw GitHub URL.
DATASET_URL = "https://raw.githubusercontent.com/yoonkt200/FastCampusDataset/master/Pokemon.csv"
df = pd.read_csv(DATASET_URL)

In [3]:
# Convert data types

In [4]:
# Cast the target flag to int and Generation to str in one pass.
# Generation becomes a string so that it can be one-hot encoded later on.
df = df.astype({'Legendary': int, 'Generation': str})

In [5]:
# Keep only the columns used for modelling (types, base stats, generation, target).
# The explicit .copy() makes this an independent frame: a later cell assigns a new
# column to it, which on a plain slice of `df` raises SettingWithCopyWarning.
preprocessed_df = df[[
    'Type 1', 'Type 2',
    'Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed',
    'Generation', 'Legendary',
]].copy()

In [6]:
# Display the selected columns as this cell's output.
preprocessed_df

Unnamed: 0,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,Grass,Poison,318,45,49,49,65,65,45,1,0
1,Grass,Poison,405,60,62,63,80,80,60,1,0
2,Grass,Poison,525,80,82,83,100,100,80,1,0
3,Grass,Poison,625,80,100,123,122,120,80,1,0
4,Fire,,309,39,52,43,60,50,65,1,0
...,...,...,...,...,...,...,...,...,...,...,...
795,Rock,Fairy,600,50,100,150,100,150,50,6,1
796,Rock,Fairy,700,50,160,110,160,110,110,6,1
797,Psychic,Ghost,600,80,110,60,150,130,70,6,1
798,Psychic,Dark,680,80,160,60,170,130,80,6,1


In [7]:
# One-hot encoding with get_dummies, shown here on the primary type only.
# dtype=int pins the indicator values to 0/1; without it the dtype depends on the
# installed pandas version (bool by default since pandas 2.0).

encoded_df = pd.get_dummies(preprocessed_df['Type 1'], dtype=int)
encoded_df.head()

Unnamed: 0,Bug,Dark,Dragon,Electric,Fairy,Fighting,Fire,Flying,Ghost,Grass,Ground,Ice,Normal,Poison,Psychic,Rock,Steel,Water
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [8]:
# Build the list of types for a single Pokemon

def make_list(x1, x2):
    """Return the types of one Pokemon as a list.

    Parameters
    ----------
    x1 : primary type; always included in the result.
    x2 : secondary type; skipped when it is missing (NaN, None or pd.NA).

    Returns
    -------
    list
        ``[x1]`` when ``x2`` is missing, otherwise ``[x1, x2]``.
    """
    # pd.isna is used instead of an identity check against np.nan: NaN values that
    # are not the np.nan singleton (e.g. float('nan')) must count as missing too.
    if pd.isna(x2):
        return [x1]
    return [x1, x2]

# Store both types of each Pokemon in a single list-valued 'Type' column.
# assign() returns a new frame instead of writing into the existing one (which was
# derived from `df` by column selection), and zipping the two columns avoids a
# row-wise apply over the whole frame.
preprocessed_df = preprocessed_df.assign(
    Type=[
        make_list(primary, secondary)
        for primary, secondary in zip(preprocessed_df['Type 1'], preprocessed_df['Type 2'])
    ]
)
preprocessed_df.head()

Unnamed: 0,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Type
0,Grass,Poison,318,45,49,49,65,65,45,1,0,"[Grass, Poison]"
1,Grass,Poison,405,60,62,63,80,80,60,1,0,"[Grass, Poison]"
2,Grass,Poison,525,80,82,83,100,100,80,1,0,"[Grass, Poison]"
3,Grass,Poison,625,80,100,123,122,120,80,1,0,"[Grass, Poison]"
4,Fire,,309,39,52,43,60,50,65,1,0,[Fire]


In [9]:
# The two original type columns are now redundant with the combined 'Type' column.
preprocessed_df = preprocessed_df.drop(columns=['Type 1', 'Type 2'])
preprocessed_df

Unnamed: 0,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Type
0,318,45,49,49,65,65,45,1,0,"[Grass, Poison]"
1,405,60,62,63,80,80,60,1,0,"[Grass, Poison]"
2,525,80,82,83,100,100,80,1,0,"[Grass, Poison]"
3,625,80,100,123,122,120,80,1,0,"[Grass, Poison]"
4,309,39,52,43,60,50,65,1,0,[Fire]
...,...,...,...,...,...,...,...,...,...,...
795,600,50,100,150,100,150,50,6,1,"[Rock, Fairy]"
796,700,50,160,110,160,110,110,6,1,"[Rock, Fairy]"
797,600,80,110,60,150,130,70,6,1,"[Psychic, Ghost]"
798,680,80,160,60,170,130,80,6,1,"[Psychic, Dark]"


In [10]:
# Apply MultiLabelBinarizer

from sklearn.preprocessing import MultiLabelBinarizer

In [11]:
mlb = MultiLabelBinarizer()
# Turn the list-valued 'Type' column into one indicator column per type label
# (column names come from mlb.classes_, which is populated by fit_transform).
# The indicator frame reuses preprocessed_df's index so the join aligns row by row
# even if the index is not the default 0..n-1 range.
type_indicators = pd.DataFrame(
    mlb.fit_transform(preprocessed_df['Type']),
    columns=mlb.classes_,
    index=preprocessed_df.index,
)
# Replace 'Type' with its indicator columns without mutating the frame in place.
preprocessed_df = preprocessed_df.drop(columns='Type').join(type_indicators)

In [12]:
# Preview the first rows after the type indicator columns were joined in.
preprocessed_df.head()

Unnamed: 0,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Bug,...,Ghost,Grass,Ground,Ice,Normal,Poison,Psychic,Rock,Steel,Water
0,318,45,49,49,65,65,45,1,0,0,...,0,1,0,0,0,1,0,0,0,0
1,405,60,62,63,80,80,60,1,0,0,...,0,1,0,0,0,1,0,0,0,0
2,525,80,82,83,100,100,80,1,0,0,...,0,1,0,0,0,1,0,0,0,0
3,625,80,100,123,122,120,80,1,0,0,...,0,1,0,0,0,1,0,0,0,0
4,309,39,52,43,60,50,65,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Apply one-hot encoding to 'Generation' (cast to str earlier for this purpose).
# The column is named explicitly rather than relying on get_dummies picking up every
# non-numeric column, and dtype=int keeps 0/1 indicators on every pandas version.
preprocessed_df = pd.get_dummies(preprocessed_df, columns=['Generation'], dtype=int)
preprocessed_df.head()

Unnamed: 0,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Legendary,Bug,Dark,...,Psychic,Rock,Steel,Water,Generation_1,Generation_2,Generation_3,Generation_4,Generation_5,Generation_6
0,318,45,49,49,65,65,45,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,405,60,62,63,80,80,60,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,525,80,82,83,100,100,80,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,625,80,100,123,122,120,80,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,309,39,52,43,60,50,65,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [14]:
# Standardize the numeric features (min-max scaling might give better performance).
# NOTE(review): the scaler is fitted on the full data set before the train/test
# split performed in a later cell, so test-set statistics influence the scaling.
from sklearn.preprocessing import StandardScaler

scale_coulums = ['Total','HP','Attack','Defense', 'Sp. Atk', 'Sp. Def', 'Speed']  # features that need scaling
scaler = StandardScaler()
scaled_values = scaler.fit_transform(preprocessed_df[scale_coulums])
preprocessed_df[scale_coulums] = scaled_values
preprocessed_df.head()

Unnamed: 0,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Legendary,Bug,Dark,...,Psychic,Rock,Steel,Water,Generation_1,Generation_2,Generation_3,Generation_4,Generation_5,Generation_6
0,-0.976765,-0.950626,-0.924906,-0.797154,-0.23913,-0.248189,-0.801503,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,-0.251088,-0.362822,-0.52413,-0.347917,0.21956,0.291156,-0.285015,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0.749845,0.420917,0.092448,0.293849,0.831146,1.010283,0.403635,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,1.583957,0.420917,0.647369,1.577381,1.503891,1.729409,0.403635,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,-1.051836,-1.185748,-0.832419,-0.989683,-0.392027,-0.787533,-0.112853,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [15]:
# Split the data set into train and test partitions (75% / 25%).
# NOTE(review): the split is not stratified on the target; with a rare positive
# class, consider passing stratify=y so both partitions keep the same class ratio.
from sklearn.model_selection import train_test_split

y = preprocessed_df['Legendary']
x = preprocessed_df.drop(columns='Legendary')  # every column except the target

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=33
)

In [16]:
# Train a logistic regression model

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# fit() returns the estimator itself, so construction and training can be chained.
lr = LogisticRegression(random_state=0).fit(x_train, y_train)

# Predictions for the held-out test partition
y_pred = lr.predict(x_test)

In [17]:
# Evaluate the model on the test partition.
for metric_name, metric_fn in (
    ("accuracy_score", accuracy_score),
    ("precision_score", precision_score),
    ("recall_score", recall_score),
    ("f1_score", f1_score),
):
    print(f"{metric_name}: ", metric_fn(y_test, y_pred))

# Observation: only the accuracy comes out high; precision, recall and F1 are
# noticeably lower. Check the confusion matrix to see why.
# (Kept as comments: a bare string literal at the end of a cell is echoed back as
# the cell's output.)

accuracy_score:  0.955
precision_score:  0.6153846153846154
recall_score:  0.6666666666666666
f1_score:  0.64


'\n정확도만 높게 나옴\nconfusion_matrix를 통해 확인\n'

In [18]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)

# Interpretation:
# - Almost all answers land in the 183-count cell (true 0, predicted 0), i.e. the
#   classes are imbalanced.
# - With such data, a model that always gives the same (majority-class) answer
#   still reaches a high accuracy.
# - The class ratio therefore has to be balanced before training.
# (Kept as comments: a bare string literal at the end of a cell is echoed back as
# the cell's output.)

[[183   5]
 [  4   8]]


'\n183 케이스에 대하여 너무 많이 답을 내놓음\n= 클래스의 불균형임 \nex) 무조건 맞다라고만해도 정확도가 높아지는 모델이 학습이 됨\n\n= 클래스의 비율을 맞춰줘야함\n'

In [19]:
# Address the class imbalance

# Check the class ratio of the target
preprocessed_df['Legendary'].value_counts()

0    735
1     65
Name: Legendary, dtype: int64

In [20]:
# Undersample so that both classes contribute the same number of rows.
# The per-class sample size is derived from the data (the size of the smaller
# class, 65 for this data set) instead of being hard-coded, so the cell keeps
# working if the class counts change.
positive_rows = preprocessed_df[preprocessed_df['Legendary']==1]
negative_rows = preprocessed_df[preprocessed_df['Legendary']==0]
n_per_class = min(len(positive_rows), len(negative_rows))

positive_random_idx = positive_rows.sample(n_per_class, random_state=33).index.tolist()
negative_random_idx = negative_rows.sample(n_per_class, random_state=33).index.tolist()

In [21]:
# Row labels of the balanced sample: positives first, then negatives.
random_idx = positive_random_idx + negative_random_idx

x = preprocessed_df.drop(columns='Legendary').loc[random_idx]  # exclude only the Legendary column
y = preprocessed_df.loc[random_idx, 'Legendary']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=33
)

In [22]:
# Report the (rows, columns) shape of the train and test feature frames.
for features in (x_train, x_test):
    print(features.shape)

(97, 31)
(33, 31)


In [23]:
# Retrain the model on the balanced sample
lr = LogisticRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)

In [24]:
# Evaluate the retrained model on the test partition.
for metric_name, metric_fn in (
    ("accuracy_score", accuracy_score),
    ("precision_score", precision_score),
    ("recall_score", recall_score),
    ("f1_score", f1_score),
):
    print(f"{metric_name}: ", metric_fn(y_test, y_pred))

accuracy_score:  0.9696969696969697
precision_score:  0.9230769230769231
recall_score:  1.0
f1_score:  0.9600000000000001


In [31]:
# Persist the fully preprocessed frame; the row index is not written to the file.
preprocessed_df.to_csv(
    "file.csv",
    encoding="utf-8-sig",
    index=False,
)