### 지도학습
- 학습 시 결과 데이터(정답)가 함께 주어지며, 새로운 데이터의 결과를 예측하는 것을 목표로 한다.
- 분류 : 결과 데이터가 범주형(혈액형, 성별 등)인 경우

In [1]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 불필요한 경고 안뜨게
import warnings
warnings.filterwarnings('ignore')

# 데이터 전처리 알고리즘(비지도학습)
# 문자열 데이터를 숫자로 변환한다.
from sklearn.preprocessing import LabelEncoder
# 표준편차 기반으로 표준화(컬럼 당 숫자 편차가 다르니까.)
# 잘못된 학습을 정상화 시키는 목적으로 사용함
from sklearn.preprocessing import StandardScaler # 젤 유명 표준화

# train data랑 test data 나눔
from sklearn.model_selection import train_test_split

#교차검증
from sklearn.model_selection import cross_val_score

# 평가 함수
# 정확도 평가 함수
from sklearn.metrics import accuracy_score

# 학습 알고리즘 - 분류
# 최 근접 이웃
# 학습시 : 주어진 데이터를 저장만 한다
# 예측시 : 주변의 데이터를 보고 가장 많은 결과로 결정한다
from sklearn.neighbors import KNeighborsClassifier

# 선형
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# 트리
# 학습시 : 주어진 데이터를 갖고 질문을 생성한다
# 예측시 : 질문을 통해 최종 결과 예측
from sklearn.tree import DecisionTreeClassifier

# 앙상블 - 다수의 알고리즘이 던지는 결과를 취합하여 최종 결과를 결정함
# 트리들을 사용한다
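# Training: each tree gets a bootstrap sample of the training data, i.e. rows drawn
# at random WITH replacement (by default as many rows as the original data set).
# One such sample is drawn per tree, and every tree is trained on its own sample.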
# 예측시 : 각 트리가 던지는 결과를 취합하여 다수결의 원칙으로 최종결과 결정
from sklearn.ensemble import RandomForestClassifier

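# Boosting - an ensemble whose learners are trained one after another; each new
# learner concentrates on the samples the previous learners predicted wrongly.
# Prediction combines the outputs of all learners, as in other ensembles.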
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

### 데이터를 불러온다

In [2]:
# Load the CSV file into a DataFrame.
# The first line of the file is used as the column names (pandas default).
df1 = pd.read_csv('data/breast_cancer.csv')
df1

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.6656,0.7119,0.26540,0.4601,0.11890,malignant
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.1866,0.2416,0.18600,0.2750,0.08902,malignant
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.4245,0.4504,0.24300,0.3613,0.08758,malignant
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.8663,0.6869,0.25750,0.6638,0.17300,malignant
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.2050,0.4000,0.16250,0.2364,0.07678,malignant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449,21.10,20.52,138.10,1384.0,0.09684,0.11750,0.15720,0.11550,0.1554,0.05661,...,32.07,168.20,2022.0,0.13680,0.3101,0.4399,0.22800,0.2268,0.07425,malignant
450,11.87,21.54,76.83,432.0,0.06613,0.10640,0.08777,0.02386,0.1349,0.06612,...,28.18,83.51,507.2,0.09457,0.3399,0.3218,0.08750,0.2305,0.09952,benign
451,19.59,25.00,127.70,1191.0,0.10320,0.09871,0.16550,0.09063,0.1663,0.05391,...,30.96,139.80,1421.0,0.15280,0.1845,0.3977,0.14660,0.2293,0.06091,malignant
452,12.00,28.23,76.77,442.5,0.08437,0.06450,0.04055,0.01945,0.1615,0.06104,...,37.88,85.07,523.7,0.12080,0.1856,0.1811,0.07116,0.2447,0.08194,benign


### 데이터 전처리
- 결측치(데이터가 없는 공간)에 대한 처리
- 이상치에 대한 처리


In [3]:
# Missing-value check: count the NaN entries in every column.
df1.isna().sum()

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64

In [4]:
# Outlier check for a continuous column: look at its summary statistics
# (here the minimum and maximum of 'mean area').
df1['mean area'].agg(['min', 'max'])

min     143.5
max    2499.0
Name: mean area, dtype: float64

In [5]:
# Outlier check for a categorical column: list the distinct labels so that
# unexpected or misspelled categories stand out.
df1['target'].value_counts().index

Index(['benign', 'malignant'], dtype='object')

In [6]:
# Inspect the dtype and non-null count of every column.
# int: integer, float: real number, datetime: date/time, object: typically strings
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 454 entries, 0 to 453
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              454 non-null    float64
 1   mean texture             454 non-null    float64
 2   mean perimeter           454 non-null    float64
 3   mean area                454 non-null    float64
 4   mean smoothness          454 non-null    float64
 5   mean compactness         454 non-null    float64
 6   mean concavity           454 non-null    float64
 7   mean concave points      454 non-null    float64
 8   mean symmetry            454 non-null    float64
 9   mean fractal dimension   454 non-null    float64
 10  radius error             454 non-null    float64
 11  texture error            454 non-null    float64
 12  perimeter error          454 non-null    float64
 13  area error               454 non-null    float64
 14  smoothness error         4

In [7]:
# Convert the string labels into numbers.
# LabelEncoder inspects the values it is given and maps each distinct value to an
# integer, starting at 0 and increasing by 1.
# It can also map the integers back to the original strings (inverse_transform).
# NOTE(review): the 'target' column is overwritten in place, so running this cell a
# second time re-fits the encoder on the integers and the string labels are lost.
encoder1 = LabelEncoder()
df1['target'] = encoder1.fit_transform(df1['target'])
df1

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.6656,0.7119,0.26540,0.4601,0.11890,1
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.1866,0.2416,0.18600,0.2750,0.08902,1
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.4245,0.4504,0.24300,0.3613,0.08758,1
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.8663,0.6869,0.25750,0.6638,0.17300,1
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.2050,0.4000,0.16250,0.2364,0.07678,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449,21.10,20.52,138.10,1384.0,0.09684,0.11750,0.15720,0.11550,0.1554,0.05661,...,32.07,168.20,2022.0,0.13680,0.3101,0.4399,0.22800,0.2268,0.07425,1
450,11.87,21.54,76.83,432.0,0.06613,0.10640,0.08777,0.02386,0.1349,0.06612,...,28.18,83.51,507.2,0.09457,0.3399,0.3218,0.08750,0.2305,0.09952,0
451,19.59,25.00,127.70,1191.0,0.10320,0.09871,0.16550,0.09063,0.1663,0.05391,...,30.96,139.80,1421.0,0.15280,0.1845,0.3977,0.14660,0.2293,0.06091,1
452,12.00,28.23,76.77,442.5,0.08437,0.06450,0.04055,0.01945,0.1615,0.06104,...,37.88,85.07,523.7,0.12080,0.1856,0.1811,0.07116,0.2447,0.08194,0


In [8]:
# Separate the independent variables (features, X) from the dependent variable (label, y).
X = df1.drop('target', axis = 1)
y = df1['target']

# Show both for a quick visual check.
display(X)
display(y)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.38,17.33,184.60,2019.0,0.16220,0.6656,0.7119,0.26540,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.99,23.41,158.80,1956.0,0.12380,0.1866,0.2416,0.18600,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.57,25.53,152.50,1709.0,0.14440,0.4245,0.4504,0.24300,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.91,26.50,98.87,567.7,0.20980,0.8663,0.6869,0.25750,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.54,16.67,152.20,1575.0,0.13740,0.2050,0.4000,0.16250,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449,21.10,20.52,138.10,1384.0,0.09684,0.11750,0.15720,0.11550,0.1554,0.05661,...,25.68,32.07,168.20,2022.0,0.13680,0.3101,0.4399,0.22800,0.2268,0.07425
450,11.87,21.54,76.83,432.0,0.06613,0.10640,0.08777,0.02386,0.1349,0.06612,...,12.79,28.18,83.51,507.2,0.09457,0.3399,0.3218,0.08750,0.2305,0.09952
451,19.59,25.00,127.70,1191.0,0.10320,0.09871,0.16550,0.09063,0.1663,0.05391,...,21.44,30.96,139.80,1421.0,0.15280,0.1845,0.3977,0.14660,0.2293,0.06091
452,12.00,28.23,76.77,442.5,0.08437,0.06450,0.04055,0.01945,0.1615,0.06104,...,13.09,37.88,85.07,523.7,0.12080,0.1856,0.1811,0.07116,0.2447,0.08194


0      1
1      1
2      1
3      1
4      1
      ..
449    1
450    0
451    1
452    0
453    0
Name: target, Length: 454, dtype: int32

In [9]:
# Standardize the input features.
# Bringing every column to a comparable range keeps training from being distorted
# by columns with much larger values.
# NOTE(review): the scaler is fitted on all rows here, before the hold-out split and
# the cross-validation further down, so the validation rows influence the scaling
# statistics (mild data leakage in the reported scores). Fitting the scaler inside
# each split, e.g. with a Pipeline, would avoid that.
scaler1 = StandardScaler()
X = scaler1.fit_transform(X)
X

array([[ 1.0716471 , -2.11021126,  1.24691885, ...,  2.21627442,
         2.57629265,  1.89296305],
       [ 1.80872514, -0.29975355,  1.665741  , ...,  1.02843583,
        -0.29491256,  0.27048275],
       [ 1.55731868,  0.55280299,  1.54548514, ...,  1.88116631,
         1.04374227,  0.19229093],
       ...,
       [ 1.52874976,  1.47150615,  1.4501098 , ...,  0.43900458,
        -1.00379467, -1.25588676],
       [-0.63963097,  2.26281581, -0.66183197, ..., -0.68959168,
        -0.76491536, -0.11396037],
       [ 0.0831626 , -1.22825622,  0.04684827, ..., -0.15491471,
        -0.51828023, -0.32247189]])

### 데이터에 적합한 학습 알고리즘을 선택

In [10]:
# Validation approach 1: hold-out.
# Split the data into a training part and a validation part and test every model on it.
# The rows are shuffled randomly before splitting. No test_size is passed, so the
# scikit-learn default applies: 25% validation / 75% training (not 80/20).
# Fixing random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 1)

In [11]:
# Create the candidate models, all with default hyperparameters.
# (In Jupyter, Shift+Tab shows the parameter documentation.)
model1 = KNeighborsClassifier() # author's rule of thumb: fast, lower accuracy - NOTE(review): verify against the scores
model2 = LogisticRegression()
model3 = SVC()
model4 = DecisionTreeClassifier()
model5 = RandomForestClassifier()
model6 = AdaBoostClassifier()
model7 = GradientBoostingClassifier() # author's rule of thumb: slow, higher accuracy - NOTE(review): verify against the scores

In [12]:
# Fit every candidate model on the training split.
model1.fit(X_train, y_train)
model2.fit(X_train, y_train)
model3.fit(X_train, y_train)
model4.fit(X_train, y_train)
model5.fit(X_train, y_train)
model6.fit(X_train, y_train)
# The value of this last call is what the notebook echoes as the cell output.
model7.fit(X_train, y_train)

GradientBoostingClassifier()

In [14]:
# Predict the validation split with every fitted model, in model order.
pred1, pred2, pred3, pred4, pred5, pred6, pred7 = [
    clf.predict(X_test)
    for clf in (model1, model2, model3, model4, model5, model6, model7)
]

In [15]:
# Evaluate: accuracy of each model's predictions against the validation labels.
r1, r2, r3, r4, r5, r6, r7 = [
    accuracy_score(y_test, predicted)
    for predicted in (pred1, pred2, pred3, pred4, pred5, pred6, pred7)
]

In [16]:
# Pick a model with at least 80% accuracy and acceptable speed.
# One accuracy per line, in model order (model1 .. model7).
print(r1, r2, r3, r4, r5, r6, r7, sep='\n')

0.9824561403508771
0.9649122807017544
0.9736842105263158
0.8947368421052632
0.9473684210526315
0.9473684210526315
0.9385964912280702


- 속도 : kNN > 선형 > 트리 > 앙상블 > 부스팅
- 예측 : 부스팅 > 앙상블 > 트리 > 선형 > 최근접이웃 일 가능성이 높다.
- 성능이 80% 이상으로 만족할 만한 알고리즘 중에서 속도가 가장 빠른 것을 선택한다.
- 프로젝트에 따라 성능이 우선시될 수도, 속도가 우선시될 수도 있기 때문에
- 중요한 부분에 따라 적절한 모델을 선택한다.

- 교차검증 : 전체 데이터를 나눈 다음 학습셋과 검증셋을 변경하면서 테스트 해보는 방법
- 10회를 한다고 가정하면 전체 데이터를 10등분한 후, 검증에 쓰는 조각을 바꿔 가면서 학습과 검증을 수행한다.

In [17]:
# Create fresh, unfitted models for cross-validation.
model1 = KNeighborsClassifier()
model2 = LogisticRegression()
model3 = SVC()
model4 = DecisionTreeClassifier()
model5 = RandomForestClassifier()
model6 = AdaBoostClassifier()
model7 = GradientBoostingClassifier()
# Cross-validation clones each unfitted model and runs fit + validation on the clones.

In [18]:
# Run 10-fold cross-validation for every candidate, in model order.
# cross_val_score(estimator, features, labels, scoring metric, number of folds)
# Each result is an array holding one accuracy value per fold.
r1, r2, r3, r4, r5, r6, r7 = [
    cross_val_score(estimator, X, y, scoring='accuracy', cv=10)
    for estimator in (model1, model2, model3, model4, model5, model6, model7)
]

In [20]:
# Print the mean cross-validation accuracy of each model, one per line,
# in model order (model1 .. model7).
print(
    *[fold_scores.mean() for fold_scores in (r1, r2, r3, r4, r5, r6, r7)],
    sep='\n',
)
# Author's observation: with cross-validation, logistic regression comes out higher.

0.9648309178743961
0.9734299516908212
0.9648792270531402
0.9207246376811595
0.9539613526570049
0.9538647342995169
0.9537681159420289


In [21]:
# Select the model with the highest mean cross-validation score
# (model2, the logistic regression).
model2

LogisticRegression()

In [22]:
# Train the selected model on the full data set.
# (Author's note: with plenty of data, fitting on the training split alone is also acceptable.)
best_model = LogisticRegression()
best_model.fit(X,y)

LogisticRegression()

### 미래 데이터 분류

In [23]:
# Load the data to predict on.
df2 = pd.read_csv('data/breast_cancer_new.csv')
df2
# New, unlabeled data: there is no 'target' column.

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,12.62,17.15,80.62,492.9,0.08583,0.05430,0.02966,0.02272,0.1799,0.05826,...,14.340,22.15,91.62,633.5,0.12250,0.15170,0.18870,0.09851,0.3270,0.07330
1,13.38,30.72,86.34,557.2,0.09245,0.07426,0.02819,0.03264,0.1375,0.06016,...,15.050,41.61,96.69,705.6,0.11720,0.14210,0.07003,0.07763,0.2196,0.07675
2,11.63,29.29,74.87,415.1,0.09357,0.08574,0.07160,0.02017,0.1799,0.06166,...,13.120,38.81,86.04,527.8,0.14060,0.20310,0.29230,0.06835,0.2884,0.07220
3,13.21,25.25,84.10,537.9,0.08791,0.05205,0.02772,0.02068,0.1619,0.05584,...,14.350,34.23,91.29,632.9,0.12890,0.10630,0.13900,0.06005,0.2444,0.06788
4,13.00,25.13,82.61,520.2,0.08369,0.05073,0.01206,0.01762,0.1667,0.05449,...,14.340,31.88,91.06,628.5,0.12180,0.10930,0.04462,0.05921,0.2306,0.06291
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.41070,0.22160,0.2060,0.07115
111,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.32150,0.16280,0.2572,0.06637
112,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.34030,0.14180,0.2218,0.07820
113,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.93870,0.26500,0.4087,0.12400


In [24]:
# Preprocessing: scale the new rows with the scaler that was already fitted on the
# training features (transform only, no re-fit).
# NOTE(review): this rebinds X, replacing the training matrix. Re-running the earlier
# training / cross-validation cells after this one would pair these rows with the old y.
X = scaler1.transform(df2)
X

array([[-0.46250369, -0.45164581, -0.50218195, ..., -0.28043066,
         0.5116929 , -0.5831113 ],
       [-0.24537993,  2.87283471, -0.26498762, ..., -0.59279929,
        -1.15425761, -0.39577673],
       [-0.74533596,  2.52250257, -0.7406203 , ..., -0.7316298 ,
        -0.08705654, -0.64284116],
       ...,
       [ 0.67453917,  2.22606768,  0.64563954, ...,  0.3671957 ,
        -1.12013199, -0.31704191],
       [ 1.81729582,  2.53230207,  1.96430728, ...,  2.21029035,
         1.77899418,  2.16989242],
       [-1.85095302,  1.3588119 , -1.85817047, ..., -1.75415837,
        -0.10722167, -0.74112394]])

In [26]:
# Predict the encoded class (0 or 1) for every new row.
pred = best_model.predict(X)
pred

array([0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 0])

In [27]:
# Map the predicted integers back to their original string labels.
result = encoder1.inverse_transform(pred)
result

array(['benign', 'malignant', 'benign', 'benign', 'benign', 'benign',
       'malignant', 'malignant', 'benign', 'benign', 'benign', 'benign',
       'benign', 'benign', 'malignant', 'benign', 'benign', 'benign',
       'benign', 'benign', 'benign', 'benign', 'benign', 'benign',
       'benign', 'malignant', 'benign', 'benign', 'benign', 'benign',
       'benign', 'benign', 'benign', 'malignant', 'benign', 'malignant',
       'benign', 'benign', 'malignant', 'benign', 'benign', 'benign',
       'benign', 'benign', 'malignant', 'malignant', 'benign',
       'malignant', 'benign', 'malignant', 'benign', 'benign', 'benign',
       'benign', 'benign', 'malignant', 'benign', 'benign', 'malignant',
       'benign', 'malignant', 'benign', 'malignant', 'malignant',
       'benign', 'benign', 'benign', 'malignant', 'benign', 'benign',
       'benign', 'benign', 'benign', 'benign', 'benign', 'benign',
       'benign', 'benign', 'benign', 'malignant', 'benign', 'malignant',
       'malignant', 'b

In [28]:
# Attach the predicted labels to the new data and save it as CSV (row index omitted).
# The file name is spelled 'breast_cancer_result.csv' so that it matches the
# 'breast_cancer*.csv' input files read earlier in the notebook.
df2['target'] = result
df2.to_csv('data/breast_cancer_result.csv', index=False)