In [None]:
import pandas as pd
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
df=pd.read_csv("d:/data/titanic/train3.csv") # load the data file
# show the first rows as a quick sanity check of what was loaded
df.head()

In [None]:
df.columns # list of column (field) names

In [None]:
df.shape # shape of the dataset as (rows, columns)

In [None]:
df.describe() # basic summary statistics

In [None]:
# Mean of every numeric column, computed separately for each "Survived" class.
# numeric_only=True keeps this working on pandas >= 2.0, where mean() raises
# on non-numeric columns instead of silently dropping them.
df.groupby("Survived").mean(numeric_only=True)

In [None]:
df["Survived"].value_counts() # number of samples per class

In [None]:
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import font_manager,rc
font_name=font_manager.FontProperties(fname="c:/windows/fonts/malgun.ttf").get_name()
rc("font",family=font_name)
colors=["yellow","red"]
sns.countplot("Survived",data=df,palette=colors) #카운트 플롯
plt.title("분류\n(0:사망, 1:생존)",fontsize=14)

In [None]:
# Split the frame into predictors and target.
# NOTE(review): assumes the first six columns are the predictors and that
# "Survived" is not among them — confirm against df.columns.
train_cols = df.columns[:6]
X = df.loc[:, train_cols]    # independent variables
y = df.loc[:, "Survived"]    # dependent variable

In [None]:
# scatter-plot matrix
import mglearn
# points are colored by the value of y, using mglearn's cm3 colormap
pd.plotting.scatter_matrix(df,c=y,figsize=(15,15),marker="o",
                          cmap=mglearn.cm3)

In [None]:
# Under-sampling: randomly resample X/y with RandomUnderSampler to balance the classes.
from imblearn.under_sampling import RandomUnderSampler
sampler=RandomUnderSampler(random_state=0)
# fit_sample() was removed from imbalanced-learn in favor of fit_resample();
# fall back to the old name only on releases that predate fit_resample().
resample=getattr(sampler,"fit_resample",None) or sampler.fit_sample
X_sample,y_sample=resample(X,y)
# The sampler may return NumPy arrays or pandas objects depending on the
# installed version; resetting the index gives both frames the same clean
# 0..n-1 index so the column-wise concat below aligns row by row.
X_samp=pd.DataFrame(data=X_sample,columns=train_cols).reset_index(drop=True)
y_samp=pd.DataFrame(data=y_sample,columns=["Survived"]).reset_index(drop=True)
# df2: under-sampled features and target side by side
df2=pd.concat([X_samp,y_samp],axis=1)

In [None]:
df2["Survived"].value_counts() # class counts after under-sampling

In [None]:
# Rebind X and y to the under-sampled data; the cells below read these names.
X=X_samp[train_cols]
y=y_samp["Survived"]

In [None]:
# Split the dataset 80:20 into training and test parts
from sklearn.model_selection import train_test_split

# stratify on y so both parts keep the same class proportions;
# a fixed random_state makes the split repeatable
split_options = dict(test_size=0.2, stratify=y, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# build the SVM model (default hyperparameters, unscaled features)
from sklearn.svm import SVC
# NOTE(review): per the scikit-learn docs, random_state only matters for SVC
# when probability=True, so it is presumably a no-op here — confirm.
model=SVC(random_state=10)
model.fit(X_train,y_train)

In [None]:
print(model.score(X_train,y_train)) # accuracy on the training split
print(model.score(X_test,y_test)) # accuracy on the test split
# overfitting problem

In [None]:
# Preprocessing: min-max scaling (not z-score standardization).
# The minimum and range are taken from the training split only and then
# applied to both splits, so training features land in [0, 1].
min_on_training=X_train.min(axis=0)
range_on_training=(X_train - min_on_training).max(axis=0)
# A feature that is constant in the training split has range 0; dividing by
# it would produce NaN/inf, so divide by 1 instead (the feature stays at 0
# for the training rows).
range_on_training=range_on_training.replace(0,1)
X_train_scaled=(X_train - min_on_training) / range_on_training
X_test_scaled=(X_test - min_on_training) / range_on_training

In [None]:
# refit the same default SVM, this time on the scaled training features
model=SVC(random_state=10)
model.fit(X_train_scaled,y_train)

In [None]:
print(model.score(X_train_scaled,y_train)) # accuracy on the scaled training split
print(model.score(X_test_scaled,y_test)) # accuracy on the scaled test split

In [None]:
# Search for the best C and gamma by fitting one SVM per (C, gamma) pair.
# NOTE(review): the winner is chosen by test-split accuracy, so the reported
# best score is optimistic; consider selecting on a validation split or with
# cross-validation instead.
import numpy as np
train_rate=[]  # rows of [C, gamma, training accuracy]
test_rate=[]   # rows of [C, gamma, test accuracy]

c_values=[0.001, 0.01, 0.1, 1, 10, 100, 1000]
g_values=[0.0001, 0.001, 0.01, 0.1]

for n in c_values:
    for g in g_values:
        model=SVC(C=n, gamma=g, random_state=20) # build the SVM model
        model.fit(X_train_scaled,y_train) # train the model
        train_rate.append([n,g,model.score(X_train_scaled,y_train)])
        test_rate.append([n,g,model.score(X_test_scaled,y_test)])

train_arr=np.array(train_rate) # convert to NumPy arrays
test_arr=np.array(test_rate)
max_rate=np.max(test_arr[:,2]) # highest test accuracy
# Look for the maximum in the accuracy column only. Comparing the whole array
# against max_rate would also match C or gamma cells that happen to equal it
# (e.g. an accuracy of 1.0 matches the first C=1 row) and pick the wrong row.
# argmax returns the first row that reaches the maximum.
idx=int(np.argmax(test_arr[:,2]))
print("최적의 C:",test_rate[idx][0])
print("최적의 gamma:",test_rate[idx][1])
print("최적의 정확도:",test_rate[idx][2])

# accuracy of every (C, gamma) combination, in the order they were tried
plt.rcParams["font.size"]=15
plt.plot(range(len(train_rate)),train_arr[:,2],label="학습용 데이터셋")
plt.plot(range(len(test_rate)),test_arr[:,2],label="검증용 데이터셋")
plt.ylabel("정확도")
plt.legend()

In [None]:
# model using the best C and gamma values obtained from the experiment
# NOTE(review): C and gamma are hard-coded literals here, not read from the
# search results — confirm they still match after any re-run.
model=SVC(C=1000,gamma=0.1, random_state=10)
model.fit(X_train_scaled,y_train)

In [None]:
print(model.score(X_train_scaled,y_train)) # accuracy of the tuned model on the training split
print(model.score(X_test_scaled,y_test)) # accuracy of the tuned model on the test split

In [None]:
# print the confusion matrix
from sklearn.metrics import confusion_matrix
pred=model.predict(X_test_scaled)
# y_test is passed as the true labels and pred as the predicted labels
cm=confusion_matrix(y_test,pred)
cm

In [None]:
# wrap the confusion matrix in a DataFrame whose rows and columns are labeled 0 and 1
df_cm=pd.DataFrame(cm,index=[0,1],columns=[0,1])
df_cm

In [None]:
# draw the confusion matrix as an annotated heatmap
sns.set(font_scale=1.4)
plt.figure(figsize=(9,6))
plt.title("Confusion Matrix")
# annot=True writes each count into its cell; fmt="d" formats it as an integer
sns.heatmap(df_cm,annot=True,fmt="d")