In [None]:
import pandas as pd
# Load the credit-card transactions CSV into a DataFrame and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.
csv_path = "d:/learn/data/creditcard.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
df.describe()

In [None]:
# Mean of every column, computed separately for each value of "Class".
df.groupby("Class").mean()

In [None]:
# Number of rows per "Class" value, to see how imbalanced the labels are.
df["Class"].value_counts()

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import font_manager,rc
font_name=font_manager.FontProperties(fname="c:/windows/fonts/malgun.ttf").get_name()
rc("font",family=font_name)
colors=["yellow","red"]
sns.countplot("Class",data=df,palette=colors)
plt.title("분류\n(0:정상, 1:사기)",fontsize=14)

In [None]:
# Print summary statistics of "Amount" for each class:
# first Class == 1 (fraudulent transactions), then Class == 0 (normal transactions).
for class_label in (1, 0):
    print(df.loc[df["Class"] == class_label, "Amount"].describe())

In [None]:
# Feature columns are positions 1..28 of the frame (position 0 and everything
# from position 29 onward are excluded); the target is the "Class" column.
train_cols = df.columns[1:29]
X = df.loc[:, train_cols]
y = df.loc[:, "Class"]

In [None]:
# # Undersampling (disabled alternative to the SMOTE oversampling used in this notebook)
# from imblearn.under_sampling import RandomUnderSampler
# X_sample,y_sample=RandomUnderSampler(random_state=0).fit_resample(X,y)
# X_samp=pd.DataFrame(data=X_sample,columns=train_cols)
# y_samp=pd.DataFrame(data=y_sample,columns=["Class"])
# df2=pd.concat([X_samp,y_samp],axis=1)

In [None]:
# Oversampling: balance the classes by generating extra minority-class rows with SMOTE.
# NOTE(review): resampling is done on the full data set, before the train/test
# split performed later in this notebook, so synthetic rows can be derived from
# rows that end up in the test split; the reported test accuracy is likely optimistic.
from imblearn.over_sampling import SMOTE
# `fit_resample` is the current sampler API; the older `fit_sample` alias used
# here previously is not available in recent imbalanced-learn releases.
X_sample,y_sample=SMOTE(random_state=0).fit_resample(X,y)
# Wrap the results in DataFrames with explicit column labels; this works whether
# the sampler hands back NumPy arrays or pandas objects.
X_samp=pd.DataFrame(data=X_sample,columns=train_cols)
y_samp=pd.DataFrame(data=y_sample,columns=["Class"])
# Resampled features and label side by side in a single frame.
df2=pd.concat([X_samp,y_samp],axis=1)

In [None]:
# Number of rows per "Class" value in the resampled frame.
df2["Class"].value_counts()

In [None]:
# From here on X / y refer to the resampled data rather than the original frame.
X = X_samp.loc[:, train_cols]
y = y_samp.loc[:, "Class"]

In [None]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=10,
)

In [None]:
# Create an SVM classifier with default hyperparameters and train it on the
# unscaled training features.
from sklearn.svm import SVC

model = SVC(random_state=10)
model.fit(X_train, y_train)

In [None]:
# Accuracy on the training split, then on the test split.
# (Original author's note: this model shows an overfitting problem.)
print(
    model.score(X_train, y_train),
    model.score(X_test, y_test),
    sep="\n",
)

In [None]:
# Preprocessing: min-max scale every feature using statistics taken from the
# training split only, then apply the same shift and scale to the test split
# (test values can therefore fall slightly outside [0, 1]).
min_on_training=X_train.min(axis=0) # per-feature minimum
range_on_training=(X_train - min_on_training).max(axis=0) # per-feature range (max - min)
# A feature that is constant in the training split has range 0, and dividing by
# it would produce NaN/inf. Use 1 instead so such a feature simply maps to 0.
range_on_training[range_on_training == 0]=1
X_train_scaled=(X_train - min_on_training) / range_on_training
X_test_scaled=(X_test - min_on_training) / range_on_training

In [None]:
# Fresh classifier with the same settings, this time trained on the
# min-max scaled training features.
model = SVC(random_state=10)
model.fit(X_train_scaled, y_train)

In [None]:
# Accuracy on the scaled training split, then on the scaled test split.
# (Original author's note: the overfitting problem is resolved after scaling.)
print(
    model.score(X_train_scaled, y_train),
    model.score(X_test_scaled, y_test),
    sep="\n",
)

In [None]:
# Search for the best C and gamma: fit one SVC per (C, gamma) combination on the
# scaled training data and record its accuracy on the training and test splits.
# NOTE(review): the "best" pair is picked by test-split accuracy, so that same
# accuracy is not an unbiased estimate afterwards; a separate validation split
# or cross-validation would avoid this.
import numpy as np
train_rate=[]  # rows of [C, gamma, training accuracy]
test_rate=[]   # rows of [C, gamma, test accuracy]

c_values=[0.001, 0.01, 0.1, 1, 10, 100, 1000]
g_values=[0.0001, 0.001, 0.01, 0.1]

for n in c_values:
    for g in g_values:
        model=SVC(C=n, gamma=g, random_state=20) # create the SVM model
        model.fit(X_train_scaled,y_train) # train the model
        train_rate.append([n,g,model.score(X_train_scaled,y_train)])
        test_rate.append([n,g,model.score(X_test_scaled,y_test)])

train_arr=np.array(train_rate) # convert to NumPy arrays
test_arr=np.array(test_rate)
max_rate=np.max(test_arr[:,2]) # highest test accuracy
# Locate the row with the highest accuracy by looking at the accuracy column
# only. Comparing the whole array against max_rate would also match C or gamma
# cells that happen to equal it (e.g. accuracy 1.0 and C=1) and could return
# the wrong row. argmax returns the first row that reaches the maximum.
idx=int(np.argmax(test_arr[:,2]))
print("최적의 C:",test_rate[idx][0])
print("최적의 gamma:",test_rate[idx][1])
print("최적의 정확도:",test_rate[idx][2])

# Accuracy per combination, in the order the combinations were tried
# (gamma varies fastest, C slowest).
plt.rcParams["font.size"]=15
plt.plot(range(len(train_rate)),train_arr[:,2],label="학습용 데이터셋")
plt.plot(range(len(test_rate)),test_arr[:,2],label="검증용 데이터셋")
plt.ylabel("정확도")
plt.legend()

In [None]:
# Model built with the C and gamma values regarded as optimal.
# NOTE(review): the values are hard-coded rather than read from the search
# results; confirm they match what the search cell printed.
model = SVC(
    C=1000,
    gamma=0.001,
    random_state=10,
)
model.fit(X_train_scaled, y_train)

In [None]:
# Accuracy of the current model on the scaled training split, then on the scaled test split.
print(
    model.score(X_train_scaled, y_train),
    model.score(X_test_scaled, y_test),
    sep="\n",
)