In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
from functools import reduce
from datetime import datetime, timezone
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score

# 경고 무시
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# 한글 출력
import matplotlib
matplotlib.rcParams['font.family'] = 'Malgun Gothic' 
matplotlib.rcParams['font.size'] = 15 
matplotlib.rcParams['axes.unicode_minus']

True

# 비선형 SVM
- Non-Linear Support Vector Machine

<div style="background-color:rgb(253, 255, 187); padding: 30px; border-radius: 10px; color: black; font-size: 18px;">
    데이터 로드
</div>

### original data

In [2]:
origin_data = pd.read_csv('prep_data.csv')
origin_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 694175 entries, 0 to 694174
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   tier                     694175 non-null  int64  
 1   rank                     694175 non-null  int64  
 2   winRate                  694175 non-null  float64
 3   veteran                  694175 non-null  int64  
 4   freshBlood               694175 non-null  int64  
 5   hotStreak                694175 non-null  int64  
 6   revisionDate             694175 non-null  int64  
 7   summonerLevel            694175 non-null  int64  
 8   championId               694175 non-null  int64  
 9   championLevel            694175 non-null  int64  
 10  championPoints           694175 non-null  int64  
 11  lastPlayTime             694175 non-null  int64  
 12  championSeasonMilestone  694175 non-null  int64  
 13  churn                    694175 non-null  int64  
dtypes: f

### oversampling_data

In [2]:
oversampling_data = pd.read_csv('oversampling_data.csv')
oversampling_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1210694 entries, 0 to 1210693
Data columns (total 14 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   tier                     1210694 non-null  int64  
 1   rank                     1210694 non-null  int64  
 2   winRate                  1210694 non-null  float64
 3   veteran                  1210694 non-null  int64  
 4   freshBlood               1210694 non-null  int64  
 5   hotStreak                1210694 non-null  int64  
 6   revisionDate             1210694 non-null  int64  
 7   summonerLevel            1210694 non-null  int64  
 8   championId               1210694 non-null  int64  
 9   championLevel            1210694 non-null  int64  
 10  championPoints           1210694 non-null  int64  
 11  lastPlayTime             1210694 non-null  int64  
 12  championSeasonMilestone  1210694 non-null  int64  
 13  churn                    1210694 non-null 

### hotStreak_churn_oversampling_data

In [4]:
hotStreak_churn_oversampling_data = pd.read_csv('./hotStreak_churn_oversampling_data.csv')
hotStreak_churn_oversampling_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2289284 entries, 0 to 2289283
Data columns (total 14 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   tier                     int64  
 1   rank                     int64  
 2   winRate                  float64
 3   veteran                  int64  
 4   freshBlood               int64  
 5   revisionDate             int64  
 6   summonerLevel            int64  
 7   championId               int64  
 8   championLevel            int64  
 9   championPoints           int64  
 10  lastPlayTime             int64  
 11  championSeasonMilestone  int64  
 12  hotStreak                int64  
 13  churn                    int64  
dtypes: float64(1), int64(13)
memory usage: 244.5 MB


### hotStreak_oversampling_data

In [5]:
hotStreak_oversampling_data = pd.read_csv('./hotStreak_oversampling_data.csv')
hotStreak_oversampling_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1245718 entries, 0 to 1245717
Data columns (total 14 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   tier                     1245718 non-null  int64  
 1   rank                     1245718 non-null  int64  
 2   winRate                  1245718 non-null  float64
 3   veteran                  1245718 non-null  int64  
 4   freshBlood               1245718 non-null  int64  
 5   revisionDate             1245718 non-null  int64  
 6   summonerLevel            1245718 non-null  int64  
 7   championId               1245718 non-null  int64  
 8   championLevel            1245718 non-null  int64  
 9   championPoints           1245718 non-null  int64  
 10  lastPlayTime             1245718 non-null  int64  
 11  championSeasonMilestone  1245718 non-null  int64  
 12  churn                    1245718 non-null  int64  
 13  hotStreak                1245718 non-null 

### undersampling_data

In [6]:
undersampling_data = pd.read_csv('undersampling_data.csv')
undersampling_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 676721 entries, 0 to 676720
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Unnamed: 0               676721 non-null  int64  
 1   tier                     676721 non-null  int64  
 2   rank                     676721 non-null  int64  
 3   winRate                  676721 non-null  float64
 4   veteran                  676721 non-null  int64  
 5   freshBlood               676721 non-null  int64  
 6   hotStreak                676721 non-null  int64  
 7   revisionDate             676721 non-null  int64  
 8   summonerLevel            676721 non-null  int64  
 9   championId               676721 non-null  int64  
 10  championLevel            676721 non-null  int64  
 11  championPoints           676721 non-null  int64  
 12  lastPlayTime             676721 non-null  int64  
 13  championSeasonMilestone  676721 non-null  int64  
 14  chur

<div style="background-color:rgb(253, 255, 187); padding: 30px; border-radius: 10px; color: black; font-size: 18px;">
    PCA
</div>

In [7]:
# pca = PCA(n_components=2)  # 2차원으로 축소
# X_train_2d = pca.fit_transform(X_train)
# X_test_2d = pca.fit_transform(X_test)

In [8]:
# plt.figure(figsize=(6, 5))
# plt.scatter(X_train_2d[:, 0], X_train_2d[:, 1], alpha=0.5, label="PCA 변환 데이터")

# for i, comp in enumerate(pca.components_):
#     plt.arrow(0, 0, comp[0], comp[1], color='r', alpha=0.7, head_width=0.05)
#     plt.text(comp[0], comp[1], f"PC{i+1}", color='r')

# plt.xlabel("PC1")
# plt.ylabel("PC2")
# plt.legend()
# plt.title("PCA 주성분 방향")
# plt.show()

<div style="background-color:rgb(253, 255, 187); padding: 30px; border-radius: 10px; color: black; font-size: 18px;">
    Machine Learning
</div>

In [3]:
# tier, winRate, veteran, revisionDate, championId
over_X = oversampling_data.drop(['churn', 'revisionDate','championId','tier','winRate','veteran'], axis=1)
over_y = oversampling_data['churn']

over_X_train, over_X_test, over_y_train, over_y_test = train_test_split(over_X, over_y, random_state=0)
over_X_train.shape, over_X_test.shape, over_y_train.shape, over_y_test.shape

((908020, 8), (302674, 8), (908020,), (302674,))

In [9]:
under_X = undersampling_data.drop(['churn'], axis=1)
under_y = undersampling_data['churn']

under_X_train, under_X_test, under_y_train, under_y_test = train_test_split(under_X, under_y, random_state=0)
under_X_train.shape, under_X_test.shape, under_y_train.shape, under_y_test.shape

((507540, 14), (169181, 14), (507540,), (169181,))

In [8]:
origin_X = origin_data.drop([
    "tier", "rank", "winRate","veteran", "hotStreak", "revisionDate", "championId", "championLevel", "championPoints"], axis=1)
origin_y = origin_data['churn']

origin_X_train, origin_X_test, origin_y_train, origin_y_test = train_test_split(origin_X, origin_y, random_state=0)
origin_X_train.shape, origin_X_test.shape, origin_y_train.shape, origin_y_test.shape

((520631, 5), (173544, 5), (520631,), (173544,))

---

### over_linear_score

In [4]:
over_linear_model = LinearSVC()

over_linear_model.fit(over_X_train, over_y_train)

over_linear_score = over_linear_model.score(over_X_test, over_y_test)
print('LinearSVC 정확도:', over_linear_score)

LinearSVC 정확도: 0.6288911502144221


In [7]:
over_y_pred_train = over_linear_model.predict(over_X_train)
over_acc_score_train = accuracy_score(over_y_train, over_y_pred_train)
over_f1_score_train = f1_score(over_y_train, over_y_pred_train)
print(over_acc_score_train)
print(over_f1_score_train)

0.6289046496773198
0.5036822972821701


In [8]:
over_y_pred_test = over_linear_model.predict(over_X_test)
over_acc_score_test = accuracy_score(over_y_test, over_y_pred_test)
over_f1_score_test = f1_score(over_y_test, over_y_pred_test)
print(over_acc_score_test)
print(over_f1_score_test)

0.6288911502144221
0.5042349129845654


In [11]:
origin_linear_model = LinearSVC()

origin_linear_model.fit(origin_X_train, origin_y_train)

origin_linear_score = origin_linear_model.score(origin_X_test, origin_y_test)
print('LinearSVC 정확도:', origin_linear_score)

LinearSVC 정확도: 1.0


In [12]:
origin_y_pred_train = origin_linear_model.predict(origin_X_train)
origin_acc_score_train = accuracy_score(origin_y_train, origin_y_pred_train)
origin_f1_score_train = f1_score(origin_y_train, origin_y_pred_train)
print(origin_acc_score_train)
print(origin_f1_score_train)

1.0
1.0


In [13]:
origin_y_pred_test = origin_linear_model.predict(origin_X_test)
origin_acc_score_test = accuracy_score(origin_y_test, origin_y_pred_test)
origin_f1_score_test = f1_score(origin_y_test, origin_y_pred_test)
print(origin_acc_score_test)
print(origin_f1_score_test)

1.0
1.0


In [None]:
origin_y_pred_train = origin_rbf_model.predict(origin_X_train)
origin_acc_score_train = accuracy_score(origin_y_train, origin_y_pred_train)
origin_f1_score_train = f1_score(origin_y_train, origin_y_pred_train)
print('훈련 데이터 평가:', origin_acc_score_train)
print('훈련 데이터 f1 평가:', origin_f1_score_train)

origin_y_pred_test = origin_rbf_model.predict(origin_X_test)
origin_acc_score_test = accuracy_score(origin_y_test, origin_y_pred_test)
origin_f1_score_test = f1_score(origin_y_test, origin_y_pred_test)
print('평가 데이터 평가:', origin_acc_score_test)
print('평가 데이터 f1 평가:',origin_f1_score_test)

In [None]:
origin_poly_model = SVC()

origin_poly_model.fit(origin_X_train, origin_y_train)

origin_poly_score = origin_poly_model.score(origin_X_test, origin_y_test)

print('poly kernel 정확도:', origin_poly_score)

In [None]:
origin_y_pred_train = origin_poly_model.predict(origin_X_train)
origin_acc_score_train = accuracy_score(origin_y_train, origin_y_pred_train)
origin_f1_score_train = f1_score(origin_y_train, origin_y_pred_train)
print('훈련 데이터 평가:', origin_acc_score_train)
print('훈련 데이터 f1 평가:', origin_f1_score_train)

origin_y_pred_test = origin_poly_model.predict(origin_X_test)
origin_acc_score_test = accuracy_score(origin_y_test, origin_y_pred_test)
origin_f1_score_test = f1_score(origin_y_test, origin_y_pred_test)
print('평가 데이터 평가:', origin_acc_score_test)
print('평가 데이터 f1 평가:',origin_f1_score_test)

# pca

In [9]:
pca = PCA(n_components=2)
over_X_train_2d = pca.fit_transform(over_X_train)
over_X_test_2d = pca.transform(over_X_test)

In [None]:
# rbf_model = SVC(kernel='rbf', cache_size=2048)
linear_model = SVC(kernel='linear')
# poly_model = SVC(kernel='poly')

# rbf_model.fit(over_X_train_2d, over_y_train)
linear_model.fit(over_X_train_2d, over_y_train)
# poly_model.fit(over_X_train_2d, over_y_train)

# rbf_score = rbf_model.score(over_X_test_2d, over_y_test)
linear_score = linear_model.score(over_X_train_2d, over_y_train)
# poly_score = poly_model.score(over_X_test_2d, over_y_test)

print('--- 각 kernel별 정확도 ---')
# print('rbf:', rbf_score)
print('linear:', linear_score)
# print('poly:', poly_score)