# 03. SVM 적용

### 서포트 벡터 머신 (SVM ; support vector machine)
- 선형이나 비선형 분류, 회귀, 이상치 탐색에 사용하는 다목적 머신러닝 모델
- 복잡한 분류 문제, 작거나 중간 크기의 데이터 셋에 적합
- 특성의 스케일에 민감
    - 특성의 스케일을 조정하면 결정 경계가 훨씬 좋아짐
    - sklearn의 StandardScaler 사용

**하드 마진 분류**
- 모든 샘플이 도로 바깥쪽에 올바르게 분류
- 데이터가 선형적으로 구분될 수 있어야 제대로 작동
- 이상치에 민감

**소프트 마진 분류**
- 도로의 폭을 가능한 넓게 유지하고 마진 오류 사이에 적절한 균형 잡음

In [74]:
import pymysql
import pandas as pd
from sklearn.svm import SVR, LinearSVR
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Lasso

In [54]:
conn = pymysql.connect(host='localhost', user='ssafy', password='ssafy',
                       db='docong_test_v3', charset='utf8')
cursor = conn.cursor()

getPomoSql = f"SELECT user.seq, user.birth, user.gender, user.job, user.position, user.mbti, todo.work_importance, todo.predicted_pomo, todo.real_pomo, todo.work_proficiency, todo.work_type, pomo.start_time, pomo.end_time, pomo.time_status FROM pomodoro pomo LEFT JOIN todo ON pomo.todo_seq=todo.seq INNER JOIN user ON pomo.user_seq=user.seq WHERE todo.`status`=2;"
columns = ["user_seq", "birth","gender","job","position","mbti","importance","predicted_pomo", "real_pomo", "proficiency","type","start_time","end_time","timeStatus"]

cursor.execute(getPomoSql)
total = pd.DataFrame(cursor.fetchall(), columns=columns)
total.fillna(value=-1, inplace=True)
total.head()

Unnamed: 0,user_seq,birth,gender,job,position,mbti,importance,predicted_pomo,real_pomo,proficiency,type,start_time,end_time,timeStatus
0,2,1998-01-24T00:00:00.000Z,FEMALE,IT/인터넷,0,ISFP,4,12,20,1,4,2022-03-29 17:39:52,2022-03-29 17:54:52,SHORT
1,2,1998-01-24T00:00:00.000Z,FEMALE,IT/인터넷,0,ISFP,4,12,20,1,4,2022-03-29 21:07:02,2022-03-29 21:57:02,LONG
2,2,1998-01-24T00:00:00.000Z,FEMALE,IT/인터넷,0,ISFP,4,12,20,1,4,2022-03-29 22:19:53,2022-03-29 23:09:53,LONG
3,2,1998-01-24T00:00:00.000Z,FEMALE,IT/인터넷,0,ISFP,4,12,20,1,4,2022-03-29 23:17:30,2022-03-29 23:32:30,SHORT
4,2,1998-01-24T00:00:00.000Z,FEMALE,IT/인터넷,0,ISFP,4,12,20,1,4,2022-03-29 23:52:35,2022-03-30 00:17:35,BASIC


### 전처리

In [55]:
# birth -> 년/월/일 로 변경
total['birth_year'] = total.birth.str.split('-').str[0]
total['birth_month'] = total.birth.str.split('-').str[1]
total['birth_day'] = total.birth.str.split('-').str[2].str.split('T').str[0]
total.drop(['birth'], axis=1, inplace=True)

# 결측치는 0으로 채움
total['birth_year'].fillna(value=0, inplace=True)
total['birth_month'].fillna(value=0, inplace=True)
total['birth_day'].fillna(value=0, inplace=True)

total = total.astype({'birth_year': 'int', 'birth_month': 'int', 'birth_day': 'int'})
total.head()

Unnamed: 0,user_seq,gender,job,position,mbti,importance,predicted_pomo,real_pomo,proficiency,type,start_time,end_time,timeStatus,birth_year,birth_month,birth_day
0,2,FEMALE,IT/인터넷,0,ISFP,4,12,20,1,4,2022-03-29 17:39:52,2022-03-29 17:54:52,SHORT,1998,1,24
1,2,FEMALE,IT/인터넷,0,ISFP,4,12,20,1,4,2022-03-29 21:07:02,2022-03-29 21:57:02,LONG,1998,1,24
2,2,FEMALE,IT/인터넷,0,ISFP,4,12,20,1,4,2022-03-29 22:19:53,2022-03-29 23:09:53,LONG,1998,1,24
3,2,FEMALE,IT/인터넷,0,ISFP,4,12,20,1,4,2022-03-29 23:17:30,2022-03-29 23:32:30,SHORT,1998,1,24
4,2,FEMALE,IT/인터넷,0,ISFP,4,12,20,1,4,2022-03-29 23:52:35,2022-03-30 00:17:35,BASIC,1998,1,24


In [56]:
# gender
total['gender'] = total['gender'].map({'MALE': 0, 'FEMALE': 1, -1: -1})

total.head()

Unnamed: 0,user_seq,gender,job,position,mbti,importance,predicted_pomo,real_pomo,proficiency,type,start_time,end_time,timeStatus,birth_year,birth_month,birth_day
0,2,1,IT/인터넷,0,ISFP,4,12,20,1,4,2022-03-29 17:39:52,2022-03-29 17:54:52,SHORT,1998,1,24
1,2,1,IT/인터넷,0,ISFP,4,12,20,1,4,2022-03-29 21:07:02,2022-03-29 21:57:02,LONG,1998,1,24
2,2,1,IT/인터넷,0,ISFP,4,12,20,1,4,2022-03-29 22:19:53,2022-03-29 23:09:53,LONG,1998,1,24
3,2,1,IT/인터넷,0,ISFP,4,12,20,1,4,2022-03-29 23:17:30,2022-03-29 23:32:30,SHORT,1998,1,24
4,2,1,IT/인터넷,0,ISFP,4,12,20,1,4,2022-03-29 23:52:35,2022-03-30 00:17:35,BASIC,1998,1,24


In [57]:
# job
jobs = ['경영/사무', '마케팅/무역/유통', '영업/고객상담', 'IT/인터넷', '연구개발/설계',
        '생산/제조', '전문/특수직', '디자인', '미디어', '서비스', '건설']
map_dic = {-1: -1}
for i in range(len(jobs)):
    map_dic[jobs[i]] = i

total['job'] = total['job'].map(map_dic)
total.head()

Unnamed: 0,user_seq,gender,job,position,mbti,importance,predicted_pomo,real_pomo,proficiency,type,start_time,end_time,timeStatus,birth_year,birth_month,birth_day
0,2,1,3,0,ISFP,4,12,20,1,4,2022-03-29 17:39:52,2022-03-29 17:54:52,SHORT,1998,1,24
1,2,1,3,0,ISFP,4,12,20,1,4,2022-03-29 21:07:02,2022-03-29 21:57:02,LONG,1998,1,24
2,2,1,3,0,ISFP,4,12,20,1,4,2022-03-29 22:19:53,2022-03-29 23:09:53,LONG,1998,1,24
3,2,1,3,0,ISFP,4,12,20,1,4,2022-03-29 23:17:30,2022-03-29 23:32:30,SHORT,1998,1,24
4,2,1,3,0,ISFP,4,12,20,1,4,2022-03-29 23:52:35,2022-03-30 00:17:35,BASIC,1998,1,24


In [58]:
# mbti
mbtis = ['ISTJ', 'ISFJ', 'INFJ', 'INTJ', 'ISTP', 'ISFP', 'INFP', 'INTP',
        'ESTP', 'ESFP', 'ENFP', 'ENTP', 'ESTJ', 'ESFJ', 'ENFJ', 'ENTJ']
map_dic = {-1: -1}
for i in range(len(mbtis)):
    map_dic[mbtis[i]] = i

total['mbti'] = total['mbti'].map(map_dic)
total.head()

Unnamed: 0,user_seq,gender,job,position,mbti,importance,predicted_pomo,real_pomo,proficiency,type,start_time,end_time,timeStatus,birth_year,birth_month,birth_day
0,2,1,3,0,5,4,12,20,1,4,2022-03-29 17:39:52,2022-03-29 17:54:52,SHORT,1998,1,24
1,2,1,3,0,5,4,12,20,1,4,2022-03-29 21:07:02,2022-03-29 21:57:02,LONG,1998,1,24
2,2,1,3,0,5,4,12,20,1,4,2022-03-29 22:19:53,2022-03-29 23:09:53,LONG,1998,1,24
3,2,1,3,0,5,4,12,20,1,4,2022-03-29 23:17:30,2022-03-29 23:32:30,SHORT,1998,1,24
4,2,1,3,0,5,4,12,20,1,4,2022-03-29 23:52:35,2022-03-30 00:17:35,BASIC,1998,1,24


In [59]:
# start_time, end_time

# 실행시간
def during_time(x):
    return x.total_seconds()

total['during_sec'] = (total['end_time'] - total['start_time']).apply(during_time)
total = total.astype({'during_sec': 'int'})

# 시간대
# 0 : 새벽 (24~4) 1: 아침(5~8) 2: 오전(9~11) 3: 오후(12~17) 4: 저녁(18~23)
bins = [-1, 4, 8, 11, 17, 23]
labels = [x for x in range(5)]

def hours(x):
    return x.hour

total['hours'] = total['start_time'].apply(hours)
cats = pd.cut(total['hours'], bins, labels=labels)
total['timezone'] = cats

total.drop(['start_time'], axis=1, inplace=True)
total.drop(['end_time'], axis=1, inplace=True)
total.drop(['hours'], axis=1, inplace=True)

total.head()

Unnamed: 0,user_seq,gender,job,position,mbti,importance,predicted_pomo,real_pomo,proficiency,type,timeStatus,birth_year,birth_month,birth_day,during_sec,timezone
0,2,1,3,0,5,4,12,20,1,4,SHORT,1998,1,24,900,3
1,2,1,3,0,5,4,12,20,1,4,LONG,1998,1,24,3000,4
2,2,1,3,0,5,4,12,20,1,4,LONG,1998,1,24,3000,4
3,2,1,3,0,5,4,12,20,1,4,SHORT,1998,1,24,900,4
4,2,1,3,0,5,4,12,20,1,4,BASIC,1998,1,24,1500,4


In [60]:
# time_status

total['timeStatus'] = total['timeStatus'].map({
    'SHORT': 0, 'BASIC': 1,
    'LONG': 2, -1: -1
})

total.head()

Unnamed: 0,user_seq,gender,job,position,mbti,importance,predicted_pomo,real_pomo,proficiency,type,timeStatus,birth_year,birth_month,birth_day,during_sec,timezone
0,2,1,3,0,5,4,12,20,1,4,0,1998,1,24,900,3
1,2,1,3,0,5,4,12,20,1,4,2,1998,1,24,3000,4
2,2,1,3,0,5,4,12,20,1,4,2,1998,1,24,3000,4
3,2,1,3,0,5,4,12,20,1,4,0,1998,1,24,900,4
4,2,1,3,0,5,4,12,20,1,4,1,1998,1,24,1500,4


In [61]:
total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213 entries, 0 to 212
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   user_seq        213 non-null    int64   
 1   gender          213 non-null    int64   
 2   job             213 non-null    int64   
 3   position        213 non-null    object  
 4   mbti            213 non-null    int64   
 5   importance      213 non-null    int64   
 6   predicted_pomo  213 non-null    int64   
 7   real_pomo       213 non-null    int64   
 8   proficiency     213 non-null    int64   
 9   type            213 non-null    int64   
 10  timeStatus      213 non-null    int64   
 11  birth_year      213 non-null    int64   
 12  birth_month     213 non-null    int64   
 13  birth_day       213 non-null    int64   
 14  during_sec      213 non-null    int64   
 15  timezone        213 non-null    category
dtypes: category(1), int64(14), object(1)
memory usage: 25.5+ KB


In [62]:
total.describe()

Unnamed: 0,user_seq,gender,job,mbti,importance,predicted_pomo,real_pomo,proficiency,type,timeStatus,birth_year,birth_month,birth_day,during_sec
count,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0
mean,18.57277,0.352113,2.887324,5.638498,3.150235,8.084507,20.967136,1.089202,5.230047,1.028169,1939.478873,6.169014,16.676056,1812.690141
std,18.065933,0.534611,0.663381,5.183958,0.939672,4.013232,13.720383,1.08442,3.704744,0.800445,330.983078,3.881448,7.322313,880.086047
min,1.0,-1.0,-1.0,-1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
25%,4.0,0.0,3.0,1.0,2.0,5.0,8.0,0.0,4.0,0.0,1995.0,1.0,14.0,900.0
50%,25.0,0.0,3.0,5.0,4.0,8.0,20.0,1.0,4.0,1.0,1995.0,7.0,18.0,1500.0
75%,25.0,1.0,3.0,10.0,4.0,12.0,40.0,2.0,5.0,2.0,1996.0,9.0,20.0,3000.0
max,99.0,1.0,3.0,15.0,4.0,12.0,40.0,4.0,16.0,2.0,1998.0,12.0,28.0,3000.0


In [63]:
total.describe(include='O')

Unnamed: 0,position
count,213
unique,2
top,0
freq,207


### 상관관계

In [92]:
total_corr = abs(total.corr())
total_corr['real_pomo'].sort_values(ascending=False).to_frame()

Unnamed: 0,real_pomo
real_pomo,1.0
predicted_pomo,0.86107
proficiency,0.767331
mbti,0.620116
gender,0.503227
birth_month,0.450501
type,0.353783
birth_day,0.286251
job,0.206889
birth_year,0.205761


### SVR vs LinearSVR
- SVR : SVC의 회귀버전
    - 훈련 세트가 커지면 훨씬 느려짐
- LinearSVR : LinearSVC의 회귀버전
    - 필요한 시간이 훈련 세트의 크기에 비례하여 선형적으로 늘어남

In [80]:
X, y = total.drop(['predicted_pomo', 'real_pomo'], axis=1), total.real_pomo
X_train, X_test, y_train, y_test = train_test_split(X, y)

**SVR**

In [81]:
svm_ploy_reg = SVR(kernel='poly', degree=2, C=100, epsilon=0.1)
svm_ploy_reg.fit(X_train, y_train)

SVR(C=100, cache_size=200, coef0=0.0, degree=2, epsilon=0.1, gamma='scale',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [82]:
print("훈련 세트 점수: {:.2f}".format(svm_ploy_reg.score(X_train, y_train)))
print("테스트 세트 점수: {:.2f}".format(svm_ploy_reg.score(X_test, y_test)))

훈련 세트 점수: 0.12
테스트 세트 점수: -0.24


In [83]:
svm_ploy_reg_pred = svm_ploy_reg.predict(X_test)
print(f'score: {cross_val_score(svm_ploy_reg, X_train, y_train, cv=5).mean()}')

score: 0.053627250505488465


In [84]:
print("훈련 세트 점수: {:.2f}".format(svm_ploy_reg.score(X_train, y_train)))
print("테스트 세트 점수: {:.2f}".format(svm_ploy_reg.score(X_test, y_test)))

훈련 세트 점수: 0.12
테스트 세트 점수: -0.24


**Linear SVR**

In [85]:
svm_reg = LinearSVR(epsilon=1.5)
svm_reg.fit(X_train, y_train)



LinearSVR(C=1.0, dual=True, epsilon=1.5, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=None, tol=0.0001, verbose=0)

In [86]:
svm_reg_pred = svm_reg.predict(X_test)
print(f'score: {cross_val_score(svm_reg, X_train, y_train, cv=5).mean()}')



score: -0.013558651238064634




In [87]:
print("훈련 세트 점수: {:.2f}".format(svm_reg.score(X_train, y_train)))
print("테스트 세트 점수: {:.2f}".format(svm_reg.score(X_test, y_test)))

훈련 세트 점수: 0.37
테스트 세트 점수: 0.33


In [88]:
# 모델
lasso_model = Lasso(alpha=0.0001, max_iter=50000)
# 모델 학습
lasso_model.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=50000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [89]:
pred = lasso_model.predict(X_test)

In [90]:
print("훈련 세트 점수: {:.2f}".format(lasso_model.score(X_train, y_train)))
print("테스트 세트 점수: {:.2f}".format(lasso_model.score(X_test, y_test)))

print(f'train_score: {cross_val_score(lasso_model, X_train, y_train, cv=5).mean()}')
print(f'test_score: {cross_val_score(lasso_model, X_test, y_test, cv=5).mean()}')

훈련 세트 점수: 0.85
테스트 세트 점수: 0.83


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


score: 0.8042118298872426


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


score: 0.8100039569561159


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [91]:
data = {'y_test' : y_test, 'pred' : pred}
pd.DataFrame(data)

Unnamed: 0,y_test,pred
197,8,11.705136
153,40,38.643848
74,17,19.131285
188,40,39.517158
36,4,16.396296
180,40,38.643848
8,20,14.325068
30,26,30.506954
28,26,30.506954
54,18,9.329341
