In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

data = pd.read_csv('분석용최최종.csv')
data = data.drop(columns=['num', 'enddate', 'end+1','target2','Start Date', 'End Date','#','팀명','경기','투_승','투_패','상대1','경기1','투_승1','투_패1'])

X = data.drop(columns=['target'])
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = SVC(kernel='rbf', C=1, gamma=0.01)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

train_score = model.score(X_train, y_train)
print(f"훈련 세트 점수: {train_score}")

test_score = model.score(X_test, y_test)
print(f"테스트 세트 점수: {test_score}")

# 최적의 하이퍼파라미터
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [ 0.0001,0.001, 0.01, 0.1, 1],
    'kernel': ['rbf']
}

grid_search = GridSearchCV(SVC(random_state=42), param_grid, cv=5, scoring='accuracy', refit=True)
grid_search.fit(X_train_scaled, y_train)

print(f"최적의 하이퍼파라미터: {grid_search.best_params_}")
print(f"최고 교차 검증 점수: {grid_search.best_score_}")

best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

train_score = best_model.score(X_train, y_train)
print(f"훈련 세트 점수: {train_score}")

test_score =best_model.score(X_test, y_test)
print(f"테스트 세트 점수: {test_score}")






훈련 세트 점수: 0.5072951739618407
테스트 세트 점수: 0.47085201793721976
최적의 하이퍼파라미터: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
최고 교차 검증 점수: 0.5386918586403866
훈련 세트 점수: 0.5072951739618407
테스트 세트 점수: 0.47085201793721976




In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score


data = pd.read_csv('분석용최최종.csv')
data = data.drop(columns=['num', 'enddate', 'end+1','target2','Start Date', 'End Date','#','팀명','경기','투_승','투_패','상대1','경기1','투_승1','투_패1'])

X = data.drop(columns=['target'])
y = data['target']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


svc = SVC(kernel='linear', C=1, gamma='scale')  
rfe = RFE(estimator=svc, n_features_to_select=10, step=1)
rfe.fit(X_train_scaled, y_train)


selected_features = X.columns[rfe.support_]
print("상위 10개 중요한 피처:")
for feature in selected_features:
    print(feature)

X_selected_train = X_train_scaled[:, rfe.support_]
X_selected_test = X_test_scaled[:, rfe.support_]


model = SVC(kernel='linear', C=1, gamma='scale')  
model.fit(X_selected_train, y_train)

y_pred_train = model.predict(X_selected_train)
y_pred_test = model.predict(X_selected_test)

train_score = accuracy_score(y_train, y_pred_train)
print(f"훈련 세트 점수: {train_score:.4f}")

test_score = accuracy_score(y_test, y_pred_test)
print(f"테스트 세트 점수: {test_score:.4f}")


상위 10개 중요한 피처:
타_홈런
타_삼진
타_BABIP
타_타석1
타_단타1
타_홈런1
타_삼진1
타_BABIP1
차_타율
차_장타율
훈련 세트 점수: 0.5520
테스트 세트 점수: 0.5313


In [12]:
# 상관관계 높은 거 줄이기 

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

data = pd.read_csv('분석용최최종.csv')
data = data.drop(columns=['num', 'enddate', 'end+1','target2','Start Date', 'End Date','#','팀명','경기','투_승','투_패','상대1','경기1','투_승1','투_패1'])

corr_matrix = data.corr()
threshold = 0.9  
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)) 
to_drop = [column for column in upper.columns if any(upper[column].abs() > threshold)]
data_reduced = data.drop(columns=to_drop)


X = data_reduced.drop(columns=['target'])
y = data_reduced['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = SVC(kernel='rbf', C=1, gamma=0.01)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

train_score = model.score(X_train, y_train)
print(f"훈련 세트 점수: {train_score}")

test_score = model.score(X_test, y_test)
print(f"테스트 세트 점수: {test_score}")




훈련 세트 점수: 0.510662177328844
테스트 세트 점수: 0.45739910313901344




In [11]:
# 홈어웨이 추가 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

data = pd.read_csv('홈어웨이 데이터 추가.csv')

X = data[['투_ERA', '투_세', '타_OPS', '투_이닝당타자','타_타점','홈 경기수']]
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = SVC(kernel='rbf', C=10, gamma=0.1)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

train_score = model.score(X_train, y_train)
print(f"훈련 세트 점수: {train_score}")

test_score = model.score(X_test, y_test)
print(f"테스트 세트 점수: {test_score}")


훈련 세트 점수: 0.5016835016835017
테스트 세트 점수: 0.49327354260089684


