In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# 1. Load the dataset
# The CSV path is relative to the notebook's working directory.
csv_path = 'StudentsPerformance.csv'
df = pd.read_csv(csv_path)

In [3]:
# Split the frame into features and target.
# The target (the "answer" the model learns to predict) is the math score;
# every other column is kept as a model input.
target_col = 'math score'
y = df[target_col]                 # target
X = df.drop(columns=[target_col])  # input features used for training


In [4]:
# Categorical columns that must be converted to numeric data before fitting.
ctgy_data = list((
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
))


In [None]:
# Preprocessing
# One-hot encode the categorical columns listed in `ctgy_data`.
# handle_unknown='ignore' keeps transform from raising on a category that was
# not seen during fit; remainder='passthrough' forwards every column not
# listed in `ctgy_data` to the next step unchanged.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('cool', categorical_encoder, ctgy_data)],
    remainder='passthrough',
)



In [6]:
# Chain preprocessing and the regressor so that fit/predict apply the same
# column transformation before the linear model sees the data.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)

In [7]:
# Split the data 80:20 into train and test sets.
# random_state is fixed so the same split is produced on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Train on the training split, then predict on the held-out test split.
# Pipeline.fit returns the fitted pipeline itself, so the calls can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Evaluate the test-set predictions: mean squared error and R^2.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report both metrics, one per line, label first.
for label, value in (('mse:', mse), ('r2:', r2)):
    print(label, value)

mse: 29.095169866715516
r2: 0.8804332983749564
