In [1]:
import pandas as pd
# Load the training data into `df`.
# NOTE(review): the path is absolute and machine-specific; the notebook will not run elsewhere unless it is adjusted.
df = pd.read_csv("C:/Users/kyoun/Downloads/open (1)/open/train.csv")

In [2]:
def remove_outliers_iqr_by_col(df, cols, k=1.5):
    """Drop rows whose value in any screened column lies outside the IQR fences.

    Columns are processed in the order given and the filtering is cumulative:
    the quartiles of each column are computed on the rows that survived the
    previous columns, so the result can depend on the order of ``cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied, never modified.
    cols : iterable of column labels
        Columns to screen. Columns for which quantiles are not defined
        (strings/objects, categoricals, booleans) are skipped instead of
        raising, so passing ``df.columns`` of a mixed-type frame is safe.
    k : float, default 1.5
        Fence multiplier. A row is kept when
        ``Q1 - k * IQR <= value <= Q3 + k * IQR`` (both ends inclusive).

    Returns
    -------
    pandas.DataFrame
        Filtered copy that keeps the original index labels. Rows holding NaN
        in a screened column are removed as well, because NaN fails both
        comparisons.
    """
    # Local import keeps the function usable on its own, independent of cell order.
    from pandas.api.types import (
        is_bool_dtype,
        is_datetime64_any_dtype,
        is_numeric_dtype,
        is_timedelta64_dtype,
    )

    df_clean = df.copy()

    for col in cols:
        values = df_clean[col]

        # Only dtypes with a meaningful quantile can be screened; an identifier
        # column such as a string ID would otherwise make .quantile() raise.
        screenable = (
            (is_numeric_dtype(values) and not is_bool_dtype(values))
            or is_datetime64_any_dtype(values)
            or is_timedelta64_dtype(values)
        )
        if not screenable:
            continue

        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        lower = q1 - k * iqr
        upper = q3 + k * iqr

        # The boolean mask is aligned on the index of the current (already filtered) frame.
        df_clean = df_clean[(values >= lower) & (values <= upper)]

    return df_clean


In [3]:
# All column labels of the raw training frame (including ID and the target).
# Only referenced by the commented-out IQR outlier-removal call further down.
cols = df.columns

In [4]:
# Weight_Status is treated as an ordinal variable and encoded with OrdinalEncoder.
# Rationale for the ordering (author's note): heavier people burn more calories than thinner people.
from sklearn.preprocessing import OrdinalEncoder

# Explicit category order: Normal Weight -> 0, Overweight -> 1, Obese -> 2.
encoder = OrdinalEncoder(categories=[['Normal Weight', 'Overweight', 'Obese']])
encoder.fit(df[['Weight_Status']])

# Replace the string labels with their ordinal codes.
df[['Weight_Status']] = encoder.transform(df[['Weight_Status']])

In [5]:
# Encode Gender as male (M) -> 1, female (F) -> 0 (author's rationale: men burn more calories than women).
# Any value other than "M" or "F" is not in the mapping and becomes NaN.
df["Gender"] = df["Gender"].map({"M": 1, "F": 0})

In [6]:
# df = remove_outliers_iqr_by_col(df, cols)


In [7]:
# Show the training frame after the Weight_Status and Gender encodings.
df

Unnamed: 0,ID,Exercise_Duration,Body_Temperature(F),BPM,Height(Feet),Height(Remainder_Inches),Weight(lb),Weight_Status,Gender,Age,Calories_Burned
0,TRAIN_0000,26.0,105.6,107.0,5.0,9.0,154.3,0.0,0,45,166.0
1,TRAIN_0001,7.0,103.3,88.0,6.0,6.0,224.9,1.0,1,50,33.0
2,TRAIN_0002,7.0,103.3,86.0,6.0,3.0,218.3,1.0,1,29,23.0
3,TRAIN_0003,17.0,104.0,99.0,5.0,6.0,147.7,0.0,0,33,91.0
4,TRAIN_0004,9.0,102.7,88.0,5.0,10.0,169.8,0.0,1,38,32.0
...,...,...,...,...,...,...,...,...,...,...,...
7495,TRAIN_7495,22.0,105.1,104.0,4.0,10.0,112.4,0.0,0,75,151.0
7496,TRAIN_7496,20.0,105.3,104.0,5.0,8.0,147.7,0.0,0,21,114.0
7497,TRAIN_7497,8.0,103.1,90.0,6.0,2.0,202.8,1.0,1,57,41.0
7498,TRAIN_7498,12.0,104.4,97.0,5.0,9.0,167.6,1.0,1,35,57.0


In [8]:
# Target vector: calories burned.
y = df['Calories_Burned']

# Feature matrix: everything except the row identifier, the target and body temperature.
X = df.drop(columns=['ID', 'Calories_Burned', 'Body_Temperature(F)'])

In [9]:
# Show the feature matrix before the derived height/BMI columns are added.
X

Unnamed: 0,Exercise_Duration,BPM,Height(Feet),Height(Remainder_Inches),Weight(lb),Weight_Status,Gender,Age
0,26.0,107.0,5.0,9.0,154.3,0.0,0,45
1,7.0,88.0,6.0,6.0,224.9,1.0,1,50
2,7.0,86.0,6.0,3.0,218.3,1.0,1,29
3,17.0,99.0,5.0,6.0,147.7,0.0,0,33
4,9.0,88.0,5.0,10.0,169.8,0.0,1,38
...,...,...,...,...,...,...,...,...
7495,22.0,104.0,4.0,10.0,112.4,0.0,0,75
7496,20.0,104.0,5.0,8.0,147.7,0.0,0,21
7497,8.0,90.0,6.0,2.0,202.8,1.0,1,57
7498,12.0,97.0,5.0,9.0,167.6,1.0,1,35


In [10]:
# Derived body-size features; each later column may build on an earlier one:
#   Total_Height_Inches = feet * 12 + remaining inches
#   Total_Height_Cm     = inches * 2.54
#   BMI                 = weight(lb) * 703 / inches^2
X = X.assign(
    Total_Height_Inches=lambda d: d['Height(Feet)'] * 12 + d['Height(Remainder_Inches)'],
    Total_Height_Cm=lambda d: d["Total_Height_Inches"] * 2.54,
    BMI=lambda d: d["Weight(lb)"] * 703 / d['Total_Height_Inches'] ** 2,
)

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X , y, test_size= 0.2, random_state= 42)

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline , make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Model pipeline: degree-3 polynomial expansion (no bias column),
# then standardisation, then ordinary least-squares regression.
pipe_scaler = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=3, include_bias=False)),
        ("scaler", StandardScaler()),
        ("model", LinearRegression()),
    ]
)

In [13]:
# Fit on the training split, then predict the hold-out split (fit returns the fitted pipeline itself).
y_pred = pipe_scaler.fit(X_train, y_train).predict(X_test)



In [14]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Root-mean-squared error of the predictions on the hold-out split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

In [15]:
# Show the hold-out RMSE.
rmse

np.float64(0.29635411079793667)

In [16]:
# Load the test set that the submission predictions are made for.
# NOTE(review): absolute, machine-specific path.
test = pd.read_csv("C:/Users/kyoun/Downloads/open (1)/open/test.csv")

In [17]:
# Weight_Status is treated as an ordinal variable (Normal Weight < Overweight < Obese).
# Rationale for the ordering (author's note): heavier people burn more calories than thinner people.
from sklearn.preprocessing import OrdinalEncoder

# Reuse the encoder that was fitted on the training data instead of building and
# fitting a second one on the test set: preprocessing must never be fitted on test
# data, and sharing one fitted encoder guarantees train and test use the same mapping.
test[['Weight_Status']] = encoder.transform(test[['Weight_Status']])

In [18]:
# Encode Gender as male (M) -> 1, female (F) -> 0 (author's rationale: men burn more calories than women).
# Any value other than "M" or "F" is not in the mapping and becomes NaN.
test["Gender"] = test["Gender"].map({"M": 1, "F": 0})

In [19]:
# Build the same derived features as for the training matrix, then drop the
# columns the model was not trained on (row identifier and body temperature).
test = (
    test
    .assign(
        Total_Height_Inches=lambda d: d['Height(Feet)'] * 12 + d['Height(Remainder_Inches)'],
        Total_Height_Cm=lambda d: d["Total_Height_Inches"] * 2.54,
        BMI=lambda d: d["Weight(lb)"] * 703 / d['Total_Height_Inches'] ** 2,
    )
    .drop(columns=['ID', 'Body_Temperature(F)'])
)

In [20]:
# Predict calories for the test set with the fitted pipeline.
# NOTE(review): `test` is expected to carry the same feature columns as the training matrix — confirm against test.csv.
test_pred = pipe_scaler.predict(test)


In [21]:
# Load the sample submission file; its Calories_Burned column is overwritten with the predictions below.
# NOTE(review): absolute, machine-specific path.
submit = pd.read_csv("C:/Users/kyoun/Downloads/open (1)/open/sample_submission.csv")

In [22]:
# Write the predictions into the submission frame and display it.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as test.csv — TODO confirm.
submit["Calories_Burned"] = test_pred
submit

Unnamed: 0,ID,Calories_Burned
0,TEST_0000,172.787984
1,TEST_0001,189.246673
2,TEST_0002,53.525674
3,TEST_0003,161.382250
4,TEST_0004,225.820492
...,...,...
7495,TEST_7495,196.593804
7496,TEST_7496,9.939086
7497,TEST_7497,130.214395
7498,TEST_7498,32.062053


In [23]:
# Save the submission as CSV without the DataFrame index column.
# NOTE(review): absolute, machine-specific output path.
submit.to_csv('C:/Users/kyoun/Downloads/open (1)/open/submission.csv', index = False)