In [11]:
# 작성자 : 권유섭
# 작성일자 : 2024-09-26
# 작성목적 : 모델링 및 회귀 분석

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns

In [12]:
import pandas as pd

# CSV 파일 읽기
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
print(train.head())

car = pd.DataFrame(train)
car_test = pd.DataFrame(test)

   id          brand              model  model_year  milage      fuel_type  \
0   0           MINI      Cooper S Base        2007  213000       Gasoline   
1   1        Lincoln              LS V8        2002  143250       Gasoline   
2   2      Chevrolet  Silverado 2500 LT        2002  136731  E85 Flex Fuel   
3   3        Genesis   G90 5.0 Ultimate        2017   19500       Gasoline   
4   4  Mercedes-Benz        Metris Base        2021    7388       Gasoline   

                                              engine  \
0       172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel   
1       252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel   
2  320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...   
3       420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel   
4       208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel   

                     transmission ext_col int_col  \
0                             A/T  Yellow    Gray   
1                             A/T  Silver   Beige   
2                             A/T  

In [13]:
car['model_year_log'] = np.log10(car['model_year'])
car_test['model_year_log'] = np.log10(car_test['model_year'])

# 변환된 값들을 확인
car[['model_year', 'model_year_log']].head()
car_test[['model_year', 'model_year_log']].head()

car = car.drop(columns=['model_year'])
car_test = car_test.drop(columns=['model_year'])


In [14]:
car['milage_log'] = np.log10(car['milage'])
car_test['milage_log'] = np.log10(car_test['milage'])

# 변환된 값들을 확인
car[['milage', 'milage_log']].head()
car_test[['milage', 'milage_log']].head()

car = car.drop(columns=['milage'])
car_test = car_test.drop(columns=['milage'])

In [15]:
replace_dict={
    '10-Speed Automatic':'10-Speed A/T',
    '8-Speed Automatic' :'8-Speed A/T',
    '9-Speed Automatic':'9-Speed A/T',
    '7-Speed Automatic with Auto-Shift':'7-Speed A/T',
    '6-Speed Automatic':'6-Speed A/T',
    '4-Speed Automatic':'4-Speed A/T',
    '6-Speed Manual':'6-Speed M/T',
    '8-Speed Automatic with Auto-Shift':'8-Speed A/T',
    '7-Speed Automatic':'7-Speed A/T',
    '9-Speed Automatic with Auto-Shift':'9-Speed A/T',
    '6-Speed Automatic with Auto-Shift':'6-Speed A/T',
    '6-Speed Electronically Controlled Automatic with O':'6-Speed A/T',
    '8-Speed Manual':'8-Speed M/T',
    '1-Speed Automatic':'1-Speed A/T',
    '7-Speed Manual':'7-Speed M/T',
    '5-Speed Automatic':'5-Speed A/T',
    '2-Speed Automatic':'2-Speed A/T',
    '8-SPEED AT':'8-Speed A/T',
    '10-Speed Automatic with Overdrive':'10-Speed A/T',
    '6 Speed Mt' :'6-Speed M/T',
    '8-SPEED A/T':'8-Speed A/T',
    'Automatic':'A/T',
    'AT' :'A/T',
    'Manual':'M/T',
    'MT':'M/T'
}

car['transmission']=car['transmission'].replace(replace_dict)
car_test['transmission']=car_test['transmission'].replace(replace_dict)

In [16]:
#문자열 통일
car['engine'] = car['engine'].str.replace('Liter', 'L')
car['engine'] = car['engine'].str.replace(' L', 'L')
car_test['engine'] = car_test['engine'].str.replace('Liter', 'L')
car_test['engine'] = car_test['engine'].str.replace(' L', 'L')
#엔진 값 나누기
import re

def extract_engine_details(engine_str):
    # 패턴 정의
    hp_pattern = r'(\d+\.?\d*)HP'
    size_pattern = r'(\d+\.?\d*)L'
    cylinders_pattern = r'(\d+)\s*Cylinder'
    cylinders_v_pattern = r'V(\d+)'  # V6, V8 등
    arrangement_pattern = r'I(\d+)|V(\d+)'  # I4, V6 등
    engine_type_pattern = r'(Gasoline Fuel|Electric Fuel System|Gas/Electric Hybrid|Flex Fuel Capability|Plug-In Electric/Gas)'

    # 정규표현식을 사용하여 정보 추출
    hp = re.search(hp_pattern, engine_str)
    size = re.search(size_pattern, engine_str)
    cylinders = re.search(cylinders_pattern, engine_str)
    arrangement = re.search(arrangement_pattern, engine_str)  # I4, V6 등
    engine_type = re.search(engine_type_pattern, engine_str)

    # 실린더 값을 통합하여 반환
    if arrangement:
        cylinders_value = f'I{arrangement.group(1)}' if arrangement.group(1) else f'V{arrangement.group(2)}'
    else:
        cylinders_value = cylinders.group(1) if cylinders else None

    # 추출된 정보를 반환
    return {
        'hp': hp.group(1) if hp else None,
        'size': size.group(1) if size else None,
        'cylinders': cylinders_value,
        'engine_type': engine_type.group(1) if engine_type else None
    }

car[['hp', 'size', 'cylinders', 'engine_type']] = car['engine'].apply(lambda x: pd.Series(extract_engine_details(x)))
car_test[['hp', 'size', 'cylinders', 'engine_type']] = car_test['engine'].apply(lambda x: pd.Series(extract_engine_details(x)))

#문자열 통일
car['cylinders'] = car['cylinders'].str.replace('I', '')
car_test['cylinders'] = car_test['cylinders'].str.replace('I', '')

#난수 처리
car['fuel_type']=car['fuel_type'].replace('–', np.nan)
car_test['fuel_type']=car_test['fuel_type'].replace('–', np.nan)

# engine_type & fuel_type 통일
car.loc[car['fuel_type'].isna() & car['engine_type'].notna(), 'fuel_type'] = car['engine_type']
car.loc[car['fuel_type']=='not supported', 'fuel_type'] = car['engine_type']
car_test.loc[car['fuel_type'].isna() & car_test['engine_type'].notna(), 'fuel_type'] = car['engine_type']
car_test.loc[car['fuel_type']=='not supported', 'fuel_type'] = car_test['engine_type']

#문자열 통일
car['fuel_type']=car['fuel_type'].str.replace('Gasoline Fuel', 'Gasoline')
car['fuel_type']=car['fuel_type'].str.replace('Gas/Electric Hybrid', 'Hybrid')
car['fuel_type']=car['fuel_type'].str.replace('Electric Fuel System', 'Electric Fuel')
car['fuel_type']=car['fuel_type'].str.replace('Flex Fuel Capability', 'E85 Flex Fuel')
car_test['fuel_type']=car_test['fuel_type'].str.replace('Gasoline Fuel', 'Gasoline')
car_test['fuel_type']=car_test['fuel_type'].str.replace('Gas/Electric Hybrid', 'Hybrid')
car_test['fuel_type']=car_test['fuel_type'].str.replace('Electric Fuel System', 'Electric Fuel')
car_test['fuel_type']=car_test['fuel_type'].str.replace('Flex Fuel Capability', 'E85 Flex Fuel')
# 정보를 알 수 있는 결측치 채우기
car.loc[(car['model'] == 'e-Golf SE') & (car['fuel_type'].isna()), 'fuel_type'] = 'Electric Fuel'
car.loc[(car['model'] == 'Challenger R/T Scat Pack') & (car['fuel_type'].isna()), 'fuel_type'] = 'Gasoline'
car.loc[(car['model'] == 'SLS AMG Base') & (car['fuel_type'].isna()), 'fuel_type'] = 'Gasoline'
car.loc[(car['model'] == 'Challenger R/T') & (car['fuel_type'].isna()), 'fuel_type'] = 'Gasoline'
car.loc[(car['model'] == 'Mustang EcoBoost Premium') & (car['fuel_type'].isna()), 'fuel_type'] = 'Gasoline'
car.loc[(car['model'] == 'DeVille Base') & (car['fuel_type'].isna()), 'fuel_type'] = 'Gasoline'
car.loc[(car['model'] == 'Challenger SRT8 392') & (car['fuel_type'].isna()), 'fuel_type'] = 'Gasoline'

car_test.loc[(car['model'] == 'e-Golf SE') & (car_test['fuel_type'].isna()), 'fuel_type'] = 'Electric Fuel'
car_test.loc[(car['model'] == 'Challenger R/T Scat Pack') & (car_test['fuel_type'].isna()), 'fuel_type'] = 'Gasoline'
car_test.loc[(car['model'] == 'SLS AMG Base') & (car_test['fuel_type'].isna()), 'fuel_type'] = 'Gasoline'
car_test.loc[(car['model'] == 'Challenger R/T') & (car_test['fuel_type'].isna()), 'fuel_type'] = 'Gasoline'
car_test.loc[(car['model'] == 'Mustang EcoBoost Premium') & (car_test['fuel_type'].isna()), 'fuel_type'] = 'Gasoline'
car_test.loc[(car['model'] == 'DeVille Base') & (car_test['fuel_type'].isna()), 'fuel_type'] = 'Gasoline'
car_test.loc[(car['model'] == 'Challenger SRT8 392') & (car_test['fuel_type'].isna()), 'fuel_type'] = 'Gasoline'


In [17]:
# model 열에서 첫 번째 띄어쓰기 전까지의 부분을 추출하여 새로운 열로 저장
car['model_group'] = car['model'].str.split().str[0]
car_test['model_group'] = car_test['model'].str.split().str[0]

# model 열 삭제
car = car.drop(columns=['model'])
car_test = car_test.drop(columns=['model'])

In [18]:
# accidet name 수정
car['accident_dummy'] = car['accident'].map({
    'None reported': 0,
    'At least 1 accident or damage reported': 1})
car_test['accident_dummy'] = car_test['accident'].map({
    'None reported': 0,
    'At least 1 accident or damage reported': 1})

In [19]:
import numpy as np
import pandas as pd

# ext_col 난수 처리
car['ext_col'] = car['ext_col'].replace('–', np.nan)
car_test['ext_col'] = car_test['ext_col'].replace('–', np.nan)

# ext_col 문자열 통일
car['ext_col'] = car['ext_col'].apply(lambda x: 'Red' if isinstance(x, str) and 'red' in x.lower() else x)
car['ext_col'] = car['ext_col'].apply(lambda x: 'White' if isinstance(x, str) and 'white' in x.lower() else x)
car['ext_col'] = car['ext_col'].apply(lambda x: 'Blue' if isinstance(x, str) and ('blue' in x.lower() or 'blu' in x.lower()) else x)
car['ext_col'] = car['ext_col'].apply(lambda x: 'Gray' if isinstance(x, str) and ('gray' in x.lower() or 'grey' in x.lower()) else x)
car['ext_col'] = car['ext_col'].apply(lambda x: 'Black' if isinstance(x, str) and 'black' in x.lower() else x)
car['ext_col'] = car['ext_col'].apply(lambda x: 'Silver' if isinstance(x, str) and 'silver' in x.lower() else x)
car['ext_col'] = car['ext_col'].apply(lambda x: x if x in ['Red', 'Blue', 'White', 'Gray', 'Black', 'Silver'] else 'other')

car_test['ext_col'] = car_test['ext_col'].apply(lambda x: 'Red' if isinstance(x, str) and 'red' in x.lower() else x)
car_test['ext_col'] = car_test['ext_col'].apply(lambda x: 'White' if isinstance(x, str) and 'white' in x.lower() else x)
car_test['ext_col'] = car_test['ext_col'].apply(lambda x: 'Blue' if isinstance(x, str) and ('blue' in x.lower() or 'blu' in x.lower()) else x)
car_test['ext_col'] = car_test['ext_col'].apply(lambda x: 'Gray' if isinstance(x, str) and ('gray' in x.lower() or 'grey' in x.lower()) else x)
car_test['ext_col'] = car_test['ext_col'].apply(lambda x: 'Black' if isinstance(x, str) and 'black' in x.lower() else x)
car_test['ext_col'] = car_test['ext_col'].apply(lambda x: 'Silver' if isinstance(x, str) and 'silver' in x.lower() else x)
car_test['ext_col'] = car_test['ext_col'].apply(lambda x: x if x in ['Red', 'Blue', 'White', 'Gray', 'Black', 'Silver'] else 'other')

# int_col 난수 처리
car['int_col'] = car['int_col'].replace('–', np.nan)
car_test['int_col'] = car_test['int_col'].replace('–', np.nan)

# int_col 문자열 통일
car['int_col'] = car['int_col'].apply(lambda x: 'Beige' if isinstance(x, str) and 'beige' in x.lower() else x)
car['int_col'] = car['int_col'].apply(lambda x: 'White' if isinstance(x, str) and 'white' in x.lower() else x)
car['int_col'] = car['int_col'].apply(lambda x: 'Gray' if isinstance(x, str) and ('gray' in x.lower() or 'grey' in x.lower()) else x)
car['int_col'] = car['int_col'].apply(lambda x: 'Black' if isinstance(x, str) and 'black' in x.lower() else x)
car['int_col'] = car['int_col'].apply(lambda x: 'Brown' if isinstance(x, str) and 'brown' in x.lower() else x)
car['int_col'] = car['int_col'].apply(lambda x: 'Red' if isinstance(x, str) and 'red' in x.lower() else x)
car['int_col'] = car['int_col'].apply(lambda x: x if x in ['Red', 'Beige', 'White', 'Gray', 'Black', 'Brown'] else 'other')

car_test['int_col'] = car_test['int_col'].apply(lambda x: 'Beige' if isinstance(x, str) and 'beige' in x.lower() else x)
car_test['int_col'] = car_test['int_col'].apply(lambda x: 'White' if isinstance(x, str) and 'white' in x.lower() else x)
car_test['int_col'] = car_test['int_col'].apply(lambda x: 'Gray' if isinstance(x, str) and ('gray' in x.lower() or 'grey' in x.lower()) else x)
car_test['int_col'] = car_test['int_col'].apply(lambda x: 'Black' if isinstance(x, str) and 'black' in x.lower() else x)
car_test['int_col'] = car_test['int_col'].apply(lambda x: 'Brown' if isinstance(x, str) and 'brown' in x.lower() else x)
car_test['int_col'] = car_test['int_col'].apply(lambda x: 'Red' if isinstance(x, str) and 'red' in x.lower() else x)
car_test['int_col'] = car_test['int_col'].apply(lambda x: x if x in ['Red', 'Beige', 'White', 'Gray', 'Black', 'Brown'] else 'other')


In [20]:
# NaN 값을 "NO"로 치환
car['clean_title'] = car['clean_title'].fillna('NO')
car_test['clean_title'] = car_test['clean_title'].fillna('NO')

In [21]:
# price열 log 변환
car['price_log'] = np.log10(car['price'])

# 변환된 값들을 확인
# car[['price', 'price_log']].head()


In [22]:
# 숫자형 NaN은 0으로, 문자형 NaN은 "Null"로 채우기
for column in car.columns:
    if car[column].dtype == 'object':  # 문자형 데이터
        car[column] = car[column].fillna('Null')
    else:  # 숫자형 데이터
        car[column] = car[column].fillna(0)

In [23]:
# 숫자형 NaN은 0으로, 문자형 NaN은 "Null"로 채우기
for column in car_test.columns:
    if car_test[column].dtype == 'object':  # 문자형 데이터
        car_test[column] = car_test[column].fillna('Null')
    else:  # 숫자형 데이터
        car_test[column] = car_test[column].fillna(0)

In [24]:
# engine 열 삭제
car = car.drop('engine',axis=1)

In [25]:
# test 데이터 engine 열 삭제
car_test = car_test.drop('engine',axis=1)

In [26]:
# train 데이터 price 열 삭제 및 확인
car.drop(columns=['price'],inplace=True)
car.head()

Unnamed: 0,id,brand,fuel_type,transmission,ext_col,int_col,accident,clean_title,model_year_log,milage_log,hp,size,cylinders,engine_type,model_group,accident_dummy,price_log
0,0,MINI,Gasoline,A/T,other,Gray,None reported,Yes,3.302547,5.32838,172.0,1.6,4,Gasoline Fuel,Cooper,0.0,3.623249
1,1,Lincoln,Gasoline,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,3.301464,5.156095,252.0,3.9,8,Gasoline Fuel,LS,1.0,3.698883
2,2,Chevrolet,E85 Flex Fuel,A/T,Blue,Gray,None reported,Yes,3.301464,5.135867,320.0,5.3,8,Flex Fuel Capability,Silverado,0.0,4.143015
3,3,Genesis,Gasoline,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,3.304706,4.290035,420.0,5.0,8,Gasoline Fuel,G90,0.0,4.653213
4,4,Mercedes-Benz,Gasoline,7-Speed A/T,Black,Beige,None reported,Yes,3.305566,3.868527,208.0,2.0,4,Gasoline Fuel,Metris,0.0,4.989005


In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import QuantileTransformer


# 'Null' 문자열을 NaN으로 변환
car.replace('Null', np.nan, inplace=True)

# 'hp', 'cylinders' 등 숫자형으로 변환할 수 있는 열을 처리
car['hp'] = pd.to_numeric(car['hp'], errors='coerce')
car['size'] = pd.to_numeric(car['size'], errors='coerce')
car['cylinders'] = pd.to_numeric(car['cylinders'], errors='coerce')

# 범주형 변수를 원핫 인코딩
X = pd.get_dummies(car.drop(columns=['id', 'price_log']), drop_first=True)  # 'id'는 제거하고 'price_log'는 타겟 변수이므로 제외
y = car['price_log']

# 결측치 처리 (예: 중앙값으로 채우기)
X.fillna(X.median(), inplace=True)

# 데이터 분할 (80% 학습, 20% 검증)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 표준화
scaler = QuantileTransformer(output_distribution='normal')
X_train_scaled = scaler.fit_transform(X_train)
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# ElasticNet 모델 정의
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)

# 모델 학습
elastic_net.fit(X_train_scaled, y_train)

# 검증 데이터에 대한 예측
y_val_pred = elastic_net.predict(X_val_scaled)

# 로그 변환을 해제하여 실제 가격을 예측 (price_log가 로그 변환된 값인 경우)
y_val_pred_actual = np.expm1(y_val_pred)
y_val_actual = np.expm1(y_val)

# RMSE 계산
rmse = np.sqrt(mean_squared_error(y_val_actual, y_val_pred_actual))
print(f'ElasticNet 모델의 RMSE: {rmse:.2f}')


ElasticNet 모델의 RMSE: 29.22


In [28]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# 'Null' 문자열을 NaN으로 변환
car.replace('Null', np.nan, inplace=True)

# 'hp', 'size', 'cylinders' 등 숫자형으로 변환할 수 있는 열을 처리
car['hp'] = pd.to_numeric(car['hp'], errors='coerce')
car['size'] = pd.to_numeric(car['size'], errors='coerce')
car['cylinders'] = pd.to_numeric(car['cylinders'], errors='coerce')

# 범주형 변수를 원핫 인코딩
X = pd.get_dummies(car.drop(columns=['id', 'price_log']), drop_first=True)  # 'id'는 제거하고 'price_log'는 타겟 변수이므로 제외
y = car['price_log']

# 결측치 처리 (예: 중앙값으로 채우기)
X.fillna(X.median(), inplace=True)

# 데이터 분할 (80% 학습, 20% 검증)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 표준화 (스케일 남겨둠)
scaler = QuantileTransformer(output_distribution='normal')
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# 랜덤 포레스트 회귀 모델 정의 (n_estimators가 높을 수록 좋다(?_))
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# 모델 학습
rf_regressor.fit(X_train_scaled, y_train)

# 검증 데이터에 대한 예측
y_val_pred = rf_regressor.predict(X_val_scaled)

# 로그 변환을 해제하여 실제 가격을 예측 (price_log가 로그 변환된 값인 경우)
y_val_pred_actual = np.expm1(y_val_pred)
y_val_actual = np.expm1(y_val)

# RMSE 계산
rmse = np.sqrt(mean_squared_error(y_val_actual, y_val_pred_actual))
print(f'랜덤 포레스트 모델의 RMSE: {rmse:.2f}')


랜덤 포레스트 모델의 RMSE: 27.24


In [29]:
# 'Null' 문자열을 NaN으로 변환 (테스트 데이터에 대해서도 동일한 처리)
car_test.replace('Null', np.nan, inplace=True)

# 'hp'와 'cylinders' 열을 숫자형으로 변환
car_test['hp'] = pd.to_numeric(car_test['hp'], errors='coerce')
car_test['cylinders'] = pd.to_numeric(car_test['cylinders'], errors='coerce')

# 범주형 변수를 원핫 인코딩 (train 데이터와 동일한 방식으로)
X_test = pd.get_dummies(car_test.drop(['id'], axis=1), drop_first=True)

# 원핫 인코딩한 열이 train 데이터와 맞지 않을 수 있으므로, train 데이터에 있던 열을 맞추기 위한 조치
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# 결측치 처리 (예: 중앙값으로 채우기)
X_test.fillna(X_test.median(), inplace=True)

# 표준화 (train 데이터에서 fit한 scaler 사용)
X_test_scaled = scaler.transform(X_test)

# 예측 (ElasticNet 모델 사용)
y_test_pred = elastic_net.predict(X_test_scaled)

# 로그 변환을 해제하여 실제 가격으로 변환 (만약 로그 변환을 했었다면)
y_test_pred_actual = np.expm1(y_test_pred)

# 예측 결과 출력
print(y_test_pred_actual)

# 예측 결과를 새로운 데이터프레임으로 저장 (id와 함께)
car_new = pd.DataFrame({
    'id': car_test['id'],
    'price': y_test_pred_actual
})

# 새로운 데이터프레임을 CSV 파일로 저장
car_new.to_csv('car_new.csv', index=False)

print("car_new.csv 파일이 생성되었습니다.")


[ 73.36824116 113.87177416 105.67243701 ...  77.10436866  72.8708314
  86.20059101]
car_new.csv 파일이 생성되었습니다.


In [30]:
# 'Null' 문자열을 NaN으로 변환 (테스트 데이터에 대해서도 동일한 처리)
car_test.replace('Null', np.nan, inplace=True)

# 'hp'와 'cylinders' 열을 숫자형으로 변환
car_test['hp'] = pd.to_numeric(car_test['hp'], errors='coerce')
car_test['cylinders'] = pd.to_numeric(car_test['cylinders'], errors='coerce')

# 범주형 변수를 원핫 인코딩 (train 데이터와 동일한 방식으로)
X_test = pd.get_dummies(car_test.drop(['id'], axis=1), drop_first=True)

# 원핫 인코딩한 열이 train 데이터와 맞지 않을 수 있으므로, train 데이터에 있던 열을 맞추기 위한 조치
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# 결측치 처리 (예: 중앙값으로 채우기)
X_test.fillna(X_test.median(), inplace=True)

# 표준화 (train 데이터에서 fit한 scaler 사용)
X_test_scaled = scaler.transform(X_test)

# 예측 (ElasticNet 모델 사용)
y_test_pred = elastic_net.predict(X_test_scaled)

# 상용 로그 변환을 해제하여 실제 가격으로 변환 (log10 -> original scale)
y_test_pred_actual = 10**y_test_pred  # 상용 로그 해제

# 예측 결과를 새로운 데이터프레임으로 저장 (id와 함께)
car_new = pd.DataFrame({
    'id': car_test['id'],
    'price': y_test_pred_actual
})

# 새로운 데이터프레임을 CSV 파일로 저장
car_new.to_csv('submission.csv', index=False)

print("submission.csv 파일이 생성되었습니다.")


submission.csv 파일이 생성되었습니다.


In [31]:
# 파일 저장
car_new.to_csv('submission.csv',index=False)

In [32]:
car_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125690 entries, 0 to 125689
Data columns (total 16 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              125690 non-null  int64  
 1   brand           125690 non-null  object 
 2   fuel_type       121183 non-null  object 
 3   transmission    125690 non-null  object 
 4   ext_col         125690 non-null  object 
 5   int_col         125690 non-null  object 
 6   accident        124058 non-null  object 
 7   clean_title     125690 non-null  object 
 8   model_year_log  125690 non-null  float64
 9   milage_log      125690 non-null  float64
 10  hp              103509 non-null  float64
 11  size            121310 non-null  object 
 12  cylinders       72095 non-null   float64
 13  engine_type     98808 non-null   object 
 14  model_group     125690 non-null  object 
 15  accident_dummy  125690 non-null  float64
dtypes: float64(5), int64(1), object(10)
memory usage: 15.3+ 

In [33]:
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              188533 non-null  int64  
 1   brand           188533 non-null  object 
 2   fuel_type       187347 non-null  object 
 3   transmission    188533 non-null  object 
 4   ext_col         188533 non-null  object 
 5   int_col         188533 non-null  object 
 6   accident        186081 non-null  object 
 7   clean_title     188533 non-null  object 
 8   model_year_log  188533 non-null  float64
 9   milage_log      188533 non-null  float64
 10  hp              155274 non-null  float64
 11  size            181835 non-null  float64
 12  cylinders       107938 non-null  float64
 13  engine_type     148373 non-null  object 
 14  model_group     188533 non-null  object 
 15  accident_dummy  188533 non-null  float64
 16  price_log       188533 non-null  float64
dtypes: float64