https://www.kaggle.com/code/gusthema/house-prices-prediction-using-tfdf

In [4]:
#hiddencell
# from pbl_tools import *

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# Register the local MaruBuri font file with Matplotlib under the family name
# 'MaruBuri' (inserted at the front of the font manager's list) and make that
# family the default for all figures -- presumably so Korean labels render.
fe = fm.FontEntry(name='MaruBuri', fname='MaruBuri-Regular.otf')
fm.fontManager.ttflist[:0] = [fe]
plt.rcParams['font.family'] = 'MaruBuri'

In [5]:
import os
import random
import numpy as np
import pandas as pd 

def seed_everything(seed):
    """Seed the random number sources used in this notebook.

    Stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable and
    seeds both Python's ``random`` module and NumPy's global generator.

    Parameters
    ----------
    seed : int
        Value passed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


seed_everything(77)

In [6]:
# Read the train/test tables and the sample submission from the local data
# folder. The 'ID' column is removed from train and test right after loading.
train_org = pd.read_csv('D:/Daegu_Data/train.csv').drop(columns=['ID'])
test_org = pd.read_csv('D:/Daegu_Data/test.csv').drop(columns=['ID'])
sample_submission = pd.read_csv("D:/Daegu_Data/sample_submission.csv")

# Working copies: feature engineering below modifies these and leaves the
# *_org frames as loaded.
train_df, test_df = train_org.copy(), test_org.copy()


# Regexes whose capture groups become the new feature columns.
time_pattern = r'(\d{4})-(\d{1,2})-(\d{1,2}) (\d{1,2})'
location_pattern = r'(\S+) (\S+) (\S+)'
road_pattern = r'(.+) - (.+)'


def _split_column(frame, source, pattern, new_columns, numeric=False):
    """Replace one text column with the regex capture groups extracted from it.

    Parameters
    ----------
    frame : pd.DataFrame
        Input frame; it is not modified.
    source : str
        Name of the string column to parse. It is absent from the result.
    pattern : str
        Regular expression; its capture groups map, in order, onto
        ``new_columns``.
    new_columns : list of str
        Names for the extracted columns (one per capture group).
    numeric : bool, default False
        When True, every extracted column is converted with ``pd.to_numeric``.

    Returns
    -------
    pd.DataFrame
        New frame without ``source`` and with ``new_columns`` appended at the
        end.
    """
    parts = frame[source].str.extract(pattern)
    if numeric:
        # The extracted groups are strings; turn them into numbers.
        parts = parts.apply(pd.to_numeric)
    # The source column is removed once its information has been extracted.
    result = frame.drop(columns=[source])
    result[new_columns] = parts
    return result


def _expand_text_columns(frame):
    """Apply the date/time, location and road-type splits to ``frame``."""
    frame = _split_column(frame, '사고일시', time_pattern, ['연', '월', '일', '시간'], numeric=True)
    frame = _split_column(frame, '시군구', location_pattern, ['도시', '구', '동'])
    frame = _split_column(frame, '도로형태', road_pattern, ['도로형태1', '도로형태2'])
    return frame


# The same transformation is applied to the training and the test frame.
train_df = _expand_text_columns(train_df)
test_df = _expand_text_columns(test_df)

In [7]:
# Preview the first rows of both engineered frames.
display(train_df.head(), test_df.head())

Unnamed: 0,요일,기상상태,노면상태,사고유형,사고유형 - 세부분류,법규위반,가해운전자 차종,가해운전자 성별,가해운전자 연령,가해운전자 상해정도,...,ECLO,연,월,일,시간,도시,구,동,도로형태1,도로형태2
0,화요일,맑음,건조,차대사람,길가장자리구역통행중,안전운전불이행,승용,여,51세,상해없음,...,5,2019,1,1,0,대구광역시,중구,대신동,단일로,기타
1,화요일,흐림,건조,차대사람,보도통행중,기타,승용,남,39세,상해없음,...,3,2019,1,1,0,대구광역시,달서구,감삼동,단일로,기타
2,화요일,맑음,건조,차대사람,차도통행중,안전운전불이행,승용,남,70세,상해없음,...,3,2019,1,1,1,대구광역시,수성구,두산동,단일로,기타
3,화요일,맑음,건조,차대차,추돌,안전운전불이행,승용,남,49세,상해없음,...,5,2019,1,1,2,대구광역시,북구,복현동,단일로,기타
4,화요일,맑음,건조,차대차,추돌,안전운전불이행,승용,남,30세,상해없음,...,3,2019,1,1,4,대구광역시,동구,신암동,단일로,기타


Unnamed: 0,요일,기상상태,노면상태,사고유형,연,월,일,시간,도시,구,동,도로형태1,도로형태2
0,토요일,맑음,건조,차대사람,2022,1,1,1,대구광역시,수성구,상동,교차로,교차로안
1,토요일,맑음,건조,차대사람,2022,1,1,1,대구광역시,수성구,지산동,단일로,기타
2,토요일,맑음,건조,차대차,2022,1,1,4,대구광역시,수성구,수성동2가,교차로,교차로안
3,토요일,맑음,건조,차대차,2022,1,1,4,대구광역시,수성구,신매동,단일로,기타
4,토요일,맑음,건조,차대차,2022,1,1,6,대구광역시,달서구,감삼동,교차로,교차로안


In [8]:
# Target is the ECLO column of the training frame.
y_train = train_df['ECLO'].copy()

# Features: the training frame is restricted to the columns that the test
# frame has, in the test frame's column order.
X_test = test_df.copy()
X_train = train_df.loc[:, X_test.columns].copy()

In [9]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
# list(X_train.dtypes[X_train.dtypes == "object"].index) : ['요일', '기상상태', '노면상태', '사고유형', '도시', '구', '동', '도로형태1', '도로형태2']
# '도시' is dropped because it is the same value (대구광역시) for every row.
# '연' is dropped because the year-over-year trend was judged unimportant.
X_train = X_train.drop(columns=['도시', '연'])
X_test = X_test.drop(columns=['도시', '연'])
categorical_features = ['요일', '기상상태', '노면상태', '사고유형', '구', '동', '도로형태1', '도로형태2']

# Remember how many rows belong to train before stacking, so the split back
# below does not depend on the order in which X_train / X_test are reassigned.
n_train = len(X_train)

# Encode train and test together so both end up with the same dummy columns.
data = pd.concat([X_train, X_test])

# One call replaces the per-feature drop + concat loop: the remaining columns
# stay first and the dummies follow in the order of `categorical_features`,
# each prefixed with its source column name.
data = pd.get_dummies(data, columns=categorical_features)

# Split the data back into train and test dataframes (positional slicing).
X_train = data.iloc[:n_train]
X_test = data.iloc[n_train:]

Modeling

In [16]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training rows for validation. random_state is pinned to
# the value passed to seed_everything so the split is the same on every run
# instead of depending on how much of the global NumPy RNG state earlier
# cells have consumed.
X_tra, X_val, y_tra, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=77)

Prediction

In [117]:
# Final Model
# NOTE(review): the right-hand side of the assignment below is missing, so this
# cell is a SyntaxError as written. The captured log output under this cell is
# from LightGBM, so presumably a LightGBM regressor was constructed here --
# restore the estimator (and its import) before running. TODO confirm.
model = 
# Fit on the full training set, not on the X_tra / y_tra hold-out split.
model.fit(X_train, y_train)

# Predict for the test rows; the bare name displays the result.
prediction = model.predict(X_test)
prediction

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000947 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 435
[LightGBM] [Info] Number of data points in the train set: 39609, number of used features: 186
[LightGBM] [Info] Start training from score 4.726704


array([4.89619759, 4.02794758, 6.04610207, ..., 4.96844638, 5.06336034,
       5.13113592])

Submission

In [118]:
# Build the submission from a copy of the sample submission with its ECLO
# column set to the model predictions; sample_submission itself is unchanged.
baseline_submission = sample_submission.assign(ECLO=prediction)
baseline_submission

Unnamed: 0,ID,ECLO
0,ACCIDENT_39609,4.896198
1,ACCIDENT_39610,4.027948
2,ACCIDENT_39611,6.046102
3,ACCIDENT_39612,5.519666
4,ACCIDENT_39613,5.078919
...,...,...
10958,ACCIDENT_50567,6.867764
10959,ACCIDENT_50568,4.786608
10960,ACCIDENT_50569,4.968446
10961,ACCIDENT_50570,5.063360


In [119]:
# Write the submission without the index column. The previous target '.csv'
# had an empty file stem; use a descriptive file name instead.
baseline_submission.to_csv('baseline_submission.csv', index=False)