In [None]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score

from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the training set, the test set and the submission template from CSV
# files in the current working directory.
train_df, test_df, sample_submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Display the (rows, columns) shape of the test set.
test_df.shape

(10963, 8)

In [None]:
# List the column names present in the test set.
test_df.columns

Index(['ID', '사고일시', '요일', '기상상태', '시군구', '도로형태', '노면상태', '사고유형'], dtype='object')

In [None]:
# Model inputs are restricted to the columns that exist in the test set,
# minus the 'ID' and '사고일시' columns.
test_features = test_df.drop(columns=['ID', '사고일시'])
features_columns = test_features.columns

# Take an explicit copy of the selected training columns. The label-encoding
# cell assigns into this frame column by column, and assigning into a bare
# column selection of train_df triggers pandas' SettingWithCopyWarning.
train_features = train_df[features_columns].copy()

# Regression target.
train_target = train_df['ECLO']

In [None]:
# Display the (rows, columns) shape of the training feature frame.
train_features.shape

(39609, 6)

In [None]:
# Frequency of each distinct value in the test set's '요일' column.
test_features['요일'].value_counts()

금요일    1743
토요일    1638
목요일    1636
화요일    1599
수요일    1592
월요일    1587
일요일    1168
Name: 요일, dtype: int64

In [None]:
# 25th and 75th percentiles of the ECLO target in the training data.
Q1, Q3 = train_df["ECLO"].quantile([0.25,0.75])

In [None]:
# Show the 25th-percentile ECLO value computed above.
Q1

3.0

In [None]:
# Number of missing values in each training feature column.
train_features.isnull().sum()

요일      0
기상상태    0
시군구     0
도로형태    0
노면상태    0
사고유형    0
dtype: int64

In [None]:
# Number of missing values in every column of the full training frame.
train_df.isnull().sum()

ID               0
사고일시             0
요일               0
기상상태             0
시군구              0
도로형태             0
노면상태             0
사고유형             0
사고유형 - 세부분류      0
법규위반             0
가해운전자 차종         0
가해운전자 성별         0
가해운전자 연령         0
가해운전자 상해정도       0
피해운전자 차종       991
피해운전자 성별       991
피해운전자 연령       991
피해운전자 상해정도     991
사망자수             0
중상자수             0
경상자수             0
부상자수             0
ECLO             0
dtype: int64

In [None]:
# Per-column ratio of missing entries to non-missing entries. count()
# excludes NaN, so the denominator is the non-null count rather than the
# total number of rows; train_df.isnull().mean() would give the share of rows.
train_df.isnull().sum()/train_df.count()

ID             0.000000
사고일시           0.000000
요일             0.000000
기상상태           0.000000
시군구            0.000000
도로형태           0.000000
노면상태           0.000000
사고유형           0.000000
사고유형 - 세부분류    0.000000
법규위반           0.000000
가해운전자 차종       0.000000
가해운전자 성별       0.000000
가해운전자 연령       0.000000
가해운전자 상해정도     0.000000
피해운전자 차종       0.025662
피해운전자 성별       0.025662
피해운전자 연령       0.025662
피해운전자 상해정도     0.025662
사망자수           0.000000
중상자수           0.000000
경상자수           0.000000
부상자수           0.000000
ECLO           0.000000
dtype: float64

In [None]:
# Replace every missing value in the training frame with 0 and rebind
# train_df to the filled frame. fillna returns a new frame, so the
# train_features / train_target objects created earlier are not modified.
train_df = train_df.fillna(0)

In [None]:
# Integer-encode every object-dtype feature column. Each encoder is fitted on
# the training values only; labels that appear only in the test set are
# appended after the fitted classes so that transform() can still map them.
cat_features = [
    name for name, dtype in train_features.dtypes.items() if dtype == "object"
]

for col in cat_features:
  encoder = LabelEncoder().fit(train_features[col])
  train_features[col] = encoder.transform(train_features[col])

  # Test-only labels, in the sorted order produced by np.unique.
  unseen = [label for label in np.unique(test_features[col])
            if label not in encoder.classes_]
  if unseen:
    encoder.classes_ = np.append(encoder.classes_, unseen)
  test_features[col] = encoder.transform(test_features[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_features[i]=le.transform(train_features[i])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_features[i]=le.transform(train_features[i])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_features[i]=le.transform(train_features[i])
A value is trying to be set on a copy of a slice from a

In [None]:
# Display the test feature frame after label encoding.
test_features

Unnamed: 0,요일,기상상태,시군구,도로형태,노면상태,사고유형
0,5,2,131,1,0,0
1,5,2,143,6,0,0
2,5,2,134,1,0,1
3,5,2,138,6,0,1
4,5,2,4,1,0,1
...,...,...,...,...,...,...
10958,5,2,0,8,0,1
10959,5,2,137,6,0,1
10960,5,2,139,6,0,1
10961,5,2,127,0,0,1


In [None]:
from sklearn.model_selection import KFold

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# 6-fold stratified splitter; rows are shuffled with a fixed seed so the
# folds are the same on every run.
skf = StratifiedKFold(n_splits = 6, shuffle=True, random_state = 42)

In [None]:
# Random-forest hyperparameters: fixed seed, no training log output.
hp = {
    "random_state" : 42,
    "verbose" : 0
}

model = RandomForestRegressor(**hp)
rmse_list = []

# Cross-validation: fit on each training split, score on the held-out split.
# Folds are stratified on the ECLO target values passed to skf.split.
# Note: after the loop, `model` holds the fit from the final fold only.
for n_iter, (train_index, valid_index) in enumerate(
    skf.split(train_features, train_target), start=1
):
  train_x, valid_x = train_features.iloc[train_index], train_features.iloc[valid_index]
  train_y, valid_y = train_target.iloc[train_index], train_target.iloc[valid_index]

  model.fit(train_x,train_y)
  valid_pred = model.predict(valid_x)

  # RMSE as sqrt(MSE): the `squared=False` argument of mean_squared_error is
  # deprecated in scikit-learn 1.4 and removed in later releases.
  rmse = np.sqrt(mean_squared_error(valid_y, valid_pred))
  rmse_list.append(rmse)
  print(f'{n_iter}번째 Stratified K-Fold RMSE: {rmse}')

print('-'*50)
print(f'교차 검증 RMSE: {np.mean(rmse_list)}')



1번째 Stratified K-Fold RMSE: 3.3758517232146863
2번째 Stratified K-Fold RMSE: 3.3740583006331732
3번째 Stratified K-Fold RMSE: 3.407300570054935
4번째 Stratified K-Fold RMSE: 3.357323762975529
5번째 Stratified K-Fold RMSE: 3.4319689266576123
6번째 Stratified K-Fold RMSE: 3.3881077373469695
--------------------------------------------------
교차 검증 RMSE: 3.389101836813818


In [None]:
# After the cross-validation loop `model` holds the fit from the final fold
# only, which saw 5/6 of the training rows. Refit on the complete training
# set so the model used for the submission learns from every labelled row.
model.fit(train_features, train_target)
prediction = model.predict(test_features)

In [None]:
# Put the predictions into the template's 'ECLO' column and write the
# submission file without the index column.
sample_submission_df['ECLO'] = prediction
sample_submission_df.to_csv("LeeSubmission.csv", index=False)

In [None]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

In [None]:
# Re-read the three CSV inputs from the working directory; train_df and
# test_df are rebound to the freshly loaded frames.
train_df, test_df, sub_df = map(
    pd.read_csv, ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Model inputs are restricted to the columns that exist in the test set,
# minus the 'ID' and '사고일시' columns.
test_features = test_df.drop(columns=['ID', '사고일시'])
sa_columns = test_features.columns

# Select from the freshly loaded train_df. The original selected from
# `train_features`, which is undefined on a fresh kernel and otherwise is the
# already-encoded frame from the earlier section; in that case the encoding
# cell finds no object columns and test_features is never encoded.
# .copy() makes the frame safe to assign into during label encoding.
train_features = train_df[sa_columns].copy()

# Regression target.
train_target = train_df['ECLO']

In [None]:
# Integer-encode every object-dtype feature column. Each encoder is fitted on
# the training values only; labels seen only in the test set are appended
# after the fitted classes so that transform() can still map them.
cat_feature = [
    name for name, dtype in train_features.dtypes.items() if dtype == 'object'
]

for col in cat_feature:
  encoder = LabelEncoder().fit(train_features[col])
  train_features[col] = encoder.transform(train_features[col])

  # Test-only labels, in the sorted order produced by np.unique.
  unseen = [label for label in np.unique(test_features[col])
            if label not in encoder.classes_]
  if unseen:
    encoder.classes_ = np.append(encoder.classes_, unseen)
  test_features[col] = encoder.transform(test_features[col])

In [None]:
# Cross-validation setup: seeded random-forest regressor and a shuffled,
# seeded 6-fold stratified splitter.
hp = dict(random_state=42, verbose=0)
model = RandomForestRegressor(**hp)
skf = StratifiedKFold(n_splits=6, shuffle=True, random_state=42)

# Fold counter and per-fold RMSE values, updated by the evaluation loop in
# the next cell.
rmse_lst = []
n_iter = 0

In [None]:
# Cross-validation: fit on each training split, score on the held-out split.
# Folds are stratified on the ECLO target values passed to skf.split.
# n_iter and rmse_lst are initialised in the preceding setup cell.
for train_index, valid_index in skf.split(train_features, train_target):
  n_iter+=1
  train_x, valid_x = train_features.iloc[train_index], train_features.iloc[valid_index]
  train_y, valid_y = train_target.iloc[train_index], train_target.iloc[valid_index]

  model.fit(train_x, train_y)
  pred = model.predict(valid_x)
  # RMSE as sqrt(MSE): the `squared=False` argument of mean_squared_error is
  # deprecated in scikit-learn 1.4 and removed in later releases.
  rmse = np.sqrt(mean_squared_error(valid_y, pred))
  rmse_lst.append(rmse)

  print(f'{n_iter}번째 Stratified rmse : {rmse}')

print('-'*50)
# Average the scores collected by this loop. The original averaged
# `rmse_list`, the list from the earlier section, not `rmse_lst`.
print(f'교차 검증 RMSE: {np.mean(rmse_lst)}')

In [None]:
# After the cross-validation loop `model` holds the fit from the final fold
# only, which saw 5/6 of the training rows. Refit on the complete training
# set so the model used for the submission learns from every labelled row.
model.fit(train_features, train_target)
prediction = model.predict(test_features)

# Put the predictions into the template's 'ECLO' column and write the
# submission file without the index column.
sub_df['ECLO'] = prediction
sub_df.to_csv("Leesubmission.csv", index=False)