## Import data

In [101]:
import pandas as pd 
import os
import random
import numpy as np
import matplotlib.pyplot as plt


path = "/mnt/d/data/accident/"

train_org = pd.read_csv(path + 'train.csv') 
test_org = pd.read_csv(path + 'test.csv')

sample_submission = pd.read_csv(path+"sample_submission.csv")

## Set seed

In [102]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

## train, test 데이터 기간 확인

In [103]:
display(f"train : {train_org.iloc[0]['사고일시']} ~ {train_org.iloc[-1]['사고일시']}")
display(f"test : {test_org.iloc[0]['사고일시']} ~ {test_org.iloc[-1]['사고일시']}")     

'train : 2019-01-01 00 ~ 2021-12-31 23'

'test : 2022-01-01 01 ~ 2022-12-31 21'

# **데이터 전처리**  

In [104]:
train_df = train_org.copy()
test_df = test_org.copy()

time_pattern = r'(\d{4})-(\d{1,2})-(\d{1,2}) (\d{1,2})' 

# train_df[['연', '월', '일', '시간']] = train_org['사고일시'].str.extract(time_pattern)
# train_df[['연', '월', '일', '시간']] = train_df[['연', '월', '일', '시간']].apply(pd.to_numeric) # 추출된 문자열을 수치화해줍니다 
# train_df = train_df.drop(columns=['사고일시']) # 정보 추출이 완료된 '사고일시' 컬럼은 제거합니다 

# 해당 과정을 test_x에 대해서도 반복해줍니다 
# test_df[['연', '월', '일', '시간']] = test_org['사고일시'].str.extract(time_pattern)
# test_df[['연', '월', '일', '시간']] = test_df[['연', '월', '일', '시간']].apply(pd.to_numeric)
# test_df = test_df.drop(columns=['사고일시'])

location_pattern = r'(\S+) (\S+) (\S+)'

train_df[['도시', '구', '동']] = train_org['시군구'].str.extract(location_pattern)
train_df = train_df.drop(columns=['시군구'])

test_df[['도시', '구', '동']] = test_org['시군구'].str.extract(location_pattern)
test_df = test_df.drop(columns=['시군구'])

road_pattern = r'(.+) - (.+)'

train_df[['도로형태1', '도로형태2']] = train_org['도로형태'].str.extract(road_pattern)
train_df = train_df.drop(columns=['도로형태'])

test_df[['도로형태1', '도로형태2']] = test_org['도로형태'].str.extract(road_pattern)
test_df = test_df.drop(columns=['도로형태'])

## Use additional data

In [105]:
light_df = pd.read_csv(os.path.join(path, "external_open/light.csv"), encoding='cp949')[['설치개수', '소재지지번주소']]

location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

light_df[['도시', '구', '동', '번지']] = light_df['소재지지번주소'].str.extract(location_pattern)
light_df = light_df.drop(columns=['소재지지번주소', '번지'])

light_df = light_df.groupby(['도시', '구', '동']).sum().reset_index()
light_df.reset_index(inplace=True, drop=True)

  light_df = pd.read_csv(os.path.join(path, "external_open/light.csv"), encoding='cp949')[['설치개수', '소재지지번주소']]


In [106]:
child_area_df = pd.read_csv(os.path.join(path, "external_open/child.csv"), encoding='cp949')[['CCTV설치대수', '소재지지번주소']]
child_area_df['보호구역수'] = 1

location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

child_area_df[['도시', '구', '동', '번지']] = child_area_df['소재지지번주소'].str.extract(location_pattern)
child_area_df = child_area_df.drop(columns=['소재지지번주소', '번지'])

child_area_df = child_area_df.groupby(['도시', '구', '동']).sum().reset_index()
child_area_df.reset_index(inplace=True, drop=True)

In [107]:
parking_df = pd.read_csv(os.path.join(path, "external_open/parking.csv"), encoding='cp949')[['소재지지번주소', '급지구분', "주차구획수"]]
parking_df = pd.get_dummies(parking_df, columns=['급지구분'])

location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

parking_df[['도시', '구', '동', '번지']] = parking_df['소재지지번주소'].str.extract(location_pattern)
parking_df = parking_df.drop(columns=['소재지지번주소', '번지'])

parking_df = parking_df.groupby(['도시', '구', '동']).sum().reset_index()
parking_df.reset_index(inplace=True, drop=True)

### Merge with original data

In [108]:
train_df = pd.merge(train_df, light_df, how='left', on=['도시', '구', '동'])
train_df = pd.merge(train_df, child_area_df, how='left', on=['도시', '구', '동'])
train_df = pd.merge(train_df, parking_df, how='left', on=['도시', '구', '동'])

test_df = pd.merge(test_df, light_df, how='left', on=['도시', '구', '동'])
test_df = pd.merge(test_df, child_area_df, how='left', on=['도시', '구', '동'])
test_df = pd.merge(test_df, parking_df, how='left', on=['도시', '구', '동'])

## Drop labels not included in test_x

In [109]:
test_x = test_df.drop(columns=['ID']).copy()
train_x = train_df[test_x.columns].copy()
train_y = train_df['ECLO'].copy()

## **범주형(Categorical) 변수, 수치형 변수로 변환하기**

모델 학습을 위해 train_x의 문자열 타입의 컬럼들을 추출하고, LabelEncoder를 활용하여 이 컬럼들을 모두 수치형 변수로 변환해 보겠습니다

In [110]:
from sklearn.preprocessing import LabelEncoder

categorical_features = list(train_x.dtypes[train_x.dtypes == "object"].index)
categorical_features.remove("사고일시")
# 추출된 문자열 변수 확인
display(categorical_features)

for i in categorical_features:
    le = LabelEncoder()
    le=le.fit(train_x[i]) 
    train_x[i]=le.transform(train_x[i])
    
    test_x[i]=le.transform(test_x[i])

['요일', '기상상태', '노면상태', '사고유형', '도시', '구', '동', '도로형태1', '도로형태2']

In [111]:
train_x.fillna(0, inplace=True)
test_x.fillna(0, inplace=True)

# 시계열 데이터 변환

In [112]:
from  autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

In [113]:
train_x

Unnamed: 0,사고일시,요일,기상상태,노면상태,사고유형,도시,구,동,도로형태1,도로형태2,설치개수,CCTV설치대수,보호구역수,주차구획수,급지구분_1,급지구분_2,급지구분_3
0,2019-01-01 00,6,2,0,0,0,7,40,2,5,391.0,13.0,2.0,500.0,11.0,0.0,0.0
1,2019-01-01 00,6,5,0,0,0,1,4,2,5,932.0,0.0,0.0,114.0,0.0,1.0,3.0
2,2019-01-01 01,6,2,0,0,0,6,66,2,5,473.0,0.0,5.0,0.0,0.0,0.0,0.0
3,2019-01-01 02,6,2,0,1,0,4,79,2,5,534.0,32.0,11.0,374.0,0.0,9.0,5.0
4,2019-01-01 04,6,2,0,1,0,3,129,2,5,2057.0,0.0,0.0,63.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,2021-12-31 19,0,2,0,1,0,6,118,0,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0
39605,2021-12-31 19,0,2,0,1,0,1,103,2,5,843.0,0.0,0.0,466.0,0.0,0.0,5.0
39606,2021-12-31 21,0,2,0,1,0,1,144,0,3,164.0,0.0,0.0,32.0,0.0,1.0,0.0
39607,2021-12-31 22,0,2,0,1,0,1,158,1,5,210.0,0.0,0.0,188.0,0.0,0.0,1.0


기상상태, 노면상태, 도로형태등의 정보는 동별로 다르지만 기존의 데이터에서 추가된 설치개수~ 이후의 정보는 모두 동일  
또한 feature importance는 시계열 정보 -> 추가된 데이터이므로 기존의 train_x의 데이터는 생략하고 진행  
**요일은 datetime 모듈을 이용해 다시 추가해줄것**

In [114]:
#참고
display(f"train : {train_org.iloc[0]['사고일시']} ~ {train_org.iloc[-1]['사고일시']}")
display(f"test : {test_org.iloc[0]['사고일시']} ~ {test_org.iloc[-1]['사고일시']}")     

'train : 2019-01-01 00 ~ 2021-12-31 23'

'test : 2022-01-01 01 ~ 2022-12-31 21'

In [115]:
train_x = train_x[['사고일시', '요일', '동', '설치개수', 'CCTV설치대수', '보호구역수', '주차구획수', '급지구분_1', '급지구분_2', '급지구분_3']]
test_x = test_x[['사고일시', '요일', '동', '설치개수', 'CCTV설치대수', '보호구역수', '주차구획수', '급지구분_1', '급지구분_2', '급지구분_3']]

In [116]:
train_x = pd.concat([train_x, train_y], axis=1)

In [117]:
train_x["사고일시"] = pd.to_datetime(train_x["사고일시"])
# temp["사고일시"] = temp["사고일시"].dt.date

In [121]:
train_x = (train_x.groupby(["동"])['사고일시']
   .apply(lambda x:pd.date_range(start="2019-01-01", end="2021-12-31", freq="D"))
   .explode()
   .reset_index()
   .merge(train_x, how='left')
)

In [125]:
train_x

Unnamed: 0,동,사고일시,요일,설치개수,CCTV설치대수,보호구역수,주차구획수,급지구분_1,급지구분_2,급지구분_3,ECLO
0,0.0,2019-01-01,,,,,,,,,
1,0.0,2019-01-02,,,,,,,,,
2,0.0,2019-01-03,,,,,,,,,
3,0.0,2019-01-04,,,,,,,,,
4,0.0,2019-01-05,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
214819,195.0,2021-12-27,,,,,,,,,
214820,195.0,2021-12-28,,,,,,,,,
214821,195.0,2021-12-29,,,,,,,,,
214822,195.0,2021-12-30,,,,,,,,,


In [123]:
temp = []
for i in range(196):
    train_x[train_x["동"]==i] = train_x[train_x["동"]==i].drop_duplicates(subset=["사고일시"])
    # if cnt!=1096:
        # temp.append(i)

In [132]:
train_x = train_x.dropna(axis=0, how="all")

In [89]:
temp = []
for i in range(196):
    cnt = len(train_x[train_x["동"]==i])
    if cnt!=1096:
        temp.append(i)

In [88]:
train_x

Unnamed: 0,동,사고일시,요일,설치개수,CCTV설치대수,보호구역수,주차구획수,급지구분_1,급지구분_2,급지구분_3,ECLO
0,0.0,2019-01-01,,,,,,,,,
1,0.0,2019-01-02,,,,,,,,,
2,0.0,2019-01-03,,,,,,,,,
3,0.0,2019-01-04,,,,,,,,,
4,0.0,2019-01-05,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
214819,195.0,2021-12-27,,,,,,,,,
214820,195.0,2021-12-28,,,,,,,,,
214821,195.0,2021-12-29,,,,,,,,,
214822,195.0,2021-12-30,,,,,,,,,


In [133]:
train_x["ECLO"] = train_x["ECLO"].fillna(0)
train_x = train_x.bfill().ffill()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_x["ECLO"] = train_x["ECLO"].fillna(0)


In [135]:
train_x = train_x.rename(columns={"ECLO":"target", "사고일시":"timestamp", "동":"item_id"})

In [136]:
display(f"test : {test_org.iloc[0]['사고일시']} ~ {test_org.iloc[-1]['사고일시']}")     


'test : 2022-01-01 01 ~ 2022-12-31 21'

In [138]:
train_x["item_id"] = train_x["item_id"].astype(int)

## Train Timeseries model

In [139]:
data = TimeSeriesDataFrame(train_x)
predictor = TimeSeriesPredictor( 
    prediction_length=365,
    target="target",
    eval_metric="RMSE",
)
# seed 고정
predictor.fit(data, num_val_windows=1)

TimeSeriesPredictor.fit() called
Fitting with arguments:
{'enable_ensemble': True,
 'evaluation_metric': 'RMSE',
 'excluded_model_types': None,
 'hyperparameter_tune_kwargs': None,
 'hyperparameters': 'default',
 'num_val_windows': 1,
 'prediction_length': 365,
 'random_seed': None,
 'target': 'target',
 'time_limit': None,
 'verbosity': 2}
Provided training data set with 214816 rows, 196 items (item = single time series). Average time series length is 1096.0. Data frequency is 'D'.
AutoGluon will save models to AutogluonModels/ag-20231128_113909
AutoGluon will gauge predictive performance using evaluation metric: 'RMSE'
	This metric's sign has been flipped to adhere to being 'higher is better'. The reported score can be multiplied by -1 to get the metric value.

Provided dataset contains following columns:
	target:           'target'
	past covariates:  ['요일', '설치개수', 'CCTV설치대수', '보호구역수', '주차구획수', '급지구분_1', '급지구분_2', '급지구분_3']

Starting training. Start time is 2023-11-28 20:39:09
Model

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x7f15786b7790>

In [140]:
predictor.leaderboard()

              model  score_val  pred_time_val  fit_time_marginal  fit_order
0  WeightedEnsemble  -0.352106     100.022938          15.838647          7
1            DeepAR  -0.352791      90.359732        2349.450203          6
2             Naive  -0.352791       3.567806           0.085556          1
3           AutoETS  -0.353049       9.663206           0.079193          4
4             Theta  -0.362146      16.197487           0.079719          3
5     SeasonalNaive  -0.497857       0.161065           0.087413          2
6  RecursiveTabular  -0.555632       7.651725           2.517506          5


Unnamed: 0,model,score_val,pred_time_val,fit_time_marginal,fit_order
0,WeightedEnsemble,-0.352106,100.022938,15.838647,7
1,DeepAR,-0.352791,90.359732,2349.450203,6
2,Naive,-0.352791,3.567806,0.085556,1
3,AutoETS,-0.353049,9.663206,0.079193,4
4,Theta,-0.362146,16.197487,0.079719,3
5,SeasonalNaive,-0.497857,0.161065,0.087413,2
6,RecursiveTabular,-0.555632,7.651725,2.517506,5


In [141]:
pred = predictor.predict(train_x)

Global seed set to 123
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble


In [167]:
test_x["사고일시"] = pd.to_datetime(test_x["사고일시"], format="%Y-%m-%d").dt.floor('D')

In [168]:
test_x

Unnamed: 0,사고일시,요일,동,설치개수,CCTV설치대수,보호구역수,주차구획수,급지구분_1,급지구분_2,급지구분_3
0,2022-01-01,5,99,700.0,0.0,5.0,0.0,0.0,0.0,0.0
1,2022-01-01,5,168,0.0,0.0,10.0,183.0,0.0,0.0,2.0
2,2022-01-01,5,117,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2022-01-01,5,126,0.0,0.0,7.0,153.0,0.0,2.0,1.0
4,2022-01-01,5,4,932.0,0.0,0.0,114.0,0.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...
10958,2022-12-31,5,38,5377.0,106.0,26.0,709.0,20.0,1.0,0.0
10959,2022-12-31,5,123,0.0,0.0,5.0,0.0,0.0,0.0,0.0
10960,2022-12-31,5,134,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10961,2022-12-31,5,77,0.0,0.0,7.0,0.0,0.0,0.0,0.0


In [181]:
pred = pred.rename(columns={"timestamp":"사고일시", "item_id":"동"})

In [187]:
test_x

Unnamed: 0,사고일시,요일,동,설치개수,CCTV설치대수,보호구역수,주차구획수,급지구분_1,급지구분_2,급지구분_3
0,2022-01-01,5,99,700.0,0.0,5.0,0.0,0.0,0.0,0.0
1,2022-01-01,5,168,0.0,0.0,10.0,183.0,0.0,0.0,2.0
2,2022-01-01,5,117,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2022-01-01,5,126,0.0,0.0,7.0,153.0,0.0,2.0,1.0
4,2022-01-01,5,4,932.0,0.0,0.0,114.0,0.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...
10958,2022-12-31,5,38,5377.0,106.0,26.0,709.0,20.0,1.0,0.0
10959,2022-12-31,5,123,0.0,0.0,5.0,0.0,0.0,0.0,0.0
10960,2022-12-31,5,134,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10961,2022-12-31,5,77,0.0,0.0,7.0,0.0,0.0,0.0,0.0


In [186]:
pd.merge(test_x, pred, left_on=["사고일시","동"], right_on=["사고일시","동"],how="left")

Unnamed: 0,사고일시,요일,동,설치개수,CCTV설치대수,보호구역수,주차구획수,급지구분_1,급지구분_2,급지구분_3,mean,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0,2022-01-01,5,99,700.0,0.0,5.0,0.0,0.0,0.0,0.0,0.020977,-0.344892,-0.219292,-0.128730,-0.051348,0.020979,0.093307,0.170688,0.261251,0.386847
1,2022-01-01,5,168,0.0,0.0,10.0,183.0,0.0,0.0,2.0,0.018880,-0.322254,-0.205145,-0.120705,-0.048556,0.018882,0.086318,0.158468,0.242907,0.360017
2,2022-01-01,5,117,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.010995,-0.268277,-0.172407,-0.103279,-0.044212,0.010995,0.066203,0.125270,0.194398,0.290265
3,2022-01-01,5,126,0.0,0.0,7.0,153.0,0.0,2.0,1.0,0.013445,-0.275785,-0.176496,-0.104903,-0.043730,0.013446,0.070622,0.131795,0.203388,0.302675
4,2022-01-01,5,4,932.0,0.0,0.0,114.0,0.0,1.0,3.0,0.021935,-0.303429,-0.191737,-0.111200,-0.042383,0.021936,0.086255,0.155070,0.235606,0.347307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10958,2022-12-31,5,38,5377.0,106.0,26.0,709.0,20.0,1.0,0.0,0.078021,-0.523201,-0.316800,-0.167983,-0.040826,0.078023,0.196877,0.324041,0.472854,0.679253
10959,2022-12-31,5,123,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.010921,-0.172850,-0.109765,-0.064276,-0.025408,0.010921,0.047249,0.086117,0.131606,0.194693
10960,2022-12-31,5,134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008833,-0.264183,-0.170459,-0.102881,-0.045138,0.008832,0.062803,0.120547,0.188126,0.281846
10961,2022-12-31,5,77,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.005452,-0.131864,-0.084726,-0.050736,-0.021693,0.005452,0.032598,0.061640,0.095630,0.142768


In [149]:
pred = pred.reset_index()

# Model Train & Prediction

In [None]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor() 
model.fit(train_x, train_y)

prediction = model.predict(test_x)

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor

train_x = TabularDataset(train_x)
train_x["label"] = train_y
predictor = TabularPredictor(label="label", eval_metric="root_mean_squared_error", problem_type="regression").fit(train_x)

No path specified. Models will be saved in: "AutogluonModels/ag-20231128_094953"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231128_094953"
AutoGluon Version:  0.8.2
Python Version:     3.10.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Oct 5 21:02:42 UTC 2023
Disk Space Avail:   1936.82 GB / 2000.40 GB (96.8%)
Train Data Rows:    39609
Train Data Columns: 20
Label Column: label
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    10630.94 MB
	Train Data (Original)  Memory Usage: 6.34 MB (0.1% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatur

In [None]:
predictor.leaderboard()

                  model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -3.154248       0.095030  20.662273                0.000226           0.159438            2       True         12
1              CatBoost  -3.154663       0.001983   0.657348                0.001983           0.657348            1       True          6
2              LightGBM  -3.161942       0.001762   0.201109                0.001762           0.201109            1       True          4
3               XGBoost  -3.162130       0.004546   0.389998                0.004546           0.389998            1       True          9
4       NeuralNetFastAI  -3.168732       0.019781  16.570037                0.019781          16.570037            1       True          8
5            LightGBMXT  -3.171463       0.002489   0.584277                0.002489           0.584277            1       True          3
6         LightGBMLarge  -3

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-3.154248,0.09503,20.662273,0.000226,0.159438,2,True,12
1,CatBoost,-3.154663,0.001983,0.657348,0.001983,0.657348,1,True,6
2,LightGBM,-3.161942,0.001762,0.201109,0.001762,0.201109,1,True,4
3,XGBoost,-3.16213,0.004546,0.389998,0.004546,0.389998,1,True,9
4,NeuralNetFastAI,-3.168732,0.019781,16.570037,0.019781,16.570037,1,True,8
5,LightGBMXT,-3.171463,0.002489,0.584277,0.002489,0.584277,1,True,3
6,LightGBMLarge,-3.174114,0.002002,0.339778,0.002002,0.339778,1,True,11
7,NeuralNetTorch,-3.203236,0.016381,12.109413,0.016381,12.109413,1,True,10
8,RandomForestMSE,-3.273199,0.068493,2.885451,0.068493,2.885451,1,True,5
9,ExtraTreesMSE,-3.287109,0.067201,1.597476,0.067201,1.597476,1,True,7


In [None]:
prediction = predictor.predict(test_x)
y_pred.head()

  self._init_pool(data, label, cat_features, text_features, embedding_features, pairs, weight,


NameError: name 'y_pred' is not defined

## **Submission 양식 확인**

sample_submission.csv 화일 데이터(sample_submission)를 그대로 복사한 후, 
양식의 'ECLO' 컬럼에 test_x에 대한 ECLO(y) 예측값을 입력합니다 

In [None]:
baseline_submission

Unnamed: 0,ID,ECLO
0,ACCIDENT_39609,0
1,ACCIDENT_39610,0
2,ACCIDENT_39611,0
3,ACCIDENT_39612,0
4,ACCIDENT_39613,0
...,...,...
10958,ACCIDENT_50567,0
10959,ACCIDENT_50568,0
10960,ACCIDENT_50569,0
10961,ACCIDENT_50570,0


In [146]:
baseline_submission = sample_submission.copy()
baseline_submission['ECLO'] = prediction
baseline_submission 

## **답안지 저장 및 제출하기**

In [None]:
baseline_submission.to_csv('baseline_submit.csv', index=False)