## Import & Data Load

In [29]:
# Optuna Libraries
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
import pandas as pd
from tqdm import tqdm
import numpy as np

# LGBM Regressor
from lightgbm import LGBMRegressor

# train_test_split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Evaluation Score
from sklearn.metrics import mean_absolute_error

In [30]:
train = pd.read_csv('train.csv').drop(columns=['SAMPLE_ID'])
test = pd.read_csv('test.csv').drop(columns=['SAMPLE_ID'])

In [31]:
train.columns

Index(['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'DIST', 'ATA', 'ID',
       'BREADTH', 'BUILT', 'DEADWEIGHT', 'DEPTH', 'DRAUGHT', 'GT', 'LENGTH',
       'SHIPMANAGER', 'FLAG', 'U_WIND', 'V_WIND', 'AIR_TEMPERATURE', 'BN',
       'ATA_LT', 'PORT_SIZE', 'CI_HOUR'],
      dtype='object')

In [32]:
# datetime 컬럼 처리
import bisect

from sklearn.preprocessing import LabelEncoder
train['ATA'] = pd.to_datetime(train['ATA'])
test['ATA'] = pd.to_datetime(test['ATA'])

# datetime을 여러 파생 변수로 변환
for df in [train, test]:
    df['year'] = df['ATA'].dt.year
    df['month'] = df['ATA'].dt.month
    df['day'] = df['ATA'].dt.day
    df['hour'] = df['ATA'].dt.hour
    df['minute'] = df['ATA'].dt.minute
    df['weekday'] = df['ATA'].dt.weekday

# datetime 컬럼 제거
train.drop(columns='ATA', inplace=True)
test.drop(columns='ATA', inplace=True)

# Categorical 컬럼 인코딩
categorical_features = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'ID', 'SHIPMANAGER', 'FLAG']
encoders = {}

for feature in tqdm(categorical_features, desc="Encoding features"):
    le = LabelEncoder()
    train[feature] = le.fit_transform(train[feature].astype(str))
    le_classes_set = set(le.classes_)
    test[feature] = test[feature].map(lambda s: '-1' if s not in le_classes_set else s)
    le_classes = le.classes_.tolist()
    bisect.insort_left(le_classes, '-1')
    le.classes_ = np.array(le_classes)
    test[feature] = le.transform(test[feature].astype(str))
    encoders[feature] = le

# 결측치 처리
train.fillna(train.mean(), inplace=True)
test.fillna(train.mean(), inplace=True)

Encoding features: 100%|██████████| 6/6 [00:01<00:00,  4.74it/s]


# Optuna

In [33]:
X = train.drop(columns='CI_HOUR')
y = train['CI_HOUR']

In [34]:
X

Unnamed: 0,ARI_CO,ARI_PO,SHIP_TYPE_CATEGORY,DIST,ID,BREADTH,BUILT,DEADWEIGHT,DEPTH,DRAUGHT,...,AIR_TEMPERATURE,BN,ATA_LT,PORT_SIZE,year,month,day,hour,minute,weekday
0,17,21,2,30.881018,24710,30.0,24,24300,10.0,10.0,...,18.862968,2.706992,5,0.002615,2018,12,17,21,29,0
1,7,81,0,0.000000,23140,30.0,13,35900,10.0,10.0,...,18.862968,2.706992,12,0.000217,2014,9,23,6,59,1
2,4,14,2,0.000000,19009,50.0,12,146000,30.0,20.0,...,18.862968,2.706992,6,0.001614,2015,2,3,22,0,1
3,8,101,2,0.000000,24048,20.0,18,6910,10.0,10.0,...,6.700000,2.629350,13,0.000356,2020,1,17,4,2,4
4,17,21,2,27.037650,911,50.0,10,116000,20.0,10.0,...,25.600000,2.495953,15,0.002615,2020,1,26,7,51,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391934,8,66,2,0.000000,23490,20.0,27,6820,10.0,10.0,...,18.862968,2.706992,14,0.000552,2017,6,6,5,2,1
391935,17,21,0,5.884603,10196,10.0,12,3160,10.0,10.0,...,27.300000,1.253491,8,0.002615,2019,10,16,0,36,2
391936,21,61,0,70.660241,8823,30.0,8,60300,20.0,10.0,...,21.100000,4.766257,18,0.000155,2021,3,23,22,35,1
391937,19,35,2,9.448179,9246,30.0,29,23800,10.0,10.0,...,18.862968,2.706992,15,0.000990,2015,1,8,7,15,3


In [35]:
val = pd.read_csv('corrected_test_with_target_v2.csv')
val.drop(['SAMPLE_ID'],axis=1,inplace=True)

# datetime 컬럼 처리
import bisect

from sklearn.preprocessing import LabelEncoder
val['ATA'] = pd.to_datetime(val['ATA'])
# datetime을 여러 파생 변수로 변환
for df in [val]:
    df['year'] = df['ATA'].dt.year
    df['month'] = df['ATA'].dt.month
    df['day'] = df['ATA'].dt.day
    df['hour'] = df['ATA'].dt.hour
    df['minute'] = df['ATA'].dt.minute
    df['weekday'] = df['ATA'].dt.weekday

# datetime 컬럼 제거
val.drop(columns='ATA', inplace=True)

# Categorical 컬럼 인코딩
categorical_features = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'ID', 'SHIPMANAGER', 'FLAG']
encoders = {}

for feature in tqdm(categorical_features, desc="Encoding features"):
    le = LabelEncoder()
    val[feature] = le.fit_transform(val[feature].astype(str))
    le_classes_set = set(le.classes_)
    le_classes = le.classes_.tolist()
    bisect.insort_left(le_classes, '-1')
    le.classes_ = np.array(le_classes)
    encoders[feature] = le

# 결측치 처리
val.fillna(val.mean(), inplace=True)
val

Encoding features: 100%|██████████| 6/6 [00:00<00:00, 22.29it/s]


Unnamed: 0,ARI_CO,ARI_PO,SHIP_TYPE_CATEGORY,DIST,ID,BREADTH,BUILT,DEADWEIGHT,DEPTH,DRAUGHT,...,BN,ATA_LT,PORT_SIZE,CI_HOUR,year,month,day,hour,minute,weekday
0,17,21,2,1.826589,8984,50,18,117000,30,20,...,1.587063,19,0.002615,4.561948,2020,6,18,11,58,3
1,4,90,1,25.399386,3860,10,13,3810,10,10,...,2.663972,6,0.001028,49.284748,2021,5,26,22,20,2
2,4,45,2,111.079467,5139,20,26,10900,10,10,...,3.255315,8,0.001743,21.803541,2019,12,16,0,9,0
3,2,19,0,9.175258,690,30,9,55800,20,10,...,2.709961,2,0.000182,166.918365,2015,11,16,5,30,0
4,8,66,2,0.000000,787,30,19,39800,20,10,...,2.709961,10,0.000552,0.000000,2018,10,24,1,11,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220486,1,76,0,21.712733,4223,60,15,298000,30,20,...,2.709961,13,0.000080,137.437531,2017,4,4,15,53,1
220487,2,23,0,4.870490,14465,30,9,80500,20,10,...,2.709961,13,0.000039,71.664566,2016,8,22,16,22,0
220488,8,88,3,17.068286,17634,10,14,1200,0,0,...,2.405268,23,0.000264,18.464554,2022,7,10,14,53,6
220489,4,62,1,0.000000,1930,10,27,3420,10,0,...,2.199039,22,0.000595,0.000000,2020,12,28,14,38,0


In [36]:
X_val = val.drop(columns='CI_HOUR')
y_val = val['CI_HOUR']
X_train = X.copy()
y_train = y.copy()

In [5]:
#X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [37]:
# random sampler
sampler = TPESampler(seed=42)

# define function
def objective(trial):

    lgbm_param = {
        'objective': 'regression',
        'verbose': -1,
        'metric': 'mae', 
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=1, log=True), 
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 10.0),
        'max_depth': trial.suggest_int('max_depth',3, 15),
        'learning_rate': trial.suggest_loguniform("learning_rate", 1e-8, 1e-2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_loguniform('subsample', 0.4, 1),
    }

    # Generate model
    model_lgbm = LGBMRegressor(**lgbm_param)
    model_lgbm = model_lgbm.fit(X_train, y_train, eval_set=[(X_val, y_val)], 
                           )
                           
    # * 평기 지표이다.
    # 원하는 평가 지표에 따라 사용하면 된다.                         
    MAE = mean_absolute_error(y_val, model_lgbm.predict(X_val))
    return MAE

optuna_lgbm = optuna.create_study(direction='minimize', sampler=sampler)

# * n_trials의 경우 optuna를 몇번 실행하여 hyper parameter를 찾을 것인지를 정한다.
# 50으로 설정해도 유의미한 값이 나온다.
optuna_lgbm.optimize(objective, n_trials=100)

[I 2023-10-26 19:39:33,941] A new study created in memory with name: no-name-3c419312-8d48-4a24-8f12-68efa3bbe4df
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
  'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 1.0),
  'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 10.0),
  'learning_rate': trial.suggest_loguniform("learning_rate", 1e-8, 1e-2),
  'subsample': trial.suggest_loguniform('subsample', 0.4, 1),
[I 2023-10-26 19:39:37,155] Trial 0 finished with value: 79.98095765257591 and parameters: {'num_leaves': 17, 'colsample_bytree': 0.9852142919229748, 'reg_alpha': 0.7319939418114051, 'reg_lambda': 5.986584841970366, 'max_depth': 5, 'learning_rate': 8.62913219007185e-08, 'n_estimators': 268, 'min_child_samples': 88, 'subsample': 0.6938533737439828}. Best is trial 0 with value: 79.98095765257591.
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
  'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 1.0),
  'reg_lambda': 

In [38]:
lgbm_trial = optuna_lgbm.best_trial
lgbm_trial_params = lgbm_trial.params
print('Best Trial: score {},\nparams {}'.format(lgbm_trial.value, lgbm_trial_params))

Best Trial: score 51.995740803557055,
params {'num_leaves': 1009, 'colsample_bytree': 0.9999456145376505, 'reg_alpha': 0.5249384735373511, 'reg_lambda': 7.513652050945574, 'max_depth': 14, 'learning_rate': 0.009529301960336817, 'n_estimators': 990, 'min_child_samples': 15, 'subsample': 0.970568917121908}


In [7]:
lgbm_trial = optuna_lgbm.best_trial
lgbm_trial_params = lgbm_trial.params
print('Best Trial: score {},\nparams {}'.format(lgbm_trial.value, lgbm_trial_params))

Best Trial: score 51.19124792593405,
params {'num_leaves': 704, 'colsample_bytree': 0.8741346064818083, 'reg_alpha': 0.06678417604777233, 'reg_lambda': 6.182491934564087, 'max_depth': 15, 'learning_rate': 0.00793088008159897, 'n_estimators': 1751, 'min_child_samples': 11, 'subsample': 0.4169490454656619}


## K-Fold Model Fitting & Validation

In [41]:
X_train = train.drop(columns='CI_HOUR')
y_train = train['CI_HOUR']
X_train_reduced = X_train
X_test_reduced = test

In [42]:
lgbm = LGBMRegressor(**lgbm_trial_params)

# 5-Fold 설정
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 각 fold의 모델로부터의 예측을 저장할 리스트와 MAE 점수 리스트
ensemble_predictions = []
scores = []

for train_idx, val_idx in tqdm(kf.split(X_train_reduced), total=5, desc="Processing folds"):
    X_t, X_val = X_train_reduced.iloc[train_idx], X_train_reduced.iloc[val_idx]
    y_t, y_val = y_train[train_idx], y_train[val_idx]
    
    # 두 모델 모두 학습
    lgbm.fit(X_t, y_t)
    
    # 각 모델로부터 Validation set에 대한 예측을 평균내어 앙상블 예측 생성
    val_pred = lgbm.predict(X_val)
    
    # Validation set에 대한 대회 평가 산식 계산 후 저장
    scores.append(mean_absolute_error(y_val, val_pred))
    
    # test 데이터셋에 대한 예측 수행 후 저장
    lgbm_pred = lgbm.predict(X_test_reduced)
    lgbm_pred = np.where(lgbm_pred < 0, 0, lgbm_pred)
    
    ensemble_predictions.append(lgbm_pred)

# K-fold 모든 예측의 평균을 계산하여 fold별 모델들의 앙상블 예측 생성
final_predictions = np.mean(ensemble_predictions, axis=0)

# 각 fold에서의 Validation Metric Score와 전체 평균 Validation Metric Score출력
print("Validation : MAE scores for each fold:", scores)
print("Validation : MAE:", np.mean(scores))

Processing folds:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019314 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2914
[LightGBM] [Info] Number of data points in the train set: 313551, number of used features: 26
[LightGBM] [Info] Start training from score 62.157481


Processing folds:  20%|██        | 1/5 [00:46<03:05, 46.47s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.021287 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2916
[LightGBM] [Info] Number of data points in the train set: 313551, number of used features: 26
[LightGBM] [Info] Start training from score 61.815772


Processing folds:  40%|████      | 2/5 [01:36<02:25, 48.47s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.021749 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2914
[LightGBM] [Info] Number of data points in the train set: 313551, number of used features: 26
[LightGBM] [Info] Start training from score 61.753606


Processing folds:  60%|██████    | 3/5 [02:27<01:39, 49.54s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.021798 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2911
[LightGBM] [Info] Number of data points in the train set: 313551, number of used features: 26
[LightGBM] [Info] Start training from score 61.959403


Processing folds:  80%|████████  | 4/5 [03:18<00:50, 50.20s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.021864 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2911
[LightGBM] [Info] Number of data points in the train set: 313552, number of used features: 26
[LightGBM] [Info] Start training from score 62.017911


Processing folds: 100%|██████████| 5/5 [04:09<00:00, 49.99s/it]

Validation : MAE scores for each fold: [51.00459706214756, 52.5298912443541, 52.2935035015231, 51.572641734521625, 51.63261184621404]
Validation : MAE: 51.806649077752084





## Submission

In [43]:
submit = pd.read_csv('sample_submission.csv')
submit['CI_HOUR'] = final_predictions

In [44]:
submit['DIST'] = test['DIST'].apply(lambda x: 0 if x==0 else 1)
submit['CI_HOUR'] = submit['CI_HOUR']*submit['DIST']
submit.drop(['DIST'],axis=1,inplace=True)
submit

Unnamed: 0,SAMPLE_ID,CI_HOUR
0,TEST_000000,137.171407
1,TEST_000001,21.745652
2,TEST_000002,191.331494
3,TEST_000003,126.685870
4,TEST_000004,0.000000
...,...,...
220486,TEST_220486,116.179246
220487,TEST_220487,85.581130
220488,TEST_220488,94.552352
220489,TEST_220489,0.000000


In [45]:
submit.to_csv('lgbm_testing_1.csv', index=False)

In [122]:
test = pd.read_csv('test.csv')
a = pd.read_csv('autogluon_stack3.csv')
#b = pd.read_csv('corrected_test_with_target_v2.csv')
a = a[['CI_HOUR']]
#b = b[['CI_HOUR']]
a = a.rename(columns={'CI_HOUR': 'CI_HOUR_NEW'})
#b = b.rename(columns={'CI_HOUR': 'CI_HOUR_OLD'})
test = test[['ARI_CO','ARI_PO','ATA','U_WIND', 'V_WIND','AIR_TEMPERATURE','BN']]
test = pd.concat([test,a],axis=1)
#test = pd.concat([test,b],axis=1)
#test['OLD-NEW'] = test['CI_HOUR_OLD'] - test['CI_HOUR_NEW']
test

Unnamed: 0,ARI_CO,ARI_PO,ATA,U_WIND,V_WIND,AIR_TEMPERATURE,BN,CI_HOUR_NEW
0,SG,GIW5,2020-06-18 11:58,0.37,1.63,27.1,1.587063,4.018523
1,CN,WEY7,2021-05-26 22:20,-2.79,-2.33,14.2,2.663972,14.224622
2,CN,NGG6,2019-12-16 0:09,0.04,-4.91,9.3,3.255315,25.379988
3,CA,FFM2,2015-11-16 5:30,,,,,84.764893
4,JP,QYY1,2018-10-24 1:11,,,,,0.000000
...,...,...,...,...,...,...,...,...
220486,BR,TMW2,2017-04-04 15:53,,,,,58.233528
220487,CA,GRQ5,2016-08-22 16:22,,,,,43.925217
220488,JP,VYJ1,2022-07-10 14:53,-2.87,1.22,27.4,2.405268,25.632092
220489,CN,QQW1,2020-12-28 14:38,-2.65,-0.64,8.2,2.199039,0.000000


In [64]:
# 각 ARI_CO 값에 따른 데이터프레임 저장
file_paths = []
unique_ari_co = test['ARI_CO'].unique()
test = test.sort_values('OLD-NEW')
for ari_co in unique_ari_co:
    subset_df = test[test['ARI_CO'] == ari_co]
    file_path = f"OLD-NEW_{ari_co}.csv"
    subset_df.to_csv(file_path, index=False)
    file_paths.append(file_path)

In [80]:
mapping_dict_global

{('SG', 0.37, 1.63, 27.1, 1.587063314): (Timestamp('2020-06-18 06:48:00'),
  2.6007273),
 ('JP', nan, nan, nan, nan): (Timestamp('2018-10-24 01:11:00'), 0.012787551),
 ('BR', 0.0, 0.0, 28.3, 0.0): (Timestamp('2019-03-19 03:59:00'), 0.34415573),
 ('CN', nan, nan, nan, nan): (Timestamp('2019-01-03 23:26:00'), 0.012787553),
 ('RU', nan, nan, nan, nan): (Timestamp('2018-02-09 05:30:00'), 2.3347576),
 ('CN', -4.53, 6.68, 26.6, 4.534044032): (Timestamp('2021-07-28 04:57:00'),
  0.012787512),
 ('MY', 1.04, 1.34, 27.6, 1.602700967): (Timestamp('2020-05-05 05:01:00'),
  0.012795658),
 ('TW', 0.06, -0.67, 18.2, 0.865104709): (Timestamp('2022-02-07 06:57:00'),
  0.012789183),
 ('JP', 0.44, -1.69, 16.6, 1.63411088): (Timestamp('2022-10-28 02:27:00'),
  0.012789421),
 ('CN', nan, nan, nan, nan): (Timestamp('2018-03-27 00:12:00'), 0.012787503),
 ('CN', -1.63, -9.89, -0.6, 5.238494693): (Timestamp('2020-12-17 22:39:00'),
  0.012787493),
 ('CN', nan, nan, nan, nan): (Timestamp('2016-11-10 20:18:00'), 

In [123]:
import pandas as pd

# 데이터 불러오기
#full_df = pd.read_csv("OLD-NEW.csv")
full_df = test.copy()

full_df['ATA'] = pd.to_datetime(full_df['ATA'])

# 매핑 딕셔너리 생성
'''
mapping_dict_global = {(row['ARI_PO'], row['U_WIND'], row['V_WIND'], row['AIR_TEMPERATURE'], row['BN']): (row['ATA'], row['CI_HOUR_NEW'])
                       for _, row in full_df.iterrows() if row['CI_HOUR_NEW'] <= 5}
'''
mapping_dict_global = {
    (row['ARI_PO'], row['U_WIND'], row['V_WIND'], row['AIR_TEMPERATURE'], row['BN']): (row['ATA'], row['CI_HOUR_NEW'])
    for _, row in full_df.iterrows() 
    if row['CI_HOUR_NEW'] <= 1 and not any(pd.isna([row['ARI_PO'], row['U_WIND'], row['V_WIND'], row['AIR_TEMPERATURE'], row['BN']]))
}

# 새로운 타겟 값 계산 함수
def compute_new_target_global(row):
    key = (row['ARI_PO'], row['U_WIND'], row['V_WIND'], row['AIR_TEMPERATURE'], row['BN'])
    if key in mapping_dict_global and mapping_dict_global[key][0] > row['ATA']:
        time_difference = (mapping_dict_global[key][0] - row['ATA']).total_seconds() / 3600
        return time_difference
    return None

# 새로운 타겟 값 계산
full_df['New_Target'] = full_df.apply(compute_new_target_global, axis=1)
import numpy as np
full_df['New_Target'] = np.where((full_df['New_Target'] > 2159) | full_df['New_Target'].isna(), np.nan, full_df['New_Target'])

# 결과 데이터프레임 저장
#file_path = "PostProcessed_OLD-NEW.csv"
#full_df.to_csv(file_path, index=False)

In [124]:
full_df['CI_HOUR_NEW'] = full_df['CI_HOUR_NEW'].where(full_df['New_Target'].isna(), full_df['New_Target'])
full_df

Unnamed: 0,ARI_CO,ARI_PO,ATA,U_WIND,V_WIND,AIR_TEMPERATURE,BN,CI_HOUR_NEW,New_Target
0,SG,GIW5,2020-06-18 11:58:00,0.37,1.63,27.1,1.587063,4.018523,
1,CN,WEY7,2021-05-26 22:20:00,-2.79,-2.33,14.2,2.663972,14.224622,
2,CN,NGG6,2019-12-16 00:09:00,0.04,-4.91,9.3,3.255315,25.379988,
3,CA,FFM2,2015-11-16 05:30:00,,,,,84.764893,
4,JP,QYY1,2018-10-24 01:11:00,,,,,0.000000,
...,...,...,...,...,...,...,...,...,...
220486,BR,TMW2,2017-04-04 15:53:00,,,,,58.233528,
220487,CA,GRQ5,2016-08-22 16:22:00,,,,,43.925217,
220488,JP,VYJ1,2022-07-10 14:53:00,-2.87,1.22,27.4,2.405268,25.632092,
220489,CN,QQW1,2020-12-28 14:38:00,-2.65,-0.64,8.2,2.199039,0.000000,


In [125]:
sub = pd.read_csv('sample_submission.csv')
sub['CI_HOUR'] = full_df['CI_HOUR_NEW']
sub.to_csv("SOTATEST.csv",index=False)
sub

Unnamed: 0,SAMPLE_ID,CI_HOUR
0,TEST_000000,4.018523
1,TEST_000001,14.224622
2,TEST_000002,25.379988
3,TEST_000003,84.764893
4,TEST_000004,0.000000
...,...,...
220486,TEST_220486,58.233528
220487,TEST_220487,43.925217
220488,TEST_220488,25.632092
220489,TEST_220489,0.000000
