In [None]:
!pip install pytorch-tabnet
import pandas as pd
import numpy  as np
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.model_selection import KFold

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.0-py3-none-any.whl (41 kB)
[K     |████████████████████████████████| 41 kB 542 kB/s 
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-4.0


In [None]:
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc

csv_to_parquet()


-> 메모리에 효율적인 데이터 유형을 사용하여 용량을 크게 줄이고 빠른 작업이 가능

In [None]:
def csv_to_parquet(csv_path, save_name):
    """Convert a CSV file into a Parquet file in the current directory.

    Args:
        csv_path: Location of the CSV file, passed to ``pd.read_csv``.
        save_name: Base name of the output; the data is written to
            ``./{save_name}.parquet``.

    Returns:
        None. Prints ``<save_name> Done.`` after the file has been written.
    """
    frame = pd.read_csv(csv_path)
    frame.to_parquet(f'./{save_name}.parquet')

    # Drop the only reference and force a collection so the loaded CSV does
    # not stay in memory after the conversion.
    del frame
    gc.collect()

    print(save_name, 'Done.')

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Convert both competition CSVs stored on Drive into local Parquet files
# (./train.parquet and ./test.parquet).
for csv_path, save_name in [
    ('/content/drive/MyDrive/3-2 인공지능/jeju_data/train.csv', 'train'),
    ('/content/drive/MyDrive/3-2 인공지능/jeju_data/test.csv', 'test'),
]:
    csv_to_parquet(csv_path, save_name)

train Done.
test Done.


데이터 불러오기

In [None]:
# Load the Parquet files written by csv_to_parquet() into DataFrames.
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')

### 데이터 전처리

데이터 자료형 변형

: RAM 초과 방지

In [None]:
# Columns to narrow to 32-bit types, grouped by the dtype they are cast to.
to_int32 = ["base_date", "base_hour", "lane_count", "road_rating", "multi_linked", "connect_code", "road_type"]
to_float32 = ["vehicle_restricted", "height_restricted", "maximum_speed_limit", "weight_restricted", "target"]

# Only `train` is converted here; `test` keeps the dtypes it was loaded with.
for dtype_name, column_names in (("int32", to_int32), ("float32", to_float32)):
    for column_name in column_names:
        train[column_name] = train[column_name].astype(dtype_name)

LabelEncoder

: 카테고리형 데이터를 수치형으로 변환

- (OneHotEncoder 사용하려 했지만, RAM 초과로 실행 안 됨)
```
train = pd.get_dummies(train)
test = pd.get_dummies(test)
```



In [None]:
# Columns that are replaced by integer label codes.
# NOTE(review): several of these are numeric (coordinates, speed limit, ...);
# label encoding replaces their values with rank codes -- confirm this is intended.
str_col = ['day_of_week',
           'base_hour',
           'lane_count',
           'maximum_speed_limit',
           'start_latitude',
           'start_longitude',
           'end_latitude',
           'end_longitude',
           'road_rating',
           'weight_restricted',
           'start_turn_restricted',
           'end_turn_restricted',
           'start_node_name',
           'end_node_name',
           'road_type',
           'road_name',
           'connect_code',
           'multi_linked']
for i in str_col:
    # Fit on the training column only: a label's code is its position in the
    # sorted `classes_` array.
    le = LabelEncoder()
    le = le.fit(train[i])
    train[i] = le.transform(train[i])

    # Labels that occur only in `test` receive new codes placed after every
    # training code. The extended `classes_` is no longer sorted, and
    # `le.transform` looks numeric labels up by binary search, so it would
    # return wrong codes here; an explicit position lookup is used instead.
    unseen = np.setdiff1d(np.unique(test[i]), le.classes_)
    le.classes_ = np.concatenate([le.classes_, unseen])
    test[i] = pd.Index(le.classes_).get_indexer(test[i])

id 컬럼과 값이 하나뿐인 컬럼 삭제

In [None]:
# Remove the same columns from both frames so their feature sets stay aligned.
unused_columns = ['id', 'vehicle_restricted', 'height_restricted']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

target 값 속도 100km/h 이상 제거

속도 EDA 부분을 봤을 때 최고 제한속도가 80km/h이기 때문에, 차이가 심하게 나는 극단치(100km/h 이상)만 제거

In [None]:
# Keep only rows whose target is strictly below 100 (rows at 100 or above are dropped).
train = train[train.target<100]

In [None]:
# Regression target, taken after the outlier filter so it matches the remaining rows.
y = train["target"]

In [None]:
# `train` now holds features only; the target was copied into `y` beforehand.
train = train.drop(columns=['target'])

Data Scaling

- StandardScaler() : best_val_0_mse = 0.13558

- MinMaxScaler(): best_val_0_mse = 34.38201904296875

- MaxAbsScaler() : best_val_0_mse = 31.513229370117188

- RobustScaler() :  best_val_0_mse = 33.38386917114258

위 결과에 따라 MaxAbsScaler 사용 (StandardScaler의 0.13558은 나머지 값들과 규모가 크게 달라 같은 기준으로 측정된 값인지 확인 필요)

In [None]:
from sklearn.preprocessing import MaxAbsScaler

# Fit the scaler on the training features only, then apply the same scaling
# to both frames.
mas = MaxAbsScaler()
mas.fit(train)

# Build new float frames instead of assigning through `.loc[:, :]`: the
# label-encoded columns are integer-typed, and writing scaled floats into them
# in place depends on implicit upcasting that pandas has deprecated.
train = pd.DataFrame(mas.transform(train), index=train.index, columns=train.columns)
test = pd.DataFrame(mas.transform(test), index=test.index, columns=test.columns)

In [None]:
# X_test refers to the same object as `test` (no copy is made).
X_test = test

In [None]:
# Display the scaled test features.
X_test

Unnamed: 0,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,weight_restricted,road_type,start_node_name,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted
0,1.000005,0.166667,0.739130,1.0,1.0,0.533333,0.0,0.0,0.8,0.0,0.0,0.335391,0.880342,0.577778,0.0,0.829218,0.895726,0.593162,1.0
1,1.000004,1.000000,0.521739,0.5,0.0,0.583333,0.0,0.0,0.8,0.0,1.0,0.847737,0.155556,0.280342,0.0,0.045267,0.153846,0.254701,0.0
2,1.000004,0.000000,0.086957,0.0,0.0,0.600000,0.0,0.0,0.6,0.0,0.0,0.199588,0.160684,0.376068,0.0,0.195473,0.164103,0.376068,0.0
3,1.000004,0.166667,1.000000,1.0,0.0,0.566667,0.0,0.0,0.8,0.0,0.0,0.539095,0.726496,0.596581,0.0,0.720165,0.712821,0.596581,0.0
4,1.000004,0.333333,0.739130,1.0,0.5,0.200000,0.0,0.0,0.8,0.0,0.0,0.314815,0.897436,0.685470,0.0,0.298354,0.870085,0.707692,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291236,1.000005,0.833333,0.217391,0.0,0.0,0.600000,0.0,0.0,1.0,0.0,0.0,0.693416,0.723077,0.358974,0.0,0.185185,0.705983,0.357265,0.0
291237,1.000004,0.000000,0.869565,0.5,0.0,0.566667,0.0,0.0,0.6,0.0,0.0,0.590535,0.309402,0.733333,0.0,0.927984,0.304274,0.736752,0.0
291238,1.000004,0.000000,0.478261,0.0,1.0,0.383333,0.0,0.0,0.0,0.0,0.0,0.244856,0.842735,0.437607,1.0,0.899177,0.873504,0.441026,1.0
291239,1.000004,0.000000,0.304348,0.5,1.0,0.016667,0.0,0.0,0.6,0.0,0.0,0.082305,0.114530,0.454701,0.0,0.711934,0.129915,0.458120,0.0


In [None]:
# X refers to the same object as `train` (no copy is made).
X = train

In [None]:
# Switch from pandas objects to NumPy arrays; the target becomes a
# single-column 2-D array of shape (n_rows, 1).
X, X_test = X.to_numpy(), X_test.to_numpy()
y = y.to_numpy().reshape(-1, 1)

KFold를 사용해서 Train과 Validation 나눠주기

In [None]:
kf = KFold(n_splits=5, random_state=42, shuffle=True)
# Accumulators for per-fold results; nothing in this cell fills them.
predictions_array =[]
CV_score_array    =[]

# Only one train/validation split is used for training, so take the final
# fold (the one a full loop over kf.split(X) would leave behind) and slice the
# arrays once instead of re-copying them for every fold.
train_index, test_index = list(kf.split(X))[-1]
X_train, X_valid = X[train_index], X[test_index]
y_train, y_valid = y[train_index], y[test_index]

### 모델 선언 및 학습

파라미터 튜닝해주기

In [None]:
# Hyperparameters passed to the TabNet regressor, collected in one place.
tabnet_params = {
    "n_steps": 3,
    "gamma": 1.3,
    "optimizer_params": {"lr": 2e-2},
    "mask_type": 'entmax',
    "lambda_sparse": 0,
}
clf = TabNetRegressor(**tabnet_params)

TabNet 모델 사용

In [None]:
from pytorch_tabnet.tab_model import TabNetRegressor

# Train the `clf` that was configured in the hyperparameter cell. Creating a
# fresh `TabNetRegressor()` here would silently replace those settings with
# the library defaults.
clf.fit(
  X_train, y_train,
  # The validation split drives early stopping (`patience` epochs without improvement).
  eval_set=[(X_valid, y_valid)],
  batch_size=1024, virtual_batch_size=128,
  max_epochs=200, patience=10
)



epoch 0  | loss: 74.99986| val_0_mse: 46.47909927368164|  0:03:28s
epoch 1  | loss: 48.02594| val_0_mse: 41.7305793762207|  0:07:41s
epoch 2  | loss: 45.25614| val_0_mse: 40.704261779785156|  0:11:35s
epoch 3  | loss: 44.18535| val_0_mse: 39.683109283447266|  0:16:35s
epoch 4  | loss: 43.5769 | val_0_mse: 40.272361755371094|  0:20:49s
epoch 5  | loss: 43.16204| val_0_mse: 39.282318115234375|  0:24:23s
epoch 6  | loss: 42.82725| val_0_mse: 38.48244094848633|  0:27:44s
epoch 7  | loss: 42.47354| val_0_mse: 38.47930145263672|  0:31:05s
epoch 8  | loss: 42.24093| val_0_mse: 37.78300094604492|  0:34:29s
epoch 9  | loss: 41.9733 | val_0_mse: 37.48252868652344|  0:37:45s
epoch 10 | loss: 41.79403| val_0_mse: 37.782039642333984|  0:41:03s
epoch 11 | loss: 41.65425| val_0_mse: 37.7052001953125|  0:44:26s
epoch 12 | loss: 41.50367| val_0_mse: 37.81813049316406|  0:47:34s
epoch 13 | loss: 41.3439 | val_0_mse: 37.05210876464844|  0:50:42s
epoch 14 | loss: 41.28077| val_0_mse: 37.053218841552734|  



### 추론

In [None]:
# Predict targets for the scaled test features with the trained model.
preds = clf.predict(X_test)

In [None]:
# Fill the submission template's `target` column with the predictions and
# save it without the DataFrame index.
sample_submission = pd.read_csv('./sample_submission.csv').assign(target=preds)
sample_submission.to_csv("./submit_deep_max.csv", index = False)