In [1]:
# Make Google Drive visible inside the Colab runtime; the CSV files read
# later in this notebook live under the mounted directory.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from datetime import datetime

import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier

# Data preprocessing

In [3]:
# Load the train / test CSV files from the mounted Drive folder.
train_path = "/content/drive/MyDrive/aigames_data/train_wind_clean_v4.csv"
test_path = "/content/drive/MyDrive/aigames_data/test_wind_clean_v4.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

### Aliasing
Routes that share the same procedure are merged under a single `route_id`.

In [4]:
# Maps a route_id (key) to the route_id it should be replaced with (value).
alias = {
    'NORMY':'ALPSE',
    'PAE': 'ALPSE',
    'SEA7 SEA': 'ALPSE',
    'SUMMA2 LKV': 'ALPSE',
    'SUMMA2 SUMMA': 'ALPSE',
    'OZWLD1 OZWLD': 'JEFPO1 JEFPO',
    'ELMAA3 CVO': 'ELMAA3 ELMAA'
}


def apply_aliasing(df: pd.DataFrame) -> pd.DataFrame:
    """Replace aliased ``route_id`` values with their canonical route.

    Every ``route_id`` that is a key of ``alias`` is rewritten to the mapped
    value; all other values (including missing ones) are left untouched.

    The column is rewritten as a whole instead of cell by cell through
    ``df.at``: a label-based scalar write hits *every* row carrying that
    index label, so frames with a non-unique index got rows overwritten.

    Args:
        df: Frame with a ``route_id`` column. It is modified in place.

    Returns:
        The same ``df`` object, so ``df = apply_aliasing(df)`` keeps working.
    """
    df['route_id'] = df['route_id'].map(lambda route: alias.get(route, route))
    return df


In [5]:
# Collapse aliased route ids in both splits (train first, then test).
train_df, test_df = (apply_aliasing(frame) for frame in (train_df, test_df))

### Encode time

In [6]:
def day_idf(y, m, d):
    """Return a running day index that continues from 2020 into 2021.

    The result is the 1-based day of the year, plus 366 (the length of the
    leap year 2020) when the date falls in 2021. Dates in any other year
    get their plain day of the year.

    Args:
        y: Year.
        m: Month (1-12).
        d: Day of the month.

    Returns:
        int: The day index described above.

    Raises:
        ValueError: If ``(y, m, d)`` is not a valid calendar date.
    """
    # The offset must be computed on its own: a bare
    # ``a + 1 + 366 if cond else 0`` applies the conditional to the whole sum
    # and yields 0 for every year other than 2021.
    year_offset = 366 if y == 2021 else 0
    return (datetime(y, m, d) - datetime(y, 1, 1)).days + 1 + year_offset


def day_of_year(y, m, d):
    """Return the 1-based ordinal of the date within its year (Jan 1 -> 1)."""
    return datetime(y, m, d).timetuple().tm_yday


def day_of_week(y, m, d):
    """Return the weekday of the date as an int, Monday = 0 ... Sunday = 6."""
    moment = datetime(year=y, month=m, day=d)
    # isoweekday() counts Monday as 1, so shift down to the 0-based scheme.
    return moment.isoweekday() - 1


def add_date_time(df: pd.DataFrame) -> pd.DataFrame:
    """Truncate ``timestamp`` to the hour and split it into date / time columns.

    Each timestamp string keeps its first 14 characters and gets
    ``"00:00+00:00"`` appended. Two columns are then derived from the part
    before the ``+``: ``date`` (text before the space) and ``time`` (text
    after the space).

    The ``timestamp`` column is rebuilt as a whole instead of cell by cell
    through ``df.at``: a label-based scalar write hits every row carrying
    that index label, so rows sharing a label used to overwrite each other.

    Args:
        df: Frame with a string ``timestamp`` column. It is modified in place.

    Returns:
        The same ``df`` object with ``timestamp`` rewritten and ``date`` /
        ``time`` added.
    """
    df["timestamp"] = [stamp[:14] + "00:00+00:00" for stamp in df["timestamp"]]

    # Split each timestamp once and reuse the pieces for both derived columns.
    date_time_parts = [stamp.split("+")[0].split(" ") for stamp in df["timestamp"]]
    df["date"] = [parts[0] for parts in date_time_parts]
    df["time"] = [parts[1] for parts in date_time_parts]

    return df


def encode_date(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``day_of_week`` and ``day_id`` columns derived from ``date``.

    Each ``date`` value is split on ``-`` into integer year, month and day,
    which are passed to ``day_of_week`` and ``day_idf`` respectively.

    Args:
        df: Frame with a ``date`` column of ``YYYY-MM-DD`` strings. It is
            modified in place.

    Returns:
        The same ``df`` object with the two new columns.
    """
    weekday_column = []
    day_id_column = []

    for stamp in df["date"].values:
        year, month, day = (int(part) for part in stamp.split("-"))
        weekday_column.append(day_of_week(year, month, day))
        day_id_column.append(day_idf(year, month, day))

    df["day_of_week"] = weekday_column
    df["day_id"] = day_id_column

    return df


def encode_time(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``time_float`` column holding ``time`` as fractional hours.

    Args:
        df: Frame with a ``time`` column of ``HH:MM:SS`` strings. It is
            modified in place.

    Returns:
        The same ``df`` object with the new ``time_float`` column.
    """
    def _as_hours(clock):
        # "HH:MM:SS" -> hours + minutes/60 + seconds/3600
        hh, mm, ss = (float(part) for part in clock.split(":"))
        return hh + mm / 60.0 + ss / 60.0 / 60.0

    df["time_float"] = [_as_hours(clock) for clock in df["time"]]
    return df


In [7]:
# Derive the date / time features for both splits. Only the training frame
# is additionally ordered by date.
train_df = (
    train_df
    .pipe(add_date_time)
    .pipe(encode_date)
    .pipe(encode_time)
    .sort_values(by="date")
)

test_df = (
    test_df
    .pipe(add_date_time)
    .pipe(encode_date)
    .pipe(encode_time)
)

### Encode labels

In [8]:
# Integer-encode the categorical columns. Each encoder is fitted on the
# training split and then reused, unchanged, on the test split.
label_encoders = []
labels = ['route_id', 'skyc1', 'airport', 'route_type']

for label in labels:
    le = LabelEncoder()
    train_df[label] = le.fit_transform(train_df[label])
    # transform() raises ValueError for a category not seen during fitting.
    test_df[label] = le.transform(test_df[label])
    # Keep the fitted encoder (same order as `labels`); the list was previously
    # left empty, so the codes could never be mapped back to their categories.
    label_encoders.append(le)


### Remove unnecessary columns

In [9]:
# Columns removed from both splits before training / prediction.
columns_to_drop = [
    "timestamp", "date", "time", "day_id", "angle_diff", "direction",
    "tmpf", "dwpf", "relh", "drct", "sknt", "p01i",
]

train_df = train_df.drop(columns=columns_to_drop)
test_df = test_df.drop(columns=columns_to_drop)

# Training

In [10]:
# Fixed seed so the shuffle and the model are reproducible across runs.
SEED = 42

# shuffle
train_df = train_df.sample(frac=1, random_state=SEED)

# split labels: `status` is the target, every remaining column is a feature
X_train = train_df.drop(['status'], axis=1)
y_train = train_df['status']

# normalization: the scaler is fitted on the training features only and is
# reused as-is for the test features later on
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# build model
xgb = XGBClassifier(
    objective='binary:logistic', max_depth=9,
    n_estimators=180, learning_rate=0.1,
    random_state=SEED
)

# train
xgb = xgb.fit(X_train, y_train)

# Prediction

In [12]:
# Scale the test features with the scaler fitted on the training data.
# NOTE(review): test_df still contains `observation_id` at this point, so it
# is scaled and passed to the model as a feature -- confirm this is intended.
X_test = scaler.transform(test_df)

# Predict one class per test row.
y_test = xgb.predict(X_test)

# Translate the predicted class into the submission vocabulary:
# 1 -> "OPEN", anything else -> "CLSD".
status = ["CLSD" if prediction != 1 else "OPEN" for prediction in y_test]

# Write the submission file.
result = pd.DataFrame({
    'observation_id': test_df["observation_id"],
    'status': status,
})
result.to_csv('result.csv', index=False)