In [1]:
import pandas as pd

In [2]:
# Read the training table from disk and preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer='train.csv')
dataset.head(n=5)

Unnamed: 0,id,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_t31,frp,daynight
0,80378,-16.051,141.996,322.8,2.0,1.4,2019-12-03,1335,Terra,MODIS,100,6.0NRT,298.5,42.8,N
1,79962,-32.855,150.711,335.0,1.2,1.1,2019-12-03,405,Aqua,MODIS,84,6.0NRT,305.4,29.1,D
2,9680,-12.216,132.732,346.1,1.4,1.2,2019-10-12,435,Aqua,MODIS,90,6.0NRT,312.3,52.1,D
3,61999,-32.991,150.507,358.8,1.0,1.0,2019-11-19,350,Aqua,MODIS,99,6.0NRT,313.7,68.8,D
4,44632,-12.938,136.14,324.4,1.0,1.0,2019-11-10,1330,Terra,MODIS,100,6.0NRT,295.0,21.8,N


In [3]:
# Remove the 'id' and 'version' columns, then preview the remaining columns.
dataset = dataset.drop(['id', 'version'], axis='columns')
dataset.head(n=5)

Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,bright_t31,frp,daynight
0,-16.051,141.996,322.8,2.0,1.4,2019-12-03,1335,Terra,MODIS,100,298.5,42.8,N
1,-32.855,150.711,335.0,1.2,1.1,2019-12-03,405,Aqua,MODIS,84,305.4,29.1,D
2,-12.216,132.732,346.1,1.4,1.2,2019-10-12,435,Aqua,MODIS,90,312.3,52.1,D
3,-32.991,150.507,358.8,1.0,1.0,2019-11-19,350,Aqua,MODIS,99,313.7,68.8,D
4,-12.938,136.14,324.4,1.0,1.0,2019-11-10,1330,Terra,MODIS,100,295.0,21.8,N


In [4]:
# acq_time is left-padded with zeros to four characters so it parses as %H%M
# (e.g. 405 -> '0405'), then joined to the date string and parsed as a timestamp.
acq_time_hhmm = dataset['acq_time'].astype(str).str.zfill(4)
dataset['acq_datetime'] = pd.to_datetime(
    dataset['acq_date'] + ' ' + acq_time_hhmm,
    format='%Y-%m-%d %H%M',
)

In [5]:
# Derive year/month/day/hour columns from the parsed timestamp, then discard
# the raw date/time columns and the intermediate timestamp itself.
dataset = (
    dataset
    .assign(
        year=lambda frame: frame['acq_datetime'].dt.year,
        month=lambda frame: frame['acq_datetime'].dt.month,
        day=lambda frame: frame['acq_datetime'].dt.day,
        hour=lambda frame: frame['acq_datetime'].dt.hour,
    )
    .drop(columns=['acq_date', 'acq_time', 'acq_datetime'])
)

In [6]:
# 'confidence' is the regression target; every other column is a candidate feature.
y_train = dataset['confidence']
x_train = dataset.drop(['confidence'], axis='columns')

In [8]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [9]:
# Columns treated as numeric: location, sensor readings and the date/time parts.
numeric_features = [
    'latitude', 'longitude',
    'brightness', 'scan', 'track', 'bright_t31', 'frp',
    'year', 'month', 'day', 'hour',
]

# Numeric pipeline: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

In [10]:
# Columns treated as categorical.
categorical_features = [
    'satellite',
    'instrument',
    'daynight',
]

# Categorical pipeline: replace missing values with the literal 'missing', then
# one-hot encode; handle_unknown='ignore' is set so unseen categories do not raise.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [11]:
# Route each column group through its own pipeline.
# NOTE(review): `remainder` is not set, so columns outside the two feature lists
# fall under ColumnTransformer's default handling — confirm that is intended.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [12]:
# Fit the preprocessing steps on the training features and transform them in one pass.
x_train_preprocessed = preprocessor.fit_transform(X=x_train)

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [163]:
# Hold out 20% of the preprocessed rows for validation, with a fixed seed.
# NOTE(review): x_train_preprocessed comes from a preprocessor fitted on all rows
# before this split, so validation rows contributed to the imputation/scaling
# statistics — confirm this is acceptable for the reported validation scores.
x_train_split, x_val_split, y_train_split, y_val_split = train_test_split(
    x_train_preprocessed,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Commented-out alternatives kept for reference:
#   RandomForestRegressor(n_estimators=1000, random_state=42)
#   XGBRegressor(n_estimators=1000, n_jobs=-1, random_state=42)
lgbm_params = dict(
    n_estimators=1000,
    n_jobs=-1,
    random_state=42,
    force_col_wise=True,
    # regularisation
    reg_alpha=1e-2,
    reg_lambda=1e-6,
    min_split_gain=0.01,
    # tree shape
    max_depth=50,
    num_leaves=100,
    # row / column subsampling
    colsample_bytree=0.8,
    subsample=0.8,
    subsample_freq=1,
    metric='rmse',
)
model = LGBMRegressor(**lgbm_params)

model.fit(x_train_split, y_train_split)

[LightGBM] [Info] Total Bins 1393
[LightGBM] [Info] Number of data points in the train set: 117499, number of used features: 15
[LightGBM] [Info] Start training from score 75.049932


In [164]:
# Predict the target for the held-out validation rows.
y_val_pred = model.predict(X=x_val_split)

In [165]:
# Score the validation predictions against the held-out targets and report both metrics.
mse = mean_squared_error(y_true=y_val_split, y_pred=y_val_pred)
r2 = r2_score(y_true=y_val_split, y_pred=y_val_pred)
print(
    f'Mean Squared Error: {mse}',
    f'R^2 Score: {r2}',
    sep='\n',
)

Mean Squared Error: 180.50166276355841
R^2 Score: 0.7105052490811998


In [15]:
# Read the test table from disk.
test_dataset = pd.read_csv(filepath_or_buffer='test.csv')

In [16]:
# Keep the ids aside for the output file before the column is removed.
test_ids = test_dataset['id']

# Drop the same two columns that were removed from the training table.
test_dataset = test_dataset.drop(['version', 'id'], axis='columns')

In [17]:
# Apply the same date/time feature engineering as the training table: parse a
# timestamp from the date plus zero-padded HHMM time, expand it into
# year/month/day/hour, then drop the raw and intermediate columns.
test_dataset = (
    test_dataset
    .assign(
        acq_datetime=lambda frame: pd.to_datetime(
            frame['acq_date'] + ' ' + frame['acq_time'].astype(str).str.zfill(4),
            format='%Y-%m-%d %H%M',
        ),
        year=lambda frame: frame['acq_datetime'].dt.year,
        month=lambda frame: frame['acq_datetime'].dt.month,
        day=lambda frame: frame['acq_datetime'].dt.day,
        hour=lambda frame: frame['acq_datetime'].dt.hour,
    )
    .drop(columns=['acq_date', 'acq_time', 'acq_datetime'])
)

In [18]:
# Transform only (no refit): the test rows go through the already-fitted preprocessor.
x_test_preprocessed = preprocessor.transform(X=test_dataset)

In [166]:
# Predict the target for the preprocessed test rows.
y_test_pred = model.predict(X=x_test_preprocessed)

In [167]:
# Build the output table (one row per test id) and write it to predictions.csv.
# The regressor's output is unbounded, but the target is a detection confidence
# expressed as a percentage, so predictions are clipped to [0, 100] before export.
# NOTE(review): the 0-100 range follows the MODIS/FIRMS attribute definition and
# matches the sample rows shown earlier — confirm against the full training data.
predictions = pd.DataFrame({
    'id': test_ids,
    'confidence': y_test_pred.clip(0, 100),
})
predictions.to_csv('predictions.csv', index=False)