In [12]:
import pandas as pd
import numpy as np

from xgboost import XGBClassifier

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score, classification_report

import warnings
warnings.filterwarnings("ignore")

In [6]:
necessary_col = ['windspeedKmph', 'winddirDegree','precipMM',
       'visibility', 'pressure', 'cloudcover', 'DewPointF', 'WindGustKmph',
       'tempF', 'WindChillF', 'humidity', 'ArrDel15']

In [7]:
data = pd.read_csv('../cleaned_data.csv')
Df = data[necessary_col]

In [8]:
Df.iloc[:, 5:17] = Df.iloc[:, 5:17].astype('float32')
Df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1851422 entries, 0 to 1851421
Data columns (total 12 columns):
 #   Column         Dtype  
---  ------         -----  
 0   windspeedKmph  int64  
 1   winddirDegree  int64  
 2   precipMM       float64
 3   visibility     int64  
 4   pressure       int64  
 5   cloudcover     float32
 6   DewPointF      float32
 7   WindGustKmph   float32
 8   tempF          float32
 9   WindChillF     float32
 10  humidity       float32
 11  ArrDel15       float32
dtypes: float32(7), float64(1), int64(4)
memory usage: 120.1 MB


In [9]:
Df.head()

Unnamed: 0,windspeedKmph,winddirDegree,precipMM,visibility,pressure,cloudcover,DewPointF,WindGustKmph,tempF,WindChillF,humidity,ArrDel15
0,15,123,0.0,10,1028,0.0,17.0,26.0,29.0,21.0,59.0,0.0
1,15,123,0.0,10,1028,0.0,17.0,26.0,29.0,21.0,59.0,0.0
2,15,38,0.0,10,1020,0.0,29.0,17.0,59.0,58.0,39.0,0.0
3,15,38,0.0,10,1020,0.0,29.0,17.0,59.0,58.0,39.0,0.0
4,15,38,0.0,10,1020,0.0,29.0,17.0,59.0,58.0,39.0,0.0


In [14]:
X_train, X_test, y_train, y_test = train_test_split(Df.iloc[:, :-1], Df.iloc[:, -1], train_size=0.8, stratify=Df.iloc[:, -1].values, random_state=42)


In [15]:
X_train.shape, y_train.shape

((1481137, 11), (1481137,))

In [16]:
y_train.value_counts()

0.0    1170693
1.0     310444
Name: ArrDel15, dtype: int64

In [38]:
# Negative to positive ratio
ratio = 1170693/310444
ratio

3.7710279470693586

In [53]:
xgb = XGBClassifier(max_delta_step=10, n_estimators=400, colsample_bytree= 0.8, gamma= 0,learning_rate= 0.6, max_depth= 10,reg_lambda= 15,scale_pos_weight= 3.77, subsample= 0.8)


In [94]:
from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state=23)
X_resampled, y_resampled = ros.fit_resample(X_train.values, y_train.values)

In [95]:
np.bincount(y_resampled.astype('int'))

array([310444, 310444])

In [105]:
sc = StandardScaler()
X_train_processed = sc.fit_transform(X_train)
X_train_processed[0]

array([-0.51030525, -0.14872616, -0.26053836,  0.30915351,  0.92716028,
       -1.07393088,  0.53464748, -0.296085  ,  0.15856761,  0.21498097,
        0.24999958])

In [None]:
%%time
model = xgb.fit(X_train_processed ,y_train.values)

In [98]:
X_test_processed = sc.transform(X_test)
X_test_processed[0]

array([-0.85954868,  0.48731847, -0.28385724,  0.32373772, -0.7532913 ,
        1.4239963 , -0.78668725, -0.34399134, -1.07327025, -0.99849695,
        0.20458912])

In [99]:
prediction = model.predict(X_test_processed)

In [100]:
prediction[:100]

array([0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0])

In [102]:
y_test.values[:100]

array([1., 1., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1.,
       1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0.],
      dtype=float32)

In [103]:
accuracy_score(prediction, y_test.values )

0.37050650174865307

In [104]:
f1_score(prediction, y_test.values )

0.37475657319434974

In [75]:
print(classification_report(prediction, y_test))

              precision    recall  f1-score   support

           0       0.30      0.89      0.45     98906
           1       0.86      0.25      0.38    271379

    accuracy                           0.42    370285
   macro avg       0.58      0.57      0.42    370285
weighted avg       0.71      0.42      0.40    370285

