# Data preprocessing

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.preprocessing import minmax_scale, PolynomialFeatures

In [5]:
from sklearn.metrics import classification_report

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [7]:
def preprocess_numerical_columns(df, columns):
    new_cols = []
    pf = PolynomialFeatures(2)
    for col in columns:
        value = minmax_scale(df[[col]])
        df.drop(columns=col, inplace=True)
        poly_value = pf.fit_transform(value)
        for i in range(poly_value.shape[1]):
            df[f'{col}_{i}'] = poly_value[:, i]
            new_cols.append(f'{col}_{i}')
    return new_cols

In [8]:
def encode_date(data, columns, max_values):
    for col, max_val in zip(columns, max_values):
        data[col + '_sin'] = np.sin(2 * np.pi * data[col] / max_val)
        data[col + '_cos'] = np.cos(2 * np.pi * data[col] / max_val)
    data.drop(columns=col, inplace=True)
    return data

In [9]:
data = pd.read_csv("weatherAUS.csv")

In [10]:
data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [11]:
data.isna().sum().sort_values() / data.shape[0]

Date             0.000000
Location         0.000000
MaxTemp          0.008669
MinTemp          0.010209
Temp9am          0.012148
WindSpeed9am     0.012148
Humidity9am      0.018246
WindSpeed3pm     0.021050
Rainfall         0.022419
RainToday        0.022419
RainTomorrow     0.022460
Temp3pm          0.024811
WindDir3pm       0.029066
Humidity3pm      0.030984
WindGustSpeed    0.070555
WindGustDir      0.070989
WindDir9am       0.072639
Pressure3pm      0.103314
Pressure9am      0.103568
Cloud9am         0.384216
Cloud3pm         0.408071
Evaporation      0.431665
Sunshine         0.480098
dtype: float64

In [12]:
data = data.dropna()

In [13]:
data['Date'] = pd.to_datetime(data['Date'])
data['year'] = data.Date.dt.year
data['month'] = data.Date.dt.month
data['day'] = data.Date.dt.day
data.drop(columns='Date', inplace=True)

In [14]:
data = encode_date(data, ['day', 'month', 'year'], [365, 12, 2021])

In [15]:
data['RainToday'] = data['RainToday'].apply(lambda x: 0 if x == 'No' else 1)

In [16]:
data['RainTomorrow'] = data['RainTomorrow'].apply(lambda x: 0 if x == 'No' else 1)

In [17]:
categorical_columns = ["Location", "WindGustDir", "WindDir9am", "WindDir3pm"]

data = pd.get_dummies(data, columns=categorical_columns)

In [18]:
numerical_columns = [
    "MinTemp",
    "MaxTemp",
    "Rainfall",
    "WindGustSpeed",
    "WindSpeed9am",
    "WindSpeed3pm",
    "Humidity9am",
    "Humidity3pm",
    "Pressure9am",
    "Pressure3pm",
    "Temp9am",
    "Temp3pm",
]

In [19]:
corr =  data[numerical_columns].corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm
MinTemp,1.0,0.749687,0.1108,0.118054,0.108522,0.137273,-0.174991,0.070827,-0.475661,-0.495577,0.906295,0.727222
MaxTemp,0.749687,1.0,-0.069981,0.035531,-0.008171,0.012266,-0.495781,-0.448116,-0.35098,-0.451691,0.893387,0.984841
Rainfall,0.1108,-0.069981,1.0,0.106308,0.050584,0.044112,0.263625,0.277625,-0.180606,-0.137302,0.013713,-0.074627
WindGustSpeed,0.118054,0.035531,0.106308,1.0,0.608852,0.685236,-0.19341,-0.042653,-0.430363,-0.383683,0.08552,-0.000382
WindSpeed9am,0.108522,-0.008171,0.050584,0.608852,1.0,0.502226,-0.236795,-0.058449,-0.201518,-0.155484,0.053749,-0.018357
WindSpeed3pm,0.137273,0.012266,0.044112,0.685236,0.502226,1.0,-0.100626,0.031843,-0.293155,-0.252095,0.114043,-0.009436
Humidity9am,-0.174991,-0.495781,0.263625,-0.19341,-0.236795,-0.100626,1.0,0.685697,0.114575,0.172972,-0.423598,-0.487758
Humidity3pm,0.070827,-0.448116,0.277625,-0.042653,-0.058449,0.031843,0.685697,1.0,-0.063454,0.024109,-0.151614,-0.497245
Pressure9am,-0.475661,-0.35098,-0.180606,-0.430363,-0.201518,-0.293155,0.114575,-0.063454,1.0,0.961538,-0.44341,-0.310774
Pressure3pm,-0.495577,-0.451691,-0.137302,-0.383683,-0.155484,-0.252095,0.172972,0.024109,0.961538,1.0,-0.500509,-0.421318


In [20]:
corr_var = set([numerical_columns[x] for x, y in zip(*np.where(corr.abs() > 0.8)) if x != y and x < y])

In [21]:
data.drop(columns=list(corr_var), inplace=True)

In [22]:
numerical_columns = list(set(numerical_columns) - corr_var)

In [23]:
numerical_columns = preprocess_numerical_columns(data, numerical_columns)

In [24]:
answer = data['RainTomorrow']
data.drop(columns='RainTomorrow', inplace=True)

In [25]:
x_train, x_test, y_train, y_test = train_test_split(data, answer, test_size=0.25, shuffle=False)

# GridSearchCV

In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [27]:
grid = {"C": np.logspace(-3, 3, 7), "penalty": ["l1", "l2"]}

logreg = LogisticRegression()
logreg_cv = GridSearchCV(logreg, grid, cv=10, scoring='roc_auc', n_jobs=-1, verbose=1)
logreg_cv.fit(x_train, y_train)

print("Best parameters: ", logreg_cv.best_params_)
print("Metric: ", logreg_cv.best_score_)

Fitting 10 folds for each of 14 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.5s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:   58.0s finished


Best parameters:  {'C': 0.01, 'penalty': 'l2'}
Metric:  0.8556448688850671


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
clf = LogisticRegression(**{'C': 0.01, 'penalty': 'l2'})

In [29]:
clf.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=0.01)

In [30]:
print(classification_report(y_test, clf.predict(x_test)))

              precision    recall  f1-score   support

           0       0.88      0.97      0.92     11427
           1       0.77      0.45      0.57      2678

    accuracy                           0.87     14105
   macro avg       0.82      0.71      0.74     14105
weighted avg       0.86      0.87      0.86     14105



In [31]:
grid = {'var_smoothing': np.logspace(0, -9, num=100)}

nb = GaussianNB()
nb = GridSearchCV(nb, grid, cv=10, scoring='roc_auc', n_jobs=-1, verbose=1)
nb.fit(x_train, y_train)

print("Best parameters: ", nb.best_params_)
print("Metric: ", nb.best_score_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   18.0s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   47.8s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  1.9min finished


Best parameters:  {'var_smoothing': 0.001}
Metric:  0.8207790164306777


In [32]:
nb = GaussianNB(**{'var_smoothing': 0.001})
nb.fit(x_train, y_train)

GaussianNB(var_smoothing=0.001)

In [33]:
print(classification_report(y_test, nb.predict(x_test)))

              precision    recall  f1-score   support

           0       0.91      0.90      0.91     11427
           1       0.59      0.64      0.62      2678

    accuracy                           0.85     14105
   macro avg       0.75      0.77      0.76     14105
weighted avg       0.85      0.85      0.85     14105



In [34]:
grid = {'n_neighbors': [3, 4, 5, 10], 
        'weights': ['uniform', 'distance'],
        'algorithm' : ['auto', 'kd_tree', 'brute']
        }

knn = KNeighborsClassifier()
knn = GridSearchCV(knn, grid, cv=10, scoring='roc_auc', n_jobs=-1, verbose=1)
knn = knn.fit(x_train, y_train)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  8.6min finished


In [35]:
print("Best parameters: ", knn.best_params_)
print("Metric: ", knn.best_score_)

Best parameters:  {'algorithm': 'auto', 'n_neighbors': 10, 'weights': 'distance'}
Metric:  0.8071349098387213


In [36]:
knn = KNeighborsClassifier(**{'algorithm': 'auto', 'n_neighbors': 10, 'weights': 'distance'})
knn.fit(x_train, y_train)

KNeighborsClassifier(n_neighbors=10, weights='distance')

In [37]:
print(classification_report(y_test, knn.predict(x_test)))

              precision    recall  f1-score   support

           0       0.85      0.96      0.90     11427
           1       0.61      0.30      0.40      2678

    accuracy                           0.83     14105
   macro avg       0.73      0.63      0.65     14105
weighted avg       0.81      0.83      0.81     14105



# Timing

In [38]:
%%time
clf.predict(x_test)

CPU times: user 9.19 ms, sys: 3.05 ms, total: 12.2 ms
Wall time: 9.36 ms


array([0, 0, 0, ..., 0, 0, 0])

In [39]:
%%time
nb.predict(x_test)

CPU times: user 49 ms, sys: 15.1 ms, total: 64.1 ms
Wall time: 35.8 ms


array([0, 0, 0, ..., 0, 0, 0])

In [40]:
%%time
knn.predict(x_test)

CPU times: user 8.81 s, sys: 33.9 ms, total: 8.84 s
Wall time: 8.75 s


array([0, 0, 0, ..., 0, 0, 0])

# Conclusion

В качестве метрики оценивания была выбрана roc auc. После чего были подобныны лучшее ппараметры для следующих моделей:
Байесовских классификаторов
Логистической регрессии
Метода ближайших соседей

Лучший результам показала логистическая регрессия (0.8556) с параметрами {'C': 0.01, 'penalty': 'l2'}

Так же сравнивалась скорость работы моделей, лучше результат показала логистическая регрессия - 9.36ms