In [1]:
import os
import sys
from pathlib import Path

from math import sqrt
from dateutil.parser import parse
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score, accuracy_score
from sklearn.metrics import confusion_matrix, classification_report, RocCurveDisplay
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, FunctionTransformer

from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, GradientBoostingClassifier, GradientBoostingRegressor, StackingClassifier, StackingRegressor
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor

In [2]:
import warnings


if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"

In [3]:
seed = 73

In [4]:
wine = pd.read_csv('../data/winequality.csv', sep=';')
rain = pd.read_csv('../data/weatherAUS.csv', sep=',')

## Предобработка

### Винишко

In [5]:
wine.head(3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red


In [6]:
wine_target = 'quality'
wine_cat = ['wine type',]
wine_num = list(set(wine.columns) - set(wine_cat) - {wine_target})

wine['wine type'] = wine['wine type'].map({'red': 0, 'white': 1})
wine[wine_num] = StandardScaler().fit_transform(wine[wine_num])

wine.head(3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine type
0,0.142473,2.188833,-2.192833,-0.744778,0.569958,-1.10014,-1.446359,1.034993,1.81309,0.193097,-0.915464,5,0
1,0.451036,3.282235,-2.192833,-0.59764,1.197975,-0.31132,-0.862469,0.701486,-0.115073,0.999579,-0.580068,5,0
2,0.451036,2.5533,-1.917553,-0.660699,1.026697,-0.874763,-1.092486,0.768188,0.25812,0.797958,-0.580068,5,0


### Дождик

In [7]:
rain.head(3)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No


In [8]:
rain_target = 'RainTomorrow'
rain_cat = ('RainToday', 'RainTomorrow', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'Location')
rain_num = tuple(set(rain.columns) - set(rain_cat) - {rain_target, 'Date'})

def convert_to_datetime(X):
    X = X.astype(str)
    return pd.to_datetime(X)

from datetime import datetime
def extract_year(X):
    print(type(X))
    print(X.astype(str).apply(lambda x: datetime.strptime(x.astype(str), '%Y-%m-%d')))
    return X.astype(str).apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
    return X.apply(lambda x: parse(str(x)).year)
    return X.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
    return X.apply(lambda x: datetime.strptime(x, '%Y-%m-%d').year)

# def extract_month(X):
#     return X.apply(lambda x: datetime.strptime(x, '%Y-%m-%d').month)

# def extract_day(X):
#     return X.apply(lambda x: datetime.strptime(x, '%Y-%m-%d').day)


numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

rain_preprocessor = ColumnTransformer(transformers=[
    # ('drop', FunctionTransformer(
    #     pd.DataFrame.drop,
    #     kw_args={'columns': ['Date']}), ['Date']),
    ('date', FunctionTransformer(convert_to_datetime), ['Date']),
    # ('year', FunctionTransformer(extract_year), ['Date']),
    # ('month', FunctionTransformer(extract_month), ['Date']),
    # ('day', FunctionTransformer(extract_day), ['Date']),
    # ('num', numeric_transformer, rain_num),
    # ('cat', categorical_transformer, rain_cat)
], remainder='passthrough')

# rain = rain_preprocessor.fit_transform(rain)
# display(rain)

In [9]:
rain['Date'] = rain['Date'].apply(parse)
rain['Day'] = rain['Date'].apply(lambda x: x.day)
rain['Month'] = rain['Date'].apply(lambda x: x.month)
rain.drop(columns=['Date'], inplace=True)

for col in (
    'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am',
    'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm'
    ):
    rain[col].fillna(rain[col].mean(), inplace=True)

for col in ('WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow'):
    rain[col].fillna(rain[col].mode()[0], inplace=True)

for col in ('RainToday', 'RainTomorrow'):
    rain[col] = rain[col].map({'No': 0, 'Yes': 1})

dirs = rain['WindGustDir'].unique()
for col in ('WindGustDir', 'WindDir9am', 'WindDir3pm'):
    rain[col] = rain[col].map({k: v for v, k in enumerate(dirs)})

# rain = pd.get_dummies(rain, ['Location'])
rain['Location'] = rain['Location'].map({k: v for v, k in enumerate(rain['Location'].unique())})

for col in set(rain.columns) - {'RainToday', 'RainTomorrow'}:
    rain[col] = (rain[col] - np.mean(rain[col])) / np.std(rain[col])

In [10]:
rain.head(3)

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Day,Month
0,-1.689897,0.189447,-0.045336,-0.210072,-2.809331e-16,-6.50805e-16,-1.429626,0.302234,-1.677294,-1.424246,...,-1.478788,-1.223882,1.568037,-8.487347e-16,-0.014053,0.017023,0,0,-1.672844,1.634076
1,-1.689897,-0.753101,0.265043,-0.28165,-2.809331e-16,-6.50805e-16,-1.225773,0.302234,-0.770458,-1.208871,...,-1.047782,-1.118839,-3.920285e-16,-8.487347e-16,0.032464,0.381985,0,0,-1.55914,1.634076
2,-1.689897,0.110901,0.349692,-0.28165,-2.809331e-16,-6.50805e-16,-1.02192,0.454694,-1.677294,-1.208871,...,-1.49365,-0.983784,-3.920285e-16,-1.199232,0.621685,0.221402,0,0,-1.445436,1.634076


## Разделение

In [11]:
X_wine = wine.drop(['quality'], axis=1)
y_wine = wine['quality']
X_wine.shape, y_wine.shape

((6497, 12), (6497,))

In [12]:
X_wine_train, X_wine_test, y_wine_train, y_wine_test = train_test_split(X_wine, y_wine, test_size=0.33)

In [13]:
rain['RainTomorrow'].value_counts()

0    113583
1     31877
Name: RainTomorrow, dtype: int64

In [14]:
rain0 = rain[rain['RainTomorrow'] == 0]
rain1 = rain[rain['RainTomorrow'] == 1]

rain0 = resample(rain0,
                 replace=False,
                 n_samples=len(rain1),
                 random_state=73)

rain = pd.concat([rain1, rain0])
print(rain['RainTomorrow'].value_counts())

1    31877
0    31877
Name: RainTomorrow, dtype: int64


In [15]:
X_rain = rain.drop(columns=['RainTomorrow'])
y_rain = rain['RainTomorrow']
X_rain.shape, y_rain.shape

((63754, 23), (63754,))

In [16]:
# sm = SMOTE(random_state=73, k_neighbors=5)
# X_rain, y_rain = sm.fit_resample(X_rain, y_rain)
# X_rain.shape, y_rain.shape

In [17]:
X_rain_train, X_rain_test, y_rain_train, y_rain_test = train_test_split(X_rain, y_rain, test_size=0.33)

In [18]:
def print_r_metrics(y_test: np.ndarray, y_pred: np.ndarray):
    print(
        f'MAE:\t{mean_absolute_error(y_test, y_pred)}',
        f'MSE:\t{mean_squared_error(y_test, y_pred)}',
        f'RMSE:\t{sqrt(mean_squared_error(y_test, y_pred))}',
        f'MAPE:\t{mean_absolute_percentage_error(y_test, y_pred)}',
        f'R^2:\t{r2_score(y_test, y_pred)}',
        sep='\n'
    )


def print_c_metrics(y_test: np.ndarray, y_pred: np.ndarray):
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))


## Обучение

In [22]:
params = {
    'random_state': [seed],
    'max_depth': range(5, 8),
}

dtr = GridSearchCV(DecisionTreeRegressor(), params).fit(X_wine_train, y_wine_train)

print(dtr.best_params_)
print_r_metrics(y_wine_test, dtr.predict(X_wine_test))

{'max_depth': 5, 'random_state': 73}
MAE:	0.5863417091087443
MSE:	0.5829123852857983
RMSE:	0.76348699090803
MAPE:	0.10529532603202965
R^2:	0.27054778723951267


In [23]:
params = {
    'random_state': [seed],
    'max_depth': range(5, 8),
}

dtc = GridSearchCV(DecisionTreeClassifier(), params).fit(X_rain_train, y_rain_train)

print(dtc.best_params_)
print_c_metrics(y_rain_test, dtc.predict(X_rain_test))

{'max_depth': 7, 'random_state': 73}
[[8458 2062]
 [2973 7546]]
              precision    recall  f1-score   support

           0       0.74      0.80      0.77     10520
           1       0.79      0.72      0.75     10519

    accuracy                           0.76     21039
   macro avg       0.76      0.76      0.76     21039
weighted avg       0.76      0.76      0.76     21039



In [24]:
params = {
    'random_state': [seed],
    'n_estimators': range(9, 12),
}

br = GridSearchCV(BaggingRegressor(), params).fit(X_wine_train, y_wine_train)

print(br.best_params_)
print_r_metrics(y_wine_test, br.predict(X_wine_test))

{'n_estimators': 11, 'random_state': 73}
MAE:	0.47247298156389067
MSE:	0.43530023695312947
RMSE:	0.6597728677000362
MAPE:	0.08559559968650877
R^2:	0.45526852906901016


In [26]:
params = {
    'random_state': [seed],
    'n_estimators': range(9, 12),
}

bc = GridSearchCV(BaggingClassifier(), params).fit(X_rain_train, y_rain_train)

print(bc.best_params_)
print_c_metrics(y_rain_test, bc.predict(X_rain_test))

{'n_estimators': 11, 'random_state': 73}
[[8218 2302]
 [2371 8148]]
              precision    recall  f1-score   support

           0       0.78      0.78      0.78     10520
           1       0.78      0.77      0.78     10519

    accuracy                           0.78     21039
   macro avg       0.78      0.78      0.78     21039
weighted avg       0.78      0.78      0.78     21039



In [27]:
params = {
    'random_state': [seed],
    'max_depth': range(3, 4),
}

gbr = GridSearchCV(GradientBoostingRegressor(), params).fit(X_wine_train, y_wine_train)

In [28]:
print(gbr.best_params_)
print_r_metrics(y_wine_test, gbr.predict(X_wine_test))

{'max_depth': 3, 'random_state': 73}
MAE:	0.5489570371107776
MSE:	0.49831127143718495
RMSE:	0.7059116597968792
MAPE:	0.09830577982428541
R^2:	0.3764169903250083


In [29]:
params = {
    'random_state': [seed],
    'max_depth': range(3, 4),
}

gbc = GridSearchCV(GradientBoostingClassifier(), params).fit(X_rain_train, y_rain_train)

print(gbc.best_params_)
print_c_metrics(y_rain_test, gbc.predict(X_rain_test))

{'max_depth': 3, 'random_state': 73}
[[8407 2113]
 [2367 8152]]
              precision    recall  f1-score   support

           0       0.78      0.80      0.79     10520
           1       0.79      0.77      0.78     10519

    accuracy                           0.79     21039
   macro avg       0.79      0.79      0.79     21039
weighted avg       0.79      0.79      0.79     21039



In [32]:
estimators = [
    ('br', br),
    ('gbr', gbr)
]

sr = StackingRegressor(estimators=estimators).fit(X_wine_train, y_wine_train)

print_r_metrics(y_wine_test, sr.predict(X_wine_test))

MAE:	0.48920552412921936
MSE:	0.4308106079553712
RMSE:	0.656361644183579
MAPE:	0.08806149290917074
R^2:	0.4608868173221973


In [None]:
estimators = [
    ('bc', bc),
    ('gbc', gbc)
]

sc = StackingClassifier(estimators=estimators).fit(X_rain_train, y_rain_train)

print_c_metrics(y_rain_test, sc.predict(X_rain_test))

In [34]:
params = {
    'random_state': [seed],
    'max_depth': range(3, 8),
}

xgbr = GridSearchCV(XGBRegressor(), params).fit(X_wine_train, y_wine_train)

print(xgbr.best_params_)
print_r_metrics(y_wine_test, xgbr.predict(X_wine_test))

{'max_depth': 5, 'random_state': 73}
MAE:	0.5038853796489867
MSE:	0.45673025122992883
RMSE:	0.6758182087143915
MAPE:	0.09102038478526688
R^2:	0.4284511689848943


In [35]:
params = {
    'random_state': [seed],
    'max_depth': range(3, 8),
}

xgbc = GridSearchCV(XGBClassifier(), params).fit(X_rain_train, y_rain_train)

print(xgbc.best_params_)
print_c_metrics(y_rain_test, xgbc.predict(X_rain_test))

{'max_depth': 6, 'random_state': 73}
[[8480 2040]
 [2164 8355]]
              precision    recall  f1-score   support

           0       0.80      0.81      0.80     10520
           1       0.80      0.79      0.80     10519

    accuracy                           0.80     21039
   macro avg       0.80      0.80      0.80     21039
weighted avg       0.80      0.80      0.80     21039



In [39]:
cbr = CatBoostRegressor(random_state=seed, verbose=False).fit(X_wine_train, y_wine_train)

print_r_metrics(y_wine_test, cbr.predict(X_wine_test))

MAE:	0.5049634905368013
MSE:	0.44321237513873185
RMSE:	0.6657419733941461
MAPE:	0.09062923992697167
R^2:	0.44536733833633313


In [40]:
cbc = CatBoostClassifier(random_state=seed, verbose=False).fit(X_rain_train, y_rain_train)

print_c_metrics(y_rain_test, cbc.predict(X_rain_test))

[[8530 1990]
 [2078 8441]]
              precision    recall  f1-score   support

           0       0.80      0.81      0.81     10520
           1       0.81      0.80      0.81     10519

    accuracy                           0.81     21039
   macro avg       0.81      0.81      0.81     21039
weighted avg       0.81      0.81      0.81     21039



## Свой сад

In [19]:
if (dir := str(Path(os.getcwd()).parent)) not in sys.path:
    sys.path.append(dir)
sys.path

from mylib.cart import CART

In [63]:
cart_r = CART('R', max_depth=5).fit(np.array(X_wine_train), np.array(y_wine_train))
cart_r.predict(X_wine_test)

array([6.0046729 , 6.16091954, 5.37987013, ..., 6.76549865, 5.1120332 ,
       6.54545455])

In [64]:
print_r_metrics(y_wine_test, cart_r.predict(X_wine_test))

MAE:	0.5877446794647379
MSE:	0.5898198382576405
RMSE:	0.7679972905275386
MAPE:	0.10556227637995373
R^2:	0.26190385209241696


In [55]:
cart_c = CART('C', max_depth=5).fit(np.array(X_rain_train[:1000]), np.array(y_rain_train[:1000]))
cart_c.predict(X_rain_test)

array([0., 0., 1., ..., 1., 0., 0.])

In [56]:
print_c_metrics(y_rain_test, cart_c.predict(X_rain_test))

[[7942 2578]
 [3055 7464]]
              precision    recall  f1-score   support

           0       0.72      0.75      0.74     10520
           1       0.74      0.71      0.73     10519

    accuracy                           0.73     21039
   macro avg       0.73      0.73      0.73     21039
weighted avg       0.73      0.73      0.73     21039



In [20]:
cart_c = CART('C', max_depth=5).fit(np.array(X_rain_train[:100]), np.array(y_rain_train[:100]))
cart_c.predict(X_rain_test)

array([1., 0., 0., ..., 1., 1., 0.])

In [21]:
print_c_metrics(y_rain_test, cart_c.predict(X_rain_test))

[[7322 3218]
 [4102 6397]]
              precision    recall  f1-score   support

           0       0.64      0.69      0.67     10540
           1       0.67      0.61      0.64     10499

    accuracy                           0.65     21039
   macro avg       0.65      0.65      0.65     21039
weighted avg       0.65      0.65      0.65     21039



## Бэггинг

In [20]:
from mylib.bagging import Bagging

In [25]:
my_br = Bagging('R', DecisionTreeRegressor, n_estimators=11).fit(X_wine_train, y_wine_train)
print_r_metrics(y_wine_test, my_br.predict(np.array(X_wine_test)))

MAE:	0.4770502225047679
MSE:	0.42477412394767766
RMSE:	0.6517469784722271
MAPE:	0.08606269151723696
R^2:	0.4323654628604827


In [26]:
my_br = Bagging('R', BaggingRegressor, n_estimators=11).fit(X_wine_train, y_wine_train)
print_r_metrics(y_wine_test, my_br.predict(np.array(X_wine_test)))

MAE:	0.46920957830048743
MSE:	0.39758716214914563
RMSE:	0.6305451309376242
MAPE:	0.08482435241526151
R^2:	0.46869596796121327


In [28]:
my_bc = Bagging('C', DecisionTreeClassifier, n_estimators=11).fit(X_rain_train, y_rain_train)
print_c_metrics(y_rain_test, my_bc.predict(np.array(X_rain_test)))

[[8160 2318]
 [2409 8152]]
              precision    recall  f1-score   support

           0       0.77      0.78      0.78     10478
           1       0.78      0.77      0.78     10561

    accuracy                           0.78     21039
   macro avg       0.78      0.78      0.78     21039
weighted avg       0.78      0.78      0.78     21039

