# Lab 03. Weather Predict Competition

This lab consists of only one task: train a model to predict if it is going to rain the next day:
- Classification metric is **roc-auc**
- Score should be calculated on a hold-out set (don't use it for training)


#### Evaluation
- Score > 0.860 - 3 points
- Score > 0.865 - 5 points
- Score > 0.870 - 7 points
- Score > 0.875 - 9 points
- Score > 0.880 - 11 points
- Score > 0.885 - 13 points
- Score > 0.890 - 15 points
- You are encouraged to use various methods and tricks to get extra points

#### Do not use any external data or models that were not covered in the course (no gradient boosting and  neural nets).


#### How to submit
- Name your file according to this convention: `2022_lab03_GroupNumber_Surname_Name.ipynb`, for example 
    - `2022_lab03_404_Sheipak_Sviat.ipynb`
    - `2022_lab03_NoGroup_Sheipak_Sviat.ipynb`
- Attach your .ipynb to an email with topic `2022_lab02_GroupNumber_Surname_Name`
- Send it to `cosmic.research.ml@yandex.ru`
- Deadline is ` 22022-11-03 23:00:00 +03:00`

#### The Data:
- All the datasets you need are here: https://github.com/cosmic-research-ml-edu/intro_ml_2022/blob/main/homeworks/hw03/train.csv

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
import tqdm as tqdm

In [None]:
db_train_original = pd.read_csv("holdout_data/train.csv")
db_test_original = pd.read_csv("holdout_data/holdout.csv")

db_train = db_train_original.copy()
db_test = db_test_original.copy()

db_train.drop(columns=['row_id'], inplace=True)
db_test.drop(columns=['row_id'], inplace=True)

target_train = db_train['raintomorrow']
db_train.drop(columns=['raintomorrow'], inplace=True)

target_test = db_test['raintomorrow']
db_test.drop(columns=['raintomorrow'], inplace=True)

In [None]:
def LE_create_features(db):
    lc = LabelEncoder()
    for series in db:
        if db[series].dtype == 'object':
            db[series] = lc.fit_transform(db[series])

    return db

# Deal with NaNs

Obviosly that we should do somthing with NaNs to increase the score  

In [None]:
db_test.isnull().sum().sort_values(ascending=False)

In [None]:
db_train.fillna(db_train.median(), inplace=True)
db_test.fillna(db_test.median(), inplace=True)

db_train.fillna(db_train.mode(dropna=True).iloc[0], inplace=True)
db_test.fillna(db_test.mode(dropna=True).iloc[0], inplace=True)

# Feature generation

In [None]:
plt.figure(figsize=(12,7))
sns.heatmap(db_train.corr(), annot=True, cmap='coolwarm')

**Temperature range**

In [None]:
db_train['temp_range'] = db_train['maxtemp'] - db_train['mintemp']
db_test['temp_range'] = db_test['maxtemp'] - db_test['mintemp']

**Winddir changing (not intresting)**

In [None]:
# db_train['wind_dir_changed'] = pd.Series([True if db_train['winddir9am'][i] != db_train['winddir3pm'][i] else False for i in range(len(db_train))])
# db_test['wind_dir_changed'] = pd.Series([True if db_test['winddir9am'][i] != db_test['winddir3pm'][i] else False for i in range(len(db_test))])

**Wind speed reduction**

In [None]:
db_train['windspeed_reduction'] = db_train['windspeed3pm']-db_train['windspeed9am']
db_test['windspeed_reduction'] = db_test['windspeed3pm']-db_test['windspeed9am']

**Pressure reduction**

In [None]:
db_train['pressure_reduction'] = db_train['pressure3pm']-db_train['pressure9am']
db_test['pressure_reduction'] = db_test['pressure3pm']-db_test['pressure9am']

**Humidity reduction**

In [None]:
db_train['humidity_reduction'] = db_train['humidity3pm']-db_train['humidity9am']
db_test['humidity_reduction'] = db_test['humidity3pm']-db_test['humidity9am']

**Cloud reduction (not intresting)**

In [None]:
# db_train['cloud_reduction'] = db_train['cloud3pm']-db_train['cloud9am']
# db_test['cloud_reduction'] = db_test['cloud3pm']-db_test['cloud9am']

In [None]:
db_train

# Encoding

**One-hot (bad idea)**

In [None]:
db_train_one_hot = pd.get_dummies(db_train)
db_test_one_hot = pd.get_dummies(db_test)
db_train_one_hot

**Label Encoding (good idea)**

In [None]:
def LE_create_features(db):
    lc = LabelEncoder()
    for series in db:
        if db[series].dtype == 'object':
            db[series] = lc.fit_transform(db[series])

    return db

db_train_lc = LE_create_features(db_train)
db_test_lc = LE_create_features(db_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

RANDOM_STATE = 24

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(db_train_lc)
y_train = target_train.values

X_test = scaler.fit_transform(db_test_lc)
y_test = target_test.values

In [None]:
# clf1 = svm.SVC(probability=True) <-- bad idea
clf2 = LogisticRegression(solver='saga')
clf3 = DecisionTreeClassifier(max_features='auto')
clf4 = RandomForestClassifier()
classifiers = [clf2, clf3, clf4]

# Params for random forest (Huge grid)
n_estimators = [int(x) for x in np.linspace(2, 150, 25)]
max_features = ['log2', 'sqrt']
max_depth = [int(x) for x in np.linspace(1, 15, 15)]
min_samples_split = [int(x) for x in np.linspace(2, 50, 10)]
min_samples_leaf = [int(x) for x in np.linspace(2, 50, 10)]
bootstrap = [True, False]
params_rf = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# All params
grids = [{'penalty': ['l1', 'l2', 'elasticnet', 'none'],
         'C': np.unique(np.array([i for i in map(round, np.logspace(0.2, 1.7, 20))])),
        'l1_ratio': np.linspace(0.001, 0.999, 10)},
        {'max_depth': np.arange(2, 11),
         'criterion': ['gini', 'entropy', 'log_loss']},
         params_rf]

In [None]:
def params_tuning(X_train, y_train, X_test, y_test, classifiers, grids, isPCA=False):
    if isPCA: # <-- bad idea
        pca = PCA(n_components=60)
        X_train = pca.fit_transform(X_train)
        X_test = pca.fit_transform(X_test)

    roc_grid_scores = []
    grid_scores = []
    clfs_new = []
    for clf, grid in zip(classifiers, grids):
        cls_gr = RandomizedSearchCV(clf, grid, n_iter=150, cv = 7, verbose = 1, n_jobs=-1).fit(X_train, y_train)
        clfs_new.append(cls_gr.best_estimator_)
        grid_scores.append(cls_gr.best_score_)
        roc_grid_scores.append(roc_auc_score(y_test, cls_gr.predict_proba(X_test)[:, 1]))
    return clfs_new, roc_grid_scores

In [None]:
clfs_new, roc_grid_scores = params_tuning(X_train, y_train, X_test, y_test, classifiers, grids, isPCA=False)

In [None]:
names = ['LogisticRegression', 'DecisionTreeClassifier', 'RandomForestClassifier']
fig, axes = plt.subplots(3, 1, figsize=(30, 20))
for clf, name, score, ax in zip(clfs_new, names, roc_grid_scores, axes):
    if name == 'LogisticRegression':
        clf_model_coefs = clf.coef_[0]
    else:
        clf_model_coefs = clf.feature_importances_
    ncoef = clf_model_coefs.shape[0]
    default_x = np.arange(ncoef)
    ax.bar(default_x - 0.1, clf_model_coefs, label=f'{name}\n{score}', width=1, color = 'blue')
    ax.set_xticks(default_x, [*db_train.columns], rotation=45)
    ax.set_xlabel('Coefficient Index')
    ax.set_ylabel('Coefficient Magnitude')
    ax.legend(loc='upper right')
    ax.grid()
plt.title('Importants features magnitude')
plt.show()

**The end result (score)**

In [None]:
lst = dict(zip(names, roc_grid_scores))
lst = sorted(lst.items(), reverse=True)
print(f'Roc-auc by {lst[0][0]} is {lst[0][1]}')

**The end result (score)**

In [363]:
lst = dict(zip(names, roc_grid_scores))
lst = sorted(lst.items(), reverse=True)
print(f'Roc-auc by {lst[0][0]} is {lst[0][1]}')

Roc-auc by RandomForestClassifier is 0.8667478343258782
