In [None]:
import pandas as pd
import numpy as np
from itertools import product
from tqdm import tqdm

from imblearn import over_sampling

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import f1_score

from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt

In [None]:
# Load the train and test tables (file names indicate features are already computed).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("train_with_features.csv", "test_with_features.csv")
)

## ONLY MATCHES

In [None]:
# F1 obtained when the `label` column is used directly as the duplicate prediction.
f1_score(y_true=train['is_duplicate'], y_pred=train['label'])

In [None]:
# Baseline submission: the `label` column of the test set is used as the prediction.
sample_sub = pd.read_csv("submissions/sample_submission.csv", index_col="pair_id")
print(len(sample_sub.is_duplicate))

# Assign by position, not by index label: `sample_sub` is indexed by pair_id while
# `test` carries the default RangeIndex, so assigning the Series itself would align
# on index labels and could shift rows or introduce NaN. A length mismatch now
# raises instead of being silently padded.
# Assumes `test` rows are in the same order as the sample submission — TODO confirm.
sample_sub["is_duplicate"] = test['label'].values
print(sample_sub.is_duplicate.value_counts(dropna=False))
# Any pair without a label is treated as "not a duplicate".
sample_sub["is_duplicate"] = sample_sub["is_duplicate"].fillna(0)
print(len(sample_sub.is_duplicate))

sample_sub.to_csv("submissions/only_matches.csv")

## MODELS

In [None]:
# Shared configuration for the modelling cells below.
# random state (seed passed to the splitters and the forests)
RS = 42
# default probability threshold used by get_f1_cv_score
THR = 0.3
# cross-validation strategy: shuffled, stratified K-fold
NFOLDS = 3
SKF = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=RS)

In [None]:
# признаки, на которых обучаемся
FTS = ['label',
       'hamming',
       'levenshtein',
       'damerau_levenshtein',
       'jaro_winkler',
       'strcmp95',
       'lcsseq',
       'lcsstr',
       'gotoh',
       'smith_waterman',
       'ratcliff_obershelp',
       'cosine',
       'jaccard',
       'ratio',
       'partial_ratio',
       'token_sort_ratio',
       'token_set_ratio'
      ]

In [None]:
# Raw text columns are dropped from both frames; the target is dropped from train only.
text_cols = ['name_1', 'name_2', 'org_name_1', 'org_name_2']

y = train['is_duplicate']
X = train.drop(columns=['is_duplicate'] + text_cols)

# Stratified 75/25 split so the hold-out keeps the class balance of `y`.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, shuffle=True, stratify=y, random_state=RS
)

X_test = test.drop(columns=text_cols)

In [None]:
# Hold-out F1 of the raw `label` column; ground truth goes first, matching
# the f1_score(y_true, y_pred) signature.
f1_score(y_valid, X_valid['label'])

## Random Forest Classifier

Decision trees are good at ignoring columns that add no value in predicting the output, and for a single tree we can even see how a prediction was derived by tracing the path back through the tree. However, an individual tree does not perform well on its own once it grows large: it becomes hard to interpret and tends to overfit. Such models are often referred to as weak learners. Performance is improved by averaging the predictions of many such decision trees, each built from a different subset of the training data. This approach is called random forest classification.

### Baseline

In [None]:
# Keyword arguments for the baseline RandomForestClassifier.
default_params = {
    'n_estimators':100, # to tune
    'max_depth':None, # to tune
    'criterion':'gini',
    'bootstrap':True,
    'random_state': RS,
    'class_weight':'balanced'
}

In [None]:
# Baseline forest built from `default_params`; n_jobs=-1 lets scikit-learn use all cores.
rf_clf_default = RandomForestClassifier(n_jobs=-1, **default_params)

In [None]:
# Cross-validated F1 at the classifier's standard decision threshold.
rf_default_res = cross_val_score(
    estimator=rf_clf_default,
    X=X_train[FTS],
    y=y_train,
    scoring='f1',
    cv=SKF,
).mean()
rf_default_res

In [None]:
# Evaluation on the hold-out set.
rf_clf_default.fit(X_train[FTS], y_train)
# Ground truth first, predictions second, matching f1_score(y_true, y_pred).
f1_score(y_valid, rf_clf_default.predict(X_valid[FTS]))

На сайте соревнования скор = `0.5464`

### Feature Selection

Попробуем отобрать признаки:

In [None]:
from sklearn.feature_selection import RFECV

In [None]:
# Show the candidate feature list that is passed to RFECV in the next cell.
print(FTS)

In [None]:
%%time
rf_clf_default = RandomForestClassifier(**default_params, n_jobs=-1)
selector = RFECV(rf_clf_default, step=1, cv=SKF, scoring='f1', n_jobs=-1)
selector = selector.fit(X_train[FTS], y_train)

In [None]:
print("Optimal number of features : %d" % selector.n_features_)

# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2; the mean
# CV score per feature-subset size now lives in `cv_results_`. Fall back to the
# old attribute only for releases that predate `cv_results_`.
if hasattr(selector, "cv_results_"):
    cv_scores = selector.cv_results_["mean_test_score"]
else:
    cv_scores = selector.grid_scores_

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (f1_score)")
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()

In [None]:
# Keep only the feature names whose position is marked True in the RFECV support mask.
best_fts = [ft_name for ft_name, kept in zip(FTS, selector.get_support()) if kept]
len(best_fts)

In [None]:
# Cross-validated F1 at the standard threshold, restricted to the selected features.
rf_clf_default = RandomForestClassifier(n_jobs=-1, **default_params)
rf_default_res = cross_val_score(
    estimator=rf_clf_default,
    X=X_train[best_fts],
    y=y_train,
    scoring='f1',
    cv=SKF,
).mean()
rf_default_res

In [None]:
# Evaluation on the hold-out set using only the selected features.
rf_clf_default = RandomForestClassifier(**default_params, n_jobs=-1)
rf_clf_default.fit(X_train[best_fts], y_train)
# Ground truth first, predictions second, matching f1_score(y_true, y_pred).
f1_score(y_valid, rf_clf_default.predict(X_valid[best_fts]))

In [None]:
# Refit the baseline forest on all labelled rows, restricted to the selected features,
# and predict hard labels for the test set.
rf_clf_default = RandomForestClassifier(n_jobs=-1, **default_params)
rf_clf_default.fit(X[best_fts], y)
test_predictions = rf_clf_default.predict(X_test[best_fts])

# The sample submission provides the pair_id index and the expected row count.
sample_sub = pd.read_csv("submissions/sample_submission.csv", index_col="pair_id")
print(len(sample_sub["is_duplicate"]))

# The prediction array is written into the column positionally.
sample_sub["is_duplicate"] = test_predictions
print(sample_sub["is_duplicate"].value_counts(dropna=False))
sample_sub["is_duplicate"] = sample_sub["is_duplicate"].fillna(0)
print(len(sample_sub["is_duplicate"]))

sample_sub.to_csv("submissions/rf_best_features.csv")

## Настройка гиперпараметров
В том числе трешхолда

In [None]:
def get_f1_cv_score(model, X_train, y_train, FTS, treshold=THR, cv_split=SKF):
    """Return the mean cross-validated F1 of `model` at a custom probability threshold.

    For every split produced by `cv_split`, the model is refit on the training
    rows, `predict_proba` is called on the held-out rows, and column 1 of the
    result is turned into a 0/1 label with `proba > treshold`. F1 is computed
    per fold and the fold scores are averaged.

    Parameters
    ----------
    model : estimator exposing `fit` and `predict_proba`
        Refit in place on every fold; after the call it holds the last fold's fit.
    X_train : pandas.DataFrame
        Feature table; only the `FTS` columns are used.
    y_train : pandas.Series
        Target, positionally aligned with `X_train`.
    FTS : list of str
        Names of the feature columns to train on.
    treshold : float, default THR
        A row is labelled 1 when its column-1 probability is strictly greater
        than this value.
    cv_split : splitter with a `split(X, y)` method, default SKF
        Cross-validation strategy.

    Returns
    -------
    float
        Mean F1 across the folds.
    """
    f1_cv_folds = []
    features = X_train[FTS]

    for tridx, cvidx in cv_split.split(features, y_train):
        model.fit(features.iloc[tridx].values, y_train.iloc[tridx].values)
        # predicted probabilities for the held-out fold
        cv_preds = model.predict_proba(features.iloc[cvidx].values)
        # the class is decided by the threshold; the builtin `int` replaces the
        # `np.int` alias, which was removed in NumPy 1.24
        cv_preds_label = (cv_preds[:, 1] > treshold).astype(int)
        # score the fold
        cv_real_label = y_train.iloc[cvidx].values
        f1_cv_folds.append(f1_score(cv_real_label, cv_preds_label))

    # mean score over the cross-validation folds
    return np.mean(f1_cv_folds)

In [None]:
# Parameter grid: forest size, tree depth and probability threshold.
n_estimators = [50, 100, 150, 200]
max_depth = [12, 20]
tresholds = np.arange(0.4, 1, 0.1)

# Every (n_estimators, max_depth, threshold) combination to evaluate.
params = [*product(n_estimators, max_depth, tresholds)]
print(len(params))

In [None]:
%%time
# куда сохраняем результаты
result = []
indexes = []

for n_estimators, max_depth, treshold in tqdm(params):
    params = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'random_state': RS,
        'class_weight':'balanced',
    }
    rf_clf = RandomForestClassifier(**params, n_jobs=-1)
    f1_cv_mean = get_f1_cv_score(rf_clf, X_train, y_train, treshold=treshold, FTS=best_fts)
    
    indexes.append('_'.join([str(n_estimators), str(max_depth), str(treshold)]))
    result.append(f1_cv_mean)

In [None]:
# One row per grid combination, labelled "<n_estimators>_<max_depth>_<threshold>".
result_df = pd.DataFrame({'f1_cv_mean': result}, index=indexes)

# Ten best combinations by mean cross-validated F1.
ranked = result_df.sort_values(by='f1_cv_mean', ascending=False)
ranked.head(10)

In [None]:
# Index label of the row with the highest mean cross-validated F1.
best_params = result_df.f1_cv_mean.idxmax()
best_params

In [None]:
# Threshold and forest settings used for the final model.
# NOTE(review): these values are hard-coded, presumably copied by hand from the
# grid-search result above — confirm they still match after a re-run.
# This rebinds `best_params` from the index label string to a kwargs dict.
best_thr = 0.6
best_params = {
    'n_estimators':200,
    'max_depth': 20,
    'criterion':'gini',
    'bootstrap':True,
    'random_state': RS,
    'class_weight':'balanced',
}

In [None]:
# Evaluation on the hold-out set with the tuned parameters and threshold.
rf_clf = RandomForestClassifier(**best_params, n_jobs=-1)
rf_clf.fit(X_train[best_fts], y_train)
valid_predictions = rf_clf.predict_proba(X_valid[best_fts])
# Builtin `int` replaces the `np.int` alias, which was removed in NumPy 1.24.
valid_labels = (valid_predictions[:, 1] > best_thr).astype(int)
# Ground truth first, predictions second, matching f1_score(y_true, y_pred).
f1_score(y_valid, valid_labels)

In [None]:
# submission
rf_clf = RandomForestClassifier(**best_params, n_jobs=-1)
# NOTE(review): the hyper-parameters and threshold were tuned and validated on
# `best_fts`, but this final model is trained on the full `FTS` list — confirm
# this is intended. Left unchanged so the file still reproduces the submission.
rf_clf.fit(X[FTS], y)
test_predictions = rf_clf.predict_proba(X_test[FTS])
# Builtin `int` replaces the `np.int` alias, which was removed in NumPy 1.24.
test_labels = (test_predictions[:, 1] > best_thr).astype(int)

sample_sub = pd.read_csv("submissions/sample_submission.csv", index_col="pair_id")
print(len(sample_sub.is_duplicate))

# The label array is written into the column positionally.
sample_sub["is_duplicate"] = test_labels
print(sample_sub.is_duplicate.value_counts(dropna=False))
sample_sub["is_duplicate"] = sample_sub["is_duplicate"].fillna(0)
print(len(sample_sub.is_duplicate))

sample_sub.to_csv("submissions/rf_best_params.csv")

Скор в лидерборде: `0.5867`