# ДЗ 6

### 1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)

In [1]:
import pandas as pd
import numpy as np
#from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
#from sklearn.feature_extraction.text import TfidfVectorizer
#import itertools
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score, precision_recall_curve

#import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# беру данные оттока с предыдущего урока
df = pd.read_csv("churn_data.csv")
df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [3]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

### 2. сделать feature engineering

In [4]:
categorical_columns = ['Geography', 'Gender', 'Tenure', 'HasCrCard', 'IsActiveMember']
continuous_columns = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [5]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col))
            ])
    final_transformers.append((cont_col, cont_transformer))

In [6]:
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

### 3. обучить любой классификатор (какой вам нравится)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df, df['Exited'], random_state=0)

In [8]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', RandomForestClassifier(random_state = 0)),
])

In [9]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [10]:
preds = pipeline.predict_proba(X_test)[:, 1]
preds[:10]

array([0.37, 0.2 , 0.15, 0.03, 0.02, 0.75, 0.03, 0.09, 0.17, 0.78])

In [11]:
y_test

9394    0
898     1
2398    0
5906    0
2343    0
       ..
8764    0
4359    0
2041    0
1108    0
3332    0
Name: Exited, Length: 2500, dtype: int64

In [12]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)

ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))


Best Threshold=0.370000, F-Score=0.648, Precision=0.645, Recall=0.650


In [13]:
scoring = pd.DataFrame(columns=['model','f1', 'precision', 'recall',])
scoring = scoring.append(
    { 'model' : 'normal', 'f1': fscore[ix], 'precision': precision[ix], 'recall': recall[ix]}, 
    ignore_index=True)

In [14]:
scoring

Unnamed: 0,model,f1,precision,recall
0,normal,0.64775,0.645224,0.650295


### 4. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть

In [15]:
# df_c = df.copy()

In [16]:
df['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [17]:
df[df['Exited'] == 1]

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
16,17,15737452,Romeo,653,Germany,Male,58,1,132602.88,1,1,0,5097.67,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9981,9982,15672754,Burbidge,498,Germany,Male,42,3,152039.70,1,1,1,53445.17,1
9982,9983,15768163,Griffin,655,Germany,Female,46,7,137145.12,1,1,0,115146.40,1
9991,9992,15769959,Ajuluchukwu,597,France,Female,53,4,88381.21,1,1,0,69384.71,1
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1


In [18]:
def create_unlabeled(df, pos_frac=0.2):

    sdf = df.copy()
#     pos_mask = (df['target'] == 1)
    positives = df[df['Exited'] == 1].sample(frac=pos_frac).index
    unlabeled = df[~df.index.isin(positives)].index

    df.loc[positives, 'is_labeled'] = 1
    df.loc[unlabeled, 'is_labeled'] = 0
    df['is_labeled'] = df['is_labeled'].astype(int)
    return df

In [19]:
rns_df = create_unlabeled(df, pos_frac=0.2)

In [20]:
# positives = df_c[df_c['Exited'] == 1].sample(frac=0.2).index

In [21]:
# unlabeled = df_c[~df_c.index.isin(positives)].index

### 5. применить random negative sampling для построения классификатора в новых условиях

In [22]:
# rns_df.sample(frac=1)

In [23]:
def get_rns_samples(rns_df):
    
    rns_df = rns_df.sample(frac=1)

    pos_sample = rns_df[rns_df['is_labeled'] == 1]
    neg_sample = rns_df[rns_df['is_labeled'] == 0][:pos_sample.shape[0]]
    train_samples = pd.concat([neg_sample, pos_sample]).sample(frac=1)
    test_samples = rns_df[rns_df['is_labeled'] == 0][pos_sample.shape[0]:]
    
    return train_samples, test_samples

In [24]:
train_samples, test_samples = get_rns_samples(rns_df)

In [25]:
train_samples

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,is_labeled
9962,9963,15594612,Flynn,702,Spain,Male,44,9,0.00,1,0,0,59207.41,1,0
1323,1324,15629244,Bryant,635,Spain,Male,50,7,159453.64,2,0,0,54560.79,1,0
3731,3732,15568573,Graham,554,Germany,Female,51,7,105701.91,1,0,1,179797.79,1,1
7395,7396,15808386,Cocci,721,Germany,Female,45,7,138523.20,1,0,0,59604.45,1,1
9589,9590,15669611,Mott,632,France,Male,71,3,83116.68,1,1,1,27597.76,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9337,9338,15797751,Pai,466,Germany,Female,47,5,102085.72,1,1,1,183536.24,1,1
5784,5785,15738063,Shen,631,France,Male,29,2,0.00,2,1,1,18581.84,0,0
2118,2119,15774857,Synnot,460,France,Female,27,7,0.00,2,1,0,156150.08,1,1
3175,3176,15764604,Sutherland,586,France,Female,35,7,164769.02,3,1,0,119814.25,1,1


In [26]:
pipeline.fit(train_samples.iloc[:, :-2], train_samples['is_labeled'])

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [27]:
preds = pipeline.predict_proba(test_samples.iloc[:, :-2])[:, 1]
preds

array([0.39, 0.58, 0.2 , ..., 0.35, 0.17, 0.39])

In [28]:
test_samples['Exited']

9491    0
4967    1
1481    0
3569    1
4213    0
       ..
2014    0
4432    0
3948    0
5647    0
5184    0
Name: Exited, Length: 9186, dtype: int64

In [29]:
precision, recall, thresholds = precision_recall_curve(test_samples['Exited'], preds)

fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.590000, F-Score=0.518, Precision=0.480, Recall=0.564


In [30]:
scoring = scoring.append(
    { 'model' : 'RNS', 'f1': fscore[ix], 'precision': precision[ix], 'recall': recall[ix]}, 
    ignore_index=True)

### 6. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)

In [31]:
scoring

Unnamed: 0,model,f1,precision,recall
0,normal,0.64775,0.645224,0.650295
1,RNS,0.518257,0.479564,0.563741


### 7. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [32]:
def evaluate_model(pipeline, X_train, y_train, X_test, y_test):
    
    pipeline.fit(X_train, y_train)
    
    preds = pipeline.predict_proba(X_test)[:, 1]
    
    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    fscore = (2 * precision * recall) / (precision + recall)
    ix = np.argmax(fscore)
#     print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
#                                                                         fscore[ix],
#                                                                         precision[ix],
#                                                                         recall[ix]))
    
    return {'f1': [fscore[ix]], 'precision': [precision[ix]], 'recall': [recall[ix]]}

In [33]:
rns_metrics = pd.DataFrame(columns=['f1', 'precision', 'recall'])

fracs = np.linspace(0.1, 0.8, 8)
for frac in fracs:
    train_samples, test_samples = get_rns_samples(create_unlabeled(df, pos_frac=frac))
#     frac_metrics = 
    frac_metrics = evaluate_model(pipeline,
                                 train_samples.iloc[:, :-2],
                                 train_samples['is_labeled'],
                                 test_samples.iloc[:, :-2],
                                 test_samples['Exited'])
    rns_metrics = rns_metrics.append(pd.DataFrame(frac_metrics))

  fscore = (2 * precision * recall) / (precision + recall)
  fscore = (2 * precision * recall) / (precision + recall)


In [34]:
rns_metrics.index = fracs

In [35]:
rns_metrics

Unnamed: 0,f1,precision,recall
0.1,,0.0,0.0
0.2,0.505646,0.44241,0.589974
0.3,0.522171,0.467417,0.591454
0.4,0.505119,0.477419,0.536232
0.5,0.493743,0.504065,0.483835
0.6,0.455274,0.42625,0.488539
0.7,,0.0,0.0
0.8,0.379412,0.369628,0.389728


По всей видимости, при увеличении размера немаркированных - качество падает.