In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import itertools

import matplotlib.pyplot as plt

%matplotlib inline

In [33]:
df = pd.read_csv("train.csv")
df.head(3)

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y
0,13829,29,technician,single,tertiary,no,18254,no,no,cellular,11,may,2,-1,0,unknown,no
1,22677,26,services,single,secondary,no,512,yes,yes,unknown,5,jun,3,-1,0,unknown,no
2,10541,30,management,single,secondary,no,135,no,no,cellular,14,aug,2,-1,0,unknown,no


In [34]:
df['y']=df['y'].replace({'no': 0, 'yes': 1})

In [35]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['y']), df['y'], random_state=0)

In [36]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    

class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [37]:
df.head(3)

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y
0,13829,29,technician,single,tertiary,no,18254,no,no,cellular,11,may,2,-1,0,unknown,0
1,22677,26,services,single,secondary,no,512,yes,yes,unknown,5,jun,3,-1,0,unknown,0
2,10541,30,management,single,secondary,no,135,no,no,cellular,14,aug,2,-1,0,unknown,0


In [38]:
categorical_columns = ['job', 'marital', 'education', 'default', 'housing','loan','contact','month','poutcome']
continuous_columns = ['ID', 'age', 'balance', 'day', 'campaign','pdays','previous']

In [39]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

In [40]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col))
            ])
    
    final_transformers.append((cont_col, cont_transformer))

In [41]:
feats = FeatureUnion(final_transformers)

In [42]:
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline([
    ('features', feats),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [43]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('job',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='job')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='job'))])),
                                                ('marital',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='marital')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='marital'))])),
                                                ('education',
                                                 Pipeline(steps=[('selector',
                       

In [44]:
preds = pipeline.predict_proba(X_test)[:, 1]
preds[:10]

array([0.23, 0.25, 0.33, 0.33, 0.33, 0.14, 0.42, 0.19, 0.31, 0.17])

In [45]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

In [46]:
metrics_df = pd.DataFrame(columns=['model', 'thresh', 'F-Score', 'Precision', 'Recall', 'ROC AUC'])
metrics_df

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC


In [47]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.39, F-Score=0.624, Precision=0.630, Recall=0.618


In [48]:
roc_auc = roc_auc_score(y_test, preds)
roc_auc

0.7833107380222029

In [49]:
metrics_df = metrics_df.append({
    'model': 'supervised',
    'thresh': thresholds[ix],
    'F-Score': fscore[ix],
    'Precision': precision[ix],
    'Recall': recall[ix],
    'ROC AUC': roc_auc
}, ignore_index=True)

metrics_df

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,supervised,0.39,0.623645,0.629823,0.617587,0.783311


In [50]:
mod_data = X_train.copy()
mod_data['y'] = y_train
mod_data.head()

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y
5963,10542,26,student,single,secondary,no,132,no,no,cellular,1,oct,1,119,1,success,1
2412,15568,32,services,married,secondary,no,15,yes,no,cellular,15,may,1,-1,0,unknown,0
11076,16805,50,blue-collar,married,unknown,no,123,yes,no,cellular,7,aug,2,-1,0,unknown,0
2740,18582,36,technician,single,secondary,no,4136,yes,no,unknown,30,may,3,-1,0,unknown,1
11671,15751,32,technician,single,tertiary,no,1094,yes,no,cellular,11,may,7,-1,0,unknown,0


In [52]:
pos_ind = mod_data[mod_data['y'] == 1].sample(frac=1, random_state=42).index

# leave just 25% of the positives marked
perc = 0.25
pos_sample_len = int(np.ceil(perc * len(pos_ind)))

print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 748/2989 as positives and unlabeling the rest


In [53]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample, 'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    8904
 1     748
Name: class_test, dtype: int64


In [54]:
mod_data.head(5)

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y,class_test
5963,10542,26,student,single,secondary,no,132,no,no,cellular,1,oct,1,119,1,success,1,1
2412,15568,32,services,married,secondary,no,15,yes,no,cellular,15,may,1,-1,0,unknown,0,-1
11076,16805,50,blue-collar,married,unknown,no,123,yes,no,cellular,7,aug,2,-1,0,unknown,0,-1
2740,18582,36,technician,single,secondary,no,4136,yes,no,unknown,30,may,3,-1,0,unknown,1,-1
11671,15751,32,technician,single,tertiary,no,1094,yes,no,cellular,11,may,7,-1,0,unknown,0,-1


In [55]:
mod_data = mod_data.sample(frac=1, random_state=42)


data_N = mod_data[mod_data['class_test'] == -1]
data_P = mod_data[mod_data['class_test'] == 1]

neg_sample = data_N[:data_P.shape[0]]
pos_sample = data_P.copy()

print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1, random_state=42)

(748, 18) (748, 18)


In [56]:
sample_train.loc[sample_train['class_test'] == -1, 'class_test'] = 0

pipeline = Pipeline([
    ('features', feats),
    ('classifier', RandomForestClassifier(random_state=42)),
])


pipeline.fit(sample_train.drop(columns=['class_test', 'y']), 
             sample_train['class_test'])

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('job',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='job')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='job'))])),
                                                ('marital',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='marital')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='marital'))])),
                                                ('education',
                                                 Pipeline(steps=[('selector',
                       

In [57]:
preds = pipeline.predict_proba(X_test)[:, 1]
preds[:10]

array([0.55, 0.44, 0.73, 0.51, 0.52, 0.46, 0.48, 0.31, 0.44, 0.46])

In [58]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.47, F-Score=0.573, Precision=0.474, Recall=0.722


In [59]:
roc_auc = roc_auc_score(y_test, preds)
roc_auc

0.7437198729184925

In [60]:
metrics_df = metrics_df.append({
    'model': 'pu-learning',
    'thresh': thresholds[ix],
    'F-Score': fscore[ix],
    'Precision': precision[ix],
    'Recall': recall[ix],
    'ROC AUC': roc_auc
}, ignore_index=True)

In [61]:
metrics_df

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,supervised,0.39,0.623645,0.629823,0.617587,0.783311
1,pu-learning,0.47,0.572587,0.474462,0.721881,0.74372


In [62]:
from tqdm import tqdm

metrics_df = pd.DataFrame(columns=['frac', 'thresh', 'F-Score', 'Precision', 'Recall', 'ROC AUC'])


for frac in tqdm(np.linspace(0.1, 1, 9)):
    mod_data = X_train.copy()
    mod_data['y'] = y_train
    mod_data.head()

    # get the indices of the positives samples
    pos_ind = mod_data[mod_data['y'] == 1].sample(frac=1, random_state=42).index

    pos_sample_len = int(np.ceil(frac * len(pos_ind)))
    pos_sample = pos_ind[:pos_sample_len]
    
    mod_data['class_test'] = -1
    mod_data.loc[pos_sample,'class_test'] = 1
    
    mod_data = mod_data.sample(frac=1, random_state=42)


    data_N = mod_data[mod_data['class_test'] == -1]
    data_P = mod_data[mod_data['class_test'] == 1]

    neg_sample = data_N[:data_P.shape[0]]
    pos_sample = data_P.copy()

    sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1, random_state=42)

    sample_train.loc[sample_train['class_test'] == -1, 'class_test'] = 0

    pipeline = Pipeline([
        ('features', feats),
        ('classifier', RandomForestClassifier(random_state=42)),
    ])

    pipeline.fit(sample_train.drop(columns=['class_test','y']), 
                 sample_train['class_test'])
    
    # наши прогнозы для тестовой выборки
    preds = pipeline.predict_proba(X_test)[:, 1]
    preds[:10]

    precision, recall, thresholds = precision_recall_curve(y_test, preds)

    fscore = (2 * precision * recall) / (precision + recall)
    # locate the index of the largest f score
    ix = np.argmax(fscore)
    roc_auc = roc_auc_score(y_test, preds)

    metrics_df = metrics_df.append({
        'frac': frac,
        'thresh': thresholds[ix],
        'F-Score': fscore[ix],
        'Precision': precision[ix],
        'Recall': recall[ix],
        'ROC AUC': roc_auc
    }, ignore_index=True)

metrics_df

100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:19<00:00,  2.17s/it]


Unnamed: 0,frac,thresh,F-Score,Precision,Recall,ROC AUC
0,0.1,0.49,0.551461,0.470716,0.665644,0.722112
1,0.2125,0.46,0.570193,0.463259,0.741309,0.738642
2,0.325,0.51,0.584601,0.532326,0.648262,0.747993
3,0.4375,0.49,0.576389,0.500754,0.678937,0.754981
4,0.55,0.49,0.600606,0.52063,0.709611,0.766288
5,0.6625,0.5,0.600645,0.546521,0.666667,0.769547
6,0.775,0.99,,0.0,0.0,0.768814
7,0.8875,0.53,0.611514,0.580349,0.646217,0.776513
8,1.0,0.56,0.621926,0.623203,0.620654,0.78048
