### Homework № 6

In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import catboost as catb

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

In [2]:
# Read the semicolon-delimited bank dataset and preview its first three rows.
data = pd.read_csv("bank-full.csv", delimiter=";")
data.head(n=3)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no


In [3]:
# Relative frequency of each value in the last column (the target `y`).
data.iloc[:, -1].value_counts(normalize=True)

no     0.883015
yes    0.116985
Name: y, dtype: float64

In [4]:
# Encode the string target in place: 'no' -> 0, 'yes' -> 1.
# Re-running the cell is harmless because already-encoded rows match neither label.
for raw_label, encoded in (('no', 0), ('yes', 1)):
    data.loc[data['y'] == raw_label, 'y'] = encoded

#### Feature engineering

In [6]:
# Balance relative to the client's age.
data['balance_to_age'] = data['balance'].div(data['age'])

In [7]:
# Balance relative to the `duration` column.
# A zero `duration` would make the ratio +/-inf (or NaN for a zero balance), so zero
# denominators are masked with NaN first: the feature stays finite and the affected
# rows are marked as missing instead of carrying infinities into the model.
data['balance_to_duration'] = data['balance'] / data['duration'].replace(0, np.nan)

In [8]:
# Average balance per education level, attached to every row as a new feature.
# NOTE(review): the averages are computed on the full dataset, i.e. before the
# train/test split further down.
mean_balance_at_education_level = (
    data.groupby('education', as_index=False)['balance']
        .mean()
        .rename(columns={'balance': 'mean_balance_at_education_level'})
)
data = pd.merge(data, mean_balance_at_education_level, how='left', on='education')

mean_balance_at_education_level

Unnamed: 0,education,mean_balance_at_education_level
0,primary,1250.949934
1,secondary,1154.880786
2,tertiary,1758.416435
3,unknown,1526.754443


In [9]:
# Move the target `y` to the last position, keeping the other columns in order.
feature_columns = [col for col in data.columns if col != 'y']
data = data[feature_columns + ['y']]
data.head(3)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,balance_to_age,balance_to_duration,mean_balance_at_education_level,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,36.948276,8.210728,1758.416435,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,0.659091,0.192053,1154.880786,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,0.060606,0.026316,1154.880786,0


#### Train-test split

In [23]:
# Separate features from the target and hold out 20% of the rows for testing.
x_data = data.drop('y', axis=1)
y_data = data['y'].astype(int)  # target is cast to an integer dtype

x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=43,
)

In [24]:
# Column dtypes and non-null counts of the frame after feature engineering.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45211 entries, 0 to 45210
Data columns (total 20 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   age                              45211 non-null  int64  
 1   job                              45211 non-null  object 
 2   marital                          45211 non-null  object 
 3   education                        45211 non-null  object 
 4   default                          45211 non-null  object 
 5   balance                          45211 non-null  int64  
 6   housing                          45211 non-null  object 
 7   loan                             45211 non-null  object 
 8   contact                          45211 non-null  object 
 9   day                              45211 non-null  int64  
 10  month                            45211 non-null  object 
 11  duration                         45211 non-null  int64  
 12  campaign          

In [25]:
# Columns that are passed to CatBoost as categorical features.
categorical_columns = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'poutcome',
]

In [26]:
%%time
# Использую балансировку по весам классов - задаётся пропорционально 1 к 8
model = catb.CatBoostClassifier(iterations=20, thread_count=2, silent=True, random_state=43)
model.fit(x_train, y_train, categorical_columns)

y_predict = model.predict(x_test)

Wall time: 1.21 s


##### Проверяем качество

In [27]:
def evaluate_results(y_test, y_predict, print_flag=True):
    """Compute F1, ROC AUC, recall and precision for binary predictions.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    y_predict : array-like
        Predictions, passed unchanged to every metric. When these are hard class
        labels, the ROC AUC value reduces to the mean of recall and specificity
        rather than a ranking-based AUC.
    print_flag : bool, default True
        If true, print the four metrics as percentages.

    Returns
    -------
    list of float
        ``[f1, roc_auc, recall, precision]`` as fractions. The list is returned in
        both modes, so a caller can print and keep the values with a single call
        (previously the printing mode returned ``None``).
    """
    f1 = f1_score(y_test, y_predict)
    roc = roc_auc_score(y_test, y_predict)
    rec = recall_score(y_test, y_predict, average='binary')
    prc = precision_score(y_test, y_predict, average='binary')

    if print_flag:
        print('Classification results:')
        print("f1: %.2f%%" % (f1 * 100.0))
        print("roc: %.2f%%" % (roc * 100.0))
        print("recall: %.2f%%" % (rec * 100.0))
        print("precision: %.2f%%" % (prc * 100.0))

    return [f1, roc, rec, prc]
    
# Print the baseline metrics, then start the comparison list with the same values.
evaluate_results(y_test, y_predict)
metrics = [evaluate_results(y_test, y_predict, print_flag=False)]

Classification results:
f1: 53.50%
roc: 70.97%
recall: 45.05%
precision: 65.84%


#### PU learning (алгоритм - SPY)

In [144]:
# Simulate a PU (positive / unlabeled) dataset: only a share of the true positives
# keeps its label; every other row is treated as unlabeled later on.
PU_SEED = 43                   # fixed seed so the same positives are kept on every run
LABELED_POSITIVE_SHARE = 0.25  # fraction of true positives that stay labeled

mod_data = data.copy()
# positional indices of the truly positive rows (the target is the last column)
pos_ind = np.where(mod_data.iloc[:, -1].values == 1)[0]
# shuffle with a seeded generator instead of the unseeded global RNG
np.random.default_rng(PU_SEED).shuffle(pos_ind)
pos_sample_len = int(np.ceil(LABELED_POSITIVE_SHARE * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 1323/5289 as positives and unlabeling the rest


##### Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [145]:
# New target for the PU setting: 1 = labeled positive (P), -1 = unlabeled (U).
mod_data['class_test'] = -1
# `pos_sample` holds positional row indices (it comes from np.where), so assign by
# position; label-based .loc only gives the same rows while the index is 0..n-1.
mod_data.iloc[pos_sample, mod_data.columns.get_loc('class_test')] = 1
print('target variable:\n', mod_data['class_test'].value_counts())

target variable:
 -1    43888
 1     1323
Name: class_test, dtype: int64


In [146]:
# Preview the first rows, including the new `class_test` column.
mod_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,duration,campaign,pdays,previous,poutcome,balance_to_age,balance_to_duration,mean_balance_at_education_level,y,class_test
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,...,261,1,-1,0,unknown,36.948276,8.210728,1758.416435,0,-1
1,44,technician,single,secondary,no,29,yes,no,unknown,5,...,151,1,-1,0,unknown,0.659091,0.192053,1154.880786,0,-1
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,...,76,1,-1,0,unknown,0.060606,0.026316,1154.880786,0,-1
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,...,92,1,-1,0,unknown,32.042553,16.369565,1526.754443,0,-1
4,33,unknown,single,unknown,no,1,no,no,unknown,5,...,198,1,-1,0,unknown,0.030303,0.005051,1526.754443,0,-1


In [147]:
# Dtypes and non-null counts after adding `class_test`.
mod_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45211 entries, 0 to 45210
Data columns (total 21 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   age                              45211 non-null  int64  
 1   job                              45211 non-null  object 
 2   marital                          45211 non-null  object 
 3   education                        45211 non-null  object 
 4   default                          45211 non-null  object 
 5   balance                          45211 non-null  int64  
 6   housing                          45211 non-null  object 
 7   loan                             45211 non-null  object 
 8   contact                          45211 non-null  object 
 9   day                              45211 non-null  int64  
 10  month                            45211 non-null  object 
 11  duration                         45211 non-null  int64  
 12  campaign          

In [175]:
def downcast_dtypes(df):
    """Downcast 64-bit numeric columns of ``df`` to 32-bit, in place.

    ``float64`` columns become ``float32`` and ``int64`` columns become ``int32``.
    If a ``y`` column is present, it is afterwards cast with ``astype(int)``, so a
    target stored with a non-integer dtype (e.g. ``object``) ends up as integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    # Decide which columns to convert before touching the frame
    float_cols = [c for c in df.columns if df[c].dtype == "float64"]
    int_cols = [c for c in df.columns if df[c].dtype == "int64"]

    # Column-by-column assignment is a no-op when a list is empty
    for col in float_cols:
        df[col] = df[col].astype(np.float32)
    for col in int_cols:
        df[col] = df[col].astype(np.int32)

    # The target cast is optional: frames without a `y` column used to raise KeyError
    if 'y' in df.columns:
        df['y'] = df['y'].astype(int)

    return df

In [176]:
# Shrink the 64-bit numeric columns; the helper returns the frame it was given.
mod_data = downcast_dtypes(mod_data)

In [177]:
# Re-check the dtypes after downcasting.
mod_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45211 entries, 25652 to 25365
Data columns (total 21 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   age                              45211 non-null  int16  
 1   job                              45211 non-null  object 
 2   marital                          45211 non-null  object 
 3   education                        45211 non-null  object 
 4   default                          45211 non-null  object 
 5   balance                          45211 non-null  int16  
 6   housing                          45211 non-null  object 
 7   loan                             45211 non-null  object 
 8   contact                          45211 non-null  object 
 9   day                              45211 non-null  int16  
 10  month                            45211 non-null  object 
 11  duration                         45211 non-null  int16  
 12  campaign      

In [178]:
# Split the frame into plain arrays; `y` and `class_test` are its last two columns.
x_data = mod_data.drop(columns=['y', 'class_test']).values  # features only
y_labeled = mod_data['class_test'].values                   # PU label: 1 = P, -1 = U
y_positive = mod_data['y'].values                           # original ground-truth class

####  Random negative sampling

In [179]:
# Random negative sampling: pair the labeled positives with an equally sized random
# draw from the unlabeled rows; the remaining unlabeled rows become the test set.
NEG_SAMPLING_SEED = 21  # fixed seed so the draw is the same on every fresh run

# Shuffle the whole frame so the head of the unlabeled rows is a random sample
mod_data = mod_data.sample(frac=1, random_state=NEG_SAMPLING_SEED)

pos_sample = mod_data[mod_data['class_test'] == 1]
unlabeled = mod_data[mod_data['class_test'] == -1]
n_pos = len(pos_sample)

neg_sample = unlabeled.iloc[:n_pos]   # as many pseudo-negatives as labeled positives
sample_test = unlabeled.iloc[n_pos:]  # remaining unlabeled rows
print(neg_sample.shape, pos_sample.shape)  # both samples have the same size
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1, random_state=NEG_SAMPLING_SEED)

(1323, 21) (1323, 21)


In [180]:
%%time
model = catb.CatBoostClassifier(iterations=20, thread_count=2, silent=True, random_state=21)
model.fit(sample_train.iloc[:,:-2], sample_train.iloc[:,-2], categorical_columns)

y_predict = model.predict(sample_test.iloc[:,:-2])
evaluate_results(sample_test.iloc[:,-2], y_predict)
metrics.append(evaluate_results(sample_test.iloc[:,-2], y_predict, print_flag=False))

Classification results:
f1: 46.50%
roc: 84.88%
recall: 88.96%
precision: 31.48%
Wall time: 951 ms


In [181]:
# Side-by-side table of both experiments; expects exactly two rows in `metrics`.
metric_names = ['fscore', 'roc_auc', 'recall', 'precision']
model_names = ['CatBoostClassifier', 'CatBoostClassifier + random negative sampling']
metrics_comparison_df = pd.DataFrame(data=np.array(metrics), index=model_names, columns=metric_names)
metrics_comparison_df

Unnamed: 0,fscore,roc_auc,recall,precision
CatBoostClassifier,0.534975,0.709724,0.450518,0.658402
CatBoostClassifier + random negative sampling,0.465037,0.848808,0.889555,0.314805


Оценка модели по roc_auc выросла. При этом random negative sampling сильно улучшил Recall за счёт снижения Precision. Соотношение этих метрик можно отрегулировать подбором порога, но для нашей задачи высокий Recall — это как раз то, что нужно: модель хорошо определяет класс 1 или «похожих» на него. С другой стороны, упавший Precision говорит о том, что взамен стало больше FP. Учитывая, что мы решаем задачу PU и в данных присутствуют неразмеченные объекты класса 1, возможно, это и есть искомые «look-alike» под видом FP. Стоит помнить, что метрики двух моделей посчитаны на разных тестовых выборках (`x_test` и `sample_test`), поэтому сравнение носит приблизительный характер.