In [5]:
import pandas as pd
import numpy as np

from sklearn.compose import make_column_transformer, ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import RobustScaler as SklearnRobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer as SklearnSimpleImputer
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder
from sklearn.compose import ColumnTransformer as SklearnColumnTransformer


In [2]:
pip install pandas


Collecting pandas
  Using cached pandas-1.0.5-cp38-cp38-manylinux1_x86_64.whl (10.0 MB)
Collecting pytz>=2017.2
  Downloading pytz-2020.1-py2.py3-none-any.whl (510 kB)
[K     |████████████████████████████████| 510 kB 872 kB/s eta 0:00:01
[?25hCollecting numpy>=1.13.3
  Using cached numpy-1.19.0-cp38-cp38-manylinux2010_x86_64.whl (14.6 MB)
Installing collected packages: pytz, numpy, pandas
Successfully installed numpy-1.19.0 pandas-1.0.5 pytz-2020.1
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install sklearn


Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting scikit-learn
  Using cached scikit_learn-0.23.1-cp38-cp38-manylinux1_x86_64.whl (6.7 MB)
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-2.1.0-py3-none-any.whl (12 kB)
Collecting joblib>=0.11
  Using cached joblib-0.15.1-py3-none-any.whl (298 kB)
Collecting scipy>=0.19.1
  Using cached scipy-1.5.0-cp38-cp38-manylinux1_x86_64.whl (25.7 MB)
Using legacy setup.py install for sklearn, since package 'wheel' is not installed.
Installing collected packages: threadpoolctl, joblib, scipy, scikit-learn, sklearn
    Running setup.py install for sklearn ... [?25ldone
[?25hSuccessfully installed joblib-0.15.1 scikit-learn-0.23.1 scipy-1.5.0 sklearn-0.0 threadpoolctl-2.1.0
Note: you may need to restart the kernel to use updated packages.


In [6]:
class RobustScaler(SklearnRobustScaler):
    def fit_transform(self, X, y=None):
        return pd.DataFrame(super().fit_transform(X), columns=X.columns, index=X.index)

class SimpleImputer(SklearnSimpleImputer):
    def fit_transform(self, X, y=None):
        return pd.DataFrame(super().fit_transform(X), columns=X.columns, index=X.index)

class OneHotEncoder(SklearnOneHotEncoder):
    def transform(self, X):
        sparse_matrix = super(OneHotEncoder, self).transform(X)
        new_columns = self.get_new_columns(X=X)
        return pd.DataFrame(sparse_matrix.toarray(), columns=new_columns, index=X.index)

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

    def get_new_columns(self, X):
        new_columns = []
        for i, column in enumerate(X.columns):
            j = 0
            while j < len(self.categories_[i]):
                new_columns.append(f'{column}_{self.categories_[i][j]}')
                j += 1
        return new_columns

class Preprocessor:
    def __init__(self, dataset):
        self.dataset = dataset
        self.numeric_features = dataset.select_dtypes(np.number)
        self.categorical_features = dataset.select_dtypes(exclude=np.number) #object, bool, category



train_data = pd.read_csv('./data/marketing_train.csv')


########################################

numeric_features = train_data.select_dtypes(np.number)
categorical_features = train_data.select_dtypes(exclude=np.number) #object, bool, category

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder()), #handle_unknown='ignore'
])

preprocessor = ColumnTransformer(
    transformers = [
        ('nums', numeric_transformer, numeric_features.columns),
        ('cats', categorical_transformer, categorical_features.columns)
    ] #n_jobs=-1
)

########################################

preprocessor_fit = preprocessor.fit_transform(train_data)
### 얘를 밑으로 내리면 error. why? 
### train_data 대신에 다른걸 넣으면 다른 결과?!

print(preprocessor_fit)

nums_names = preprocessor.named_transformers_['nums'].fit_transform(numeric_features).columns
cats_names = preprocessor.named_transformers_['cats'].fit_transform(categorical_features).columns
feat_names = np.concatenate([nums_names, cats_names])

preprocessor_df = pd.DataFrame(preprocessor_fit, columns=feat_names)


pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])


target = 'insurance_subscribe'

features_without_target = preprocessor_df.drop(target, axis=1)
target_feature = preprocessor_df[target]


train_features, test_features, train_results, test_results = train_test_split(features_without_target, target_feature)


model = LogisticRegression()

model.fit(train_features, train_results) ##이거 stop 이런거 나오는데,, 뭐지? error는 아닌 것 같은데,,
print(model.score(train_features, train_results))
print(model.score(test_features, test_results))



test_data = pd.read_csv('./data/marketing_test.csv')


# pred.csv : ID, Predicted value
# report.csv : Precision, Recall, Accuracy, F1



[[ 0.13333333  0.29580574 -0.07692308 ...  0.          0.
   1.        ]
 [ 0.66666667  0.71228845  1.07692308 ...  0.          0.
   1.        ]
 [ 0.2         2.49816041  1.15384615 ...  0.          0.
   1.        ]
 ...
 [-0.33333333  0.6401766  -0.07692308 ...  0.          0.
   1.        ]
 [-0.4        -0.20750552 -0.69230769 ...  0.          0.
   1.        ]
 [-0.06666667 -0.35908756  0.38461538 ...  0.          0.
   1.        ]]
0.9024183440241834
0.9002433090024331


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
train_features


Unnamed: 0,age,balance,last_contact_day,contact_duration,campaign,gap_between_campaigns,previous,job_admin.,job_blue-collar,job_entrepreneur,...,last_contact_month_jun,last_contact_month_mar,last_contact_month_may,last_contact_month_nov,last_contact_month_oct,last_contact_month_sep,outcome_previous_campaign_failure,outcome_previous_campaign_other,outcome_previous_campaign_success,outcome_previous_campaign_unknown
6688,-0.200000,-0.234731,-0.615385,2.572093,-0.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
22302,0.866667,1.327447,-1.000000,-0.567442,3.5,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
31979,0.066667,1.621781,-0.076923,-0.400000,-0.5,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4469,-0.533333,1.632818,-0.615385,1.074419,0.0,149.0,9.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
30937,0.066667,-0.275938,1.076923,1.316279,-0.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16126,1.000000,0.807211,-0.846154,-0.441860,2.5,197.0,3.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
29913,-0.600000,-0.176600,0.538462,-0.502326,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2180,-0.266667,0.180280,-1.000000,-0.237209,-0.5,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2849,1.000000,-0.337013,-0.307692,-0.055814,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [35]:
class Preprocessor:
    def __init__(self, dataset):
        self.dataset = dataset
        self.numeric_features = dataset.select_dtypes(np.number)
        self.categorical_features = dataset.select_dtypes(exclude=np.number) #object, bool, category
        
    def transform_numeric_features(self, numeric_features):
        return Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', RobustScaler())
        ])
    
    def transform_categorical_features(self, categorical_features):
        return Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder()), #handle_unknown='ignore'
        ])

    def preprocessor(self, dataset):
        return ColumnTransformer(
            transformers = [
                ('nums', self.transform_numeric_features, self.numeric_features.columns),
                ('cats', self.transform_categorical_features, self.categorical_features.columns)
            ]
        ) #n_jobs=-1

In [36]:
pre = Preprocessor(train_data)

In [37]:
print(pre)

<__main__.Preprocessor object at 0x7f5cd4515e80>


In [20]:
train_data

Unnamed: 0,age,job,marital_status,education,default,balance,housing_loan,personal_loan,contact_channel,last_contact_day,last_contact_month,contact_duration,campaign,gap_between_campaigns,previous,outcome_previous_campaign,insurance_subscribe
0,41,blue-collar,married,primary,no,849,yes,no,unknown,15,may,72,1,-1,0,unknown,0
1,49,technician,married,primary,no,1415,yes,no,cellular,30,jul,269,2,-1,0,unknown,0
2,42,admin.,married,secondary,no,3842,no,no,cellular,31,jul,130,4,-1,0,unknown,0
3,37,management,single,tertiary,no,-119,yes,no,unknown,11,jun,375,11,-1,0,unknown,0
4,56,blue-collar,married,primary,no,3498,no,no,cellular,15,apr,264,2,-1,0,unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36163,44,housemaid,single,primary,no,1059,no,no,unknown,18,jun,2093,1,-1,0,unknown,1
36164,23,student,single,tertiary,no,508,no,no,cellular,8,sep,210,1,92,1,failure,0
36165,34,technician,divorced,tertiary,no,1317,yes,no,cellular,15,may,239,1,-1,0,unknown,0
36166,33,retired,married,secondary,no,165,no,no,unknown,7,may,111,1,-1,0,unknown,0


In [22]:
a = preprocessor.fit_transform(test_data)

In [23]:
a


array([[ 0.06666667,  0.09427609,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.53333333,  2.38683128, -0.53846154, ...,  0.        ,
         0.        ,  1.        ],
       [-0.93333333,  0.06285073,  0.30769231, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.86666667, -0.33969323,  0.76923077, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.06666667,  0.24616536,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.13333333,  1.31313131,  0.30769231, ...,  0.        ,
         0.        ,  1.        ]])

In [24]:
b = pd.DataFrame(a, columns=feat_names)

In [25]:
b

Unnamed: 0,age,balance,last_contact_day,contact_duration,campaign,gap_between_campaigns,previous,insurance_subscribe,job_admin.,job_blue-collar,...,last_contact_month_jun,last_contact_month_mar,last_contact_month_may,last_contact_month_nov,last_contact_month_oct,last_contact_month_sep,outcome_previous_campaign_failure,outcome_previous_campaign_other,outcome_previous_campaign_success,outcome_previous_campaign_unknown
0,0.066667,0.094276,0.000000,0.046083,-0.5,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.533333,2.386831,-0.538462,-0.456221,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.933333,0.062851,0.307692,0.202765,-0.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.200000,0.986906,-0.538462,0.594470,-0.5,337.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.133333,-0.177329,0.384615,-0.281106,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9038,0.533333,1.074448,-0.615385,-0.096774,-0.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9039,-0.466667,-0.177329,-0.076923,2.350230,0.5,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9040,0.866667,-0.339693,0.769231,0.110599,-0.5,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9041,0.066667,0.246165,0.000000,-0.050691,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [27]:
target = 'insurance_subscribe'

b_features = b.drop(target, axis=1)
b_target = b[target]

In [28]:
b_features

Unnamed: 0,age,balance,last_contact_day,contact_duration,campaign,gap_between_campaigns,previous,job_admin.,job_blue-collar,job_entrepreneur,...,last_contact_month_jun,last_contact_month_mar,last_contact_month_may,last_contact_month_nov,last_contact_month_oct,last_contact_month_sep,outcome_previous_campaign_failure,outcome_previous_campaign_other,outcome_previous_campaign_success,outcome_previous_campaign_unknown
0,0.066667,0.094276,0.000000,0.046083,-0.5,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.533333,2.386831,-0.538462,-0.456221,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.933333,0.062851,0.307692,0.202765,-0.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.200000,0.986906,-0.538462,0.594470,-0.5,337.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.133333,-0.177329,0.384615,-0.281106,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9038,0.533333,1.074448,-0.615385,-0.096774,-0.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9039,-0.466667,-0.177329,-0.076923,2.350230,0.5,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9040,0.866667,-0.339693,0.769231,0.110599,-0.5,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9041,0.066667,0.246165,0.000000,-0.050691,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [29]:
b_target

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
9038    0.0
9039    1.0
9040    0.0
9041    0.0
9042    0.0
Name: insurance_subscribe, Length: 9043, dtype: float64

In [32]:
d = model.predict(b_features)

In [33]:
d

array([0., 0., 0., ..., 0., 0., 0.])

In [34]:
e = model.predict(test_data)

ValueError: could not convert string to float: 'blue-collar'