In [407]:
# coding=utf-8
import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin
from collections import Counter
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

In [511]:
class CustomTransformer(TransformerMixin):
    """Smoothed target encoder for integer-coded categorical features.

    ``fit`` stores one number per ``(column index, category code)`` pair and
    ``transform`` replaces every code by that number.  For a category that
    covers ``n`` training rows the stored value is::

        (sum(y[rows]) * n + sum(y) * alpha) / (n + alpha)

    NOTE(review): the statistic is built from target *sums*, not means, so
    the encoded values scale with the size of the training set.  It is kept
    unchanged because the scores recorded in this notebook refer to it --
    confirm that this is intended.

    NOTE(review): the class only inherits ``TransformerMixin``; it has no
    ``get_params``/``set_params`` of its own.

    Parameters
    ----------
    alpha : number
        Smoothing weight; the larger it is, the more every category is
        pulled towards the overall target sum.

    Attributes
    ----------
    cnt : collections.Counter
        Maps ``(column index, category code)`` to the encoded value.
    prior : number
        Overall target sum seen in ``fit``; used for codes without an entry
        in ``cnt``.
    """

    def __init__(self, alpha):
        self.alpha = alpha

    def fit(self, X, y):
        """Learn the encoded value of every category of every column.

        Parameters
        ----------
        X : array-like, 2-D
            Category codes, one column per feature.
        y : array-like, 1-D
            Numeric target aligned with the rows of ``X``.

        Returns
        -------
        self
        """
        X = np.asarray(X)
        y = np.asarray(y)
        total = np.sum(y)

        # Value of a category without any training row: what the smoothing
        # formula yields for n = 0.
        self.prior = total
        self.cnt = Counter()
        for i in range(X.shape[1]):
            column = X[:, i]
            # Visit only the codes that actually occur, so no empty bucket
            # needs special handling.
            for j in np.unique(column):
                indices = column == j
                n = np.sum(indices)
                self.cnt[(i, j)] = (
                    (np.sum(y[indices]) * n + total * self.alpha)
                    / (n + self.alpha)
                )
        return self

    def transform(self, X):
        """Replace every category code by the value learned in ``fit``.

        Codes that were not seen during ``fit`` are mapped to ``prior``.

        Parameters
        ----------
        X : array-like, 2-D
            Category codes with the same column layout as in ``fit``.

        Returns
        -------
        numpy.ndarray
            Float array of the same shape as ``X``.
        """
        X = np.asarray(X)
        # Always write into a float array: copying X would keep its dtype,
        # and an integer X would silently truncate the encoded values.
        X_new = X.astype(float)
        for i in range(X.shape[1]):
            column = X[:, i]
            for j in np.unique(column):
                # .get() instead of [] -- Counter's [] returns 0 for a
                # missing key, which is not a meaningful encoding.
                X_new[column == j, i] = self.cnt.get((i, j), self.prior)
        return X_new

In [416]:
# Load the raw table.  header=None: the first line of the file is already a
# data row, so the columns get integer labels.
df = pd.read_csv('mushrooms.csv', header=None)

In [417]:
# Preview the first rows: every cell is a single-letter category code.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [418]:
# (number of rows, number of columns) of the loaded table.
df.shape

(8124, 23)

In [419]:
# Column 0 of the frame is the class label; the remaining columns are the
# categorical features.
X = np.array(df.loc[:, 1:])
y = np.array(df.loc[:, 0])

# Overwrite every feature column with integer codes.  One encoder object is
# refit for each column, so after the loop it only remembers the last one.
label_encoder = LabelEncoder()
column_count = X.shape[1]
for i in range(column_count):
    codes = label_encoder.fit_transform(X[:, i])
    X[:, i] = codes

# Binary target: 1 where the label is 'p', 0 otherwise.
y = np.equal(y, 'p').astype(int)
X_data = X
y_data = y

In [420]:
# Sanity check for one feature: the encoded column must have exactly as many
# distinct values as the raw column it came from (features are shifted by one
# in df because column 0 is the label).
col_num = 5
encoded_levels = np.unique(X_data[:, col_num])
raw_levels = np.unique(df.loc[:, col_num + 1].values)
len(encoded_levels) == len(raw_levels)

True

In [563]:
# Keyword arguments passed on to LogisticRegression.
lr_params = dict(
    penalty='l2',
    C=5000,  # alternative tried: 0.007
    random_state=778,
)
# Possible solvers: {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}
# Currently not set: 'class_weight': 'balanced'

In [574]:
# Two-step model: target-encode the categorical codes, then fit a logistic
# regression on the encoded values.
encoder = CustomTransformer(alpha=600)
classifier = LogisticRegression(**lr_params)
pipeline = make_pipeline(encoder, classifier)

In [575]:
# Mean accuracy over 3 cross-validation folds, rounded to four decimals.
cv_scores = cross_val_score(pipeline, X_data, y_data, cv=3)
print("Score:", round(cv_scores.mean(), 4))

Score: 0.9589


In [547]:
# Temporary best: 0.9696 (l1, C=100, rs=778; correct smoothed sum(target))
# Temporary best: 0.9554 (l2, C=0.007; sum(target))
# Temporary best: 0.958 (l2, C=0.007; smoothed sum(target), alpha=10) - bad