In [49]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [50]:

data=pd.read_csv('crx.data')
df = data.copy()
df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [51]:
df.shape

(690, 16)

In [52]:
num_cols=['A2','A3','A8','A14','A15']
category_cols=list(set(data.columns)-set(num_cols+['A16']))

In [53]:
class miss_to_nan(TransformerMixin):
    def __init__(self):
        pass
        
    def fit(self, x, y=None):
        return self    
    
    def transform(self, df, y=None):
        df = df.replace('?', np.nan)
        return df

In [54]:
class num_to_str(TransformerMixin):
    def __init__(self, *, cols=[]):
        self.cols = cols
        
    def fit(self, x, y=None):
        return self    
    
    def transform(self, df, y=None):
        df[self.cols] = df[self.cols].astype(str)
        return df

class str_To_num(TransformerMixin):
    def __init__(self, *, cols=[]):
        self.cols = cols
        
    def fit(self, x, y=None):
        return self    
    
    def transform(self, df, y=None):
        df[self.cols] = df[self.cols].astype(float)
        return df

In [55]:

imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
z_score = StandardScaler()
num_preprocessing=Pipeline(steps=[
    ('missingvalue_median_imputer', imp_median),
    ('StandardScaler', z_score)
])

In [56]:
imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_preprocessing=Pipeline(steps=[
    ('missingvalue_mode_imputer', imp_mode),
    ('onehotEncoder', onehot)
])

In [57]:
feature_transform = ColumnTransformer([
    ('number_preprocessing', num_preprocessing, num_cols),
    ('category_preprocessing', cat_preprocessing, category_cols)
    ])

In [58]:
param_grid = {
    'penalty': [None, 'l2'],
    'C': [0.001, 0.01, 0.1, 1],
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
}

# สร้าง GridSearchCV object
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)

In [59]:
test = Pipeline(steps=[('miss_to_nan', miss_to_nan()),
                       ('num_to_str', num_to_str()),
                       ('str_to_num', str_To_num(cols=num_cols)),
                       ('feature_transform', feature_transform),
                       ('logistic_model', grid_search)
                       ])

In [60]:
test

In [61]:
from sklearn.model_selection import train_test_split
X = data.drop('A16', axis=1)
y = data['A16'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [62]:
# data = test.fit_transform(df)
# print(data.shape)
# pd.DataFrame(data).head()

In [63]:
test.fit(X_train, y_train)
y_pred = test.predict(X_train)
y_pred

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Fur

array(['-', '+', '+', '+', '+', '+', '-', '-', '-', '-', '+', '-', '-',
       '-', '-', '+', '-', '+', '+', '+', '-', '-', '+', '-', '-', '-',
       '-', '-', '-', '+', '-', '+', '-', '-', '+', '-', '+', '-', '-',
       '-', '-', '-', '-', '+', '+', '-', '+', '-', '-', '+', '-', '+',
       '-', '+', '-', '+', '+', '+', '+', '+', '+', '+', '+', '+', '-',
       '+', '+', '-', '+', '+', '-', '-', '+', '-', '+', '+', '-', '-',
       '+', '-', '-', '-', '+', '+', '+', '-', '+', '-', '-', '-', '-',
       '+', '-', '-', '+', '-', '-', '+', '+', '-', '+', '-', '-', '+',
       '-', '-', '-', '+', '-', '+', '+', '-', '+', '-', '+', '-', '-',
       '-', '-', '+', '+', '-', '+', '+', '-', '+', '-', '+', '-', '-',
       '-', '+', '+', '-', '+', '+', '-', '+', '-', '-', '+', '+', '-',
       '+', '-', '-', '+', '-', '-', '-', '-', '+', '-', '+', '-', '+',
       '-', '+', '-', '+', '+', '-', '-', '-', '-', '-', '+', '-', '-',
       '+', '+', '-', '+', '+', '-', '-', '-', '-', '-', '+', '+

In [64]:
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.compose import make_column_transformer
# ColumnTransformer([
#     (StandardScaler(), ['numerical_column']),
#     (OneHotEncoder(), ['categorical_column'])])