In [None]:
import pandas as pd
import numpy as np
import math

In [None]:
#MeanImputer
class MeanImputer():
    """Fill missing values with per-column means learned in ``fit``.

    Accepts either a ``numpy.ndarray`` or a pandas object. One-dimensional
    input (a 1-D array or a ``pandas.Series``) is treated as a single column.

    Parameters
    ----------
    copy : bool, default True
        When True, ``transform`` works on a copy of its input; when False the
        input object is modified where possible.
    """

    def __init__(self, copy=True):
        self.copy = copy

    def __is_numpy(self, X):
        """Return True when X is a numpy.ndarray (as opposed to a pandas object)."""
        return isinstance(X, np.ndarray)

    def __as_2d(self, X):
        """Return X as a 2-D container; 1-D input becomes a single column."""
        if len(X.shape) != 1:
            return X
        if self.__is_numpy(X):
            return X.reshape(-1, 1)
        # pandas.Series has no reshape(); a one-column frame keeps its name.
        return X.to_frame()

    def fit(self, X, y=None):
        """Learn the mean of every column, ignoring missing values.

        Parameters
        ----------
        X : numpy.ndarray or pandas.DataFrame / pandas.Series
        y : ignored, present for scikit-learn style call compatibility.

        Returns
        -------
        self
        """
        self._encoder_dict = {}
        X = self.__as_2d(X)

        if self.__is_numpy(X):
            # numpy columns are keyed by position
            for col in range(X.shape[1]):
                self._encoder_dict[col] = np.nanmean(X[:, col])
        else:
            # pandas columns are keyed by label; Series.mean skips NaN
            for col in X.columns:
                self._encoder_dict[col] = X[col].mean()

        return self

    def transform(self, X):
        """Replace missing values in X with the means stored by ``fit``.

        Only NaN / null entries are replaced; infinite values are left as-is.

        Returns
        -------
        The imputed data as a 2-D object (1-D input comes back with one column).
        """
        if self.copy:
            X = X.copy()
        X = self.__as_2d(X)

        if self.__is_numpy(X):
            for col in range(X.shape[1]):
                column = X[:, col]
                # np.where touches NaN only; np.nan_to_num would also
                # overwrite +/-inf with very large finite numbers.
                X[:, col] = np.where(np.isnan(column),
                                     self._encoder_dict[col],
                                     column)
        else:
            for col in X.columns:
                X[col] = np.where(X[col].isnull(),
                                  self._encoder_dict[col],
                                  X[col])
        return X

In [None]:
# Toy training frame with one missing value per column.
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
toy_train = pd.DataFrame(
    {'Balance': [8.3, np.nan, 10.2, 3.1],
     'Age': [23, 29, 36, np.nan]})
toy_train

In [None]:
# Toy test frame with one missing value per column.
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
toy_test = pd.DataFrame(
    {'Balance': [10.4, np.nan, 22.5, 1.1],
     'Age': [13, 19, 66, np.nan]})
toy_test

In [None]:
#Using mean()
'''
for col in toy_train.columns:
    toy_train[col].fillna(toy_train[col].mean(), inplace=True)
    toy_test[col].fillna(toy_train[col].mean(), inplace=True)
print('обучающий датафрейм')
print(toy_train)
print('')
print('тестовый датафрейм')
print(toy_test)
'''

In [None]:
# Learn the per-column means from the training frame, then fill its gaps.
imp = MeanImputer()
imp.fit(toy_train)
# NOTE: toy_train is rebound to the imputed result, so the frame with the
# original NaN values is no longer available after this cell.
toy_train = imp.transform(toy_train)
toy_train

In [None]:
# Fill the test gaps with the means stored by the earlier fit call;
# nothing is refitted on the test frame here.
toy_test = imp.transform(toy_test)
toy_test

In [None]:
# Rebuild the toy training frame (np.nan instead of the np.NaN alias,
# which NumPy 2.0 removed).
toy_train = pd.DataFrame(
    {'Balance': [8.3, np.nan, 10.2, 3.1],
     'Age': [23, 29, 36, np.nan]})
# create an instance with copying disabled
imp = MeanImputer(copy=False)
# fit on the single 'Age' column
imp.fit(toy_train[['Age']])
# apply the fitted imputer and write the result back into the frame
toy_train['Age'] = imp.transform(toy_train[['Age']])
toy_train

In [None]:
# Same toy data as plain numpy arrays (np.nan instead of the np.NaN alias,
# which NumPy 2.0 removed).
np_toy_train = np.array(pd.DataFrame(
    {'Balance': [8.3, np.nan, 10.2, 3.1],
     'Age': [23, 29, 36, np.nan]}))
np_toy_train

np_toy_test = np.array(pd.DataFrame(
    {'Balance': [10.4, np.nan, 22.5, 1.1],
     'Age': [13, 19, 66, np.nan]}))
np_toy_test

In [None]:
# Refit the existing imputer on the numpy array; for ndarray input the learned
# means are keyed by column position.
imp.fit(np_toy_train)
# NOTE(review): if `imp` is still the copy=False instance created in an
# earlier cell, transform writes into np_toy_train itself — confirm cell order.
np_toy_train = imp.transform(np_toy_train)
np_toy_train

In [None]:
np_toy_test = imp.transform(np_toy_test)
np_toy_test

In [None]:
#KNN Model code;

class KNN_Estimator():
    """Brute-force k-nearest-neighbours estimator using Euclidean distance.

    Parameters
    ----------
    k : int, default 5
        Number of nearest training samples consulted per prediction.
    task : {'classification', 'regression'}, default 'classification'
        'classification' predicts the most frequent neighbour label,
        'regression' predicts the mean of the neighbour targets.

    Attributes
    ----------
    k_nearest_neighbors_ : list of numpy.ndarray
        Targets of the k nearest training samples for every row passed to the
        most recent ``predict`` call (one array per row).
    """

    def __init__(self, k=5, task='classification'):
        self.k = k
        self.task = task
        self.k_nearest_neighbors_ = []

    def _euclidean_distance(self, x1, x2):
        """Return the Euclidean distance between two equal-length vectors."""
        distance = 0
        for i in range(len(x1)):
            distance += pow((x1[i] - x2[i]), 2)
        return math.sqrt(distance)

    def _vote(self, neighbor_labels):
        """Return the most frequent label (labels are cast to non-negative ints)."""
        counts = np.bincount(neighbor_labels.astype('int'))
        return counts.argmax()

    def fit(self, X, y):
        """Memorise the training samples and targets; returns self."""
        self.X_memorized = X
        self.y_memorized = y
        return self

    def predict(self, X):
        """Predict a target for every row of X.

        Raises
        ------
        ValueError
            If ``task`` is neither 'classification' nor 'regression'.
        """
        if self.task == 'classification':
            aggregate = self._vote
        elif self.task == 'regression':
            aggregate = np.mean
        else:
            raise ValueError(
                "task must be 'classification' or 'regression', "
                "got {!r}".format(self.task))

        # Start from an empty list on every call: indexing a list that keeps
        # growing across calls would pair row i with neighbours found for
        # row i of an earlier predict() call.
        self.k_nearest_neighbors_ = []
        y_pred = np.empty(X.shape[0])
        for row, test_sample in enumerate(X):
            distances = [self._euclidean_distance(test_sample, x)
                         for x in self.X_memorized]
            idx = np.argsort(distances)[:self.k]
            neighbors = np.array([self.y_memorized[j] for j in idx])
            self.k_nearest_neighbors_.append(neighbors)
            y_pred[row] = aggregate(neighbors)
        return y_pred
    

In [None]:
#Classification
X_trn = np.array([[0.1, 0.2, 0.3],
                                [0.7, 0.5, 0.2],
                                [0.1, 0.2, 0.2],
                                [0.9, 0.7, 3.5],
                                [0.2, 0.4, 1.4],
                                [0.4, 0.1, 0.5]])

y_trn = np.array([1, 0, 1, 0, 0, 1])

X_tst = np.array([[0.1, 0.7, 1.1],
                                [0.5, 0.3, 2.8],
                                [0.1, 0.1, 0.2],
                                [0.9, 0.7, 1.5]])

In [None]:
knn = KNN_Estimator(k=3, task='classification')
knn.fit(X_trn, y_trn)

pred = knn.predict(X_tst)
pred

In [None]:
knn.k_nearest_neighbors_

In [None]:
y_trn = np.array([1.2, 0.5, 1.4, 2.2, 3.5, 5.9])

In [None]:
knn = KNN_Estimator(k=3, task='regression')
knn.fit(X_trn, y_trn)

In [None]:
pred = knn.predict(X_tst)
pred

In [None]:
knn.k_nearest_neighbors_

In [None]:
a = [2, 4, 7, 9, 14, 20, 21, 22]
b = [3, 5, 8, 10, 14, 20, 21, 30]

def seven(a,b):
    """Print every element of ``a`` that is a multiple of 7 and whose
    same-position element in ``b`` is also a multiple of 7.

    Elements are paired by position; iteration stops at the end of the
    shorter sequence, so sequences of different length do not raise.
    """
    for left, right in zip(a, b):
        if left % 7 == 0 and right % 7 == 0:
            print(left)

In [None]:
seven(a,b)

In [None]:
def find(lst1, lst2):
    """Return the set of values divisible by 7 that occur in both inputs."""
    multiples_first = {value for value in lst1 if value % 7 == 0}
    multiples_second = {value for value in lst2 if value % 7 == 0}
    return multiples_first.intersection(multiples_second)

find(a,b)

In [None]:
a = ["a", "b", "c", "d", "e", "f"]
b = [1, 0, 9, 3, 2, 0]

In [None]:
def get_sorted(lst1, lst2):
    """Return the items of ``lst1`` ordered by their paired keys in ``lst2``.

    Items are paired by position; items whose keys are equal keep their
    original relative order.
    """
    ordered_pairs = sorted(zip(lst1, lst2), key=lambda pair: pair[1])
    return [item for item, _ in ordered_pairs]

In [None]:
get_sorted(a,b)

In [None]:
z = zip(a,b)
z = list(z)
z

In [None]:
#Pipeline

import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [None]:
os.getcwd()

In [None]:
#Load dataset
from sklearn.datasets import fetch_covtype
import pandas as pd
#https://scikit-learn.ru/7-2-real-world-datasets/
#https://scikit-learn.org/stable/auto_examples/miscellaneous/plot_outlier_detection_bench.html#sphx-glr-auto-examples-miscellaneous-plot-outlier-detection-bench-py

# Load the covertype data as pandas objects (features frame, target series).
X, y = fetch_covtype(return_X_y=True, as_frame=True)
# Keep only rows labelled 2 or 4; adding the two boolean masks acts as an
# element-wise OR.
s = (y == 2) + (y == 4)
X = X.loc[s]
y = y.loc[s]
# Binary target: 1 where the original label was 4, 0 where it was 2.
y = (y != 2).astype(np.int32)

# Optional stratified 5% subsample (disabled).
#X, _, y, _ = train_test_split(X, y, train_size=0.05, stratify=y, random_state=42)

# Stratified 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

In [None]:
#Scaling
# Fit the scaler on the training split only, so test rows do not influence
# the learned scaling parameters.
standardscaler = StandardScaler()
standardscaler.fit(X_train)
# Apply the train-fitted scaler to both splits; the full X is not transformed.
X_train_standardscaled = standardscaler.transform(X_train)  
X_test_standardscaled = standardscaler.transform(X_test)

In [None]:
#Fitting
logreg = LogisticRegression(solver='lbfgs', max_iter=400)
logreg.fit(X_train_standardscaled, y_train)
print("Score train: {:.3f}".format(
    logreg.score(X_train_standardscaled, y_train)))
# The test score must come from the held-out split, not the training data.
print("Score test: {:.3f}".format(
    logreg.score(X_test_standardscaled, y_test)))

In [None]:
#Getting score
train_score = logreg.score(X_train_standardscaled, y_train)
test_score = logreg.score(X_test_standardscaled, y_test)
print(f"Score train: {train_score:.3f}")
print(f"Score test: {test_score:.3f}")

In [None]:
print("Score train: %.3f" % train_score)
print("Score test: %.3f" % test_score)

In [None]:
#Predicting
logreg_pred = logreg.predict(X_test_standardscaled)
logreg_pred

In [None]:
#Predicting probability
logreg_pred = logreg.predict_proba(X_test_standardscaled)
logreg_pred

In [None]:
#Constant
intercept = np.round(logreg.intercept_, 3)
intercept

In [None]:
#Coef
coef = np.round(logreg.coef_, 3)
coef

In [None]:
#Classes
classes = np.round(logreg.classes_, 3)
classes

In [None]:
logreg.coef_[0]

In [None]:
#Coef for features
for i,feature in zip(logreg.coef_[0], X_train.columns):
    print(feature, i)

In [None]:
#Bootstrap and out-of-bag
from IPython.display import display
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor

In [None]:
X

In [None]:
#Toy regression

# Three predictors plus 'Elevation', which becomes the regression target.
var_lst = ['Aspect', 'Slope',
'Horizontal_Distance_To_Hydrology', 'Elevation']
# First 10 rows only; column selection followed by head() yields a new frame.
toy_data = X[var_lst].head(10)
# pop() removes 'Elevation' from toy_data and returns it as the target series.
toy_labels = toy_data.pop('Elevation')

In [None]:
toy_data.shape

In [None]:
#Bootstrap - values can be repeated
# Positional row numbers 0..n-1 to sample from.
sample_indices = np.arange(toy_data.shape[0])
rng = np.random.RandomState(42)
# Draw n positions with replacement, so some rows repeat and others are absent.
bootstrap_indices = rng.choice(sample_indices,
                               size=sample_indices.shape[0],
                               replace=True)
display(bootstrap_indices.shape)
# iloc: the sampled values are positions, not index labels.
toy_data_boot = toy_data.iloc[bootstrap_indices]
toy_labels_boot = toy_labels.iloc[bootstrap_indices]
display(toy_data_boot)
display(toy_labels_boot)

In [None]:
toy_labels_boot.nunique()

In [None]:
#Out-of-bag:
# Rows whose index label never appears in the bootstrap sample.
# Membership is tested on index labels, so this relies on the labels
# being unique.
toy_data_out_boot = toy_data[~toy_data.index.isin(
toy_data_boot.index)]
toy_labels_out_boot = toy_labels[~toy_labels.index.isin(
toy_data_boot.index)]
display(toy_data_out_boot)
display(toy_labels_out_boot)

In [None]:
#bootstrap and out-of-bag function
def generate_bootstrap(rng, X, y, verbose=True, iteration=None):
    """Draw one bootstrap sample and the matching out-of-bag rows.

    Parameters
    ----------
    rng : numpy.random.RandomState
        Random generator used to draw the row positions.
    X, y : pandas objects sharing the same index
        Features and target; rows are picked by position (``iloc``) and the
        out-of-bag part is selected by index label, so labels should be unique.
    verbose : bool, default True
        Print the index labels that landed in each part.
    iteration : optional
        Label printed in the verbose header. When omitted, a module-level
        variable ``i`` (e.g. a loop counter leaked by the calling cell) is
        used if it exists; otherwise the header line is skipped.

    Returns
    -------
    X_boot, y_boot, X_out_boot, y_out_boot
    """
    sample_indices = np.arange(X.shape[0])
    bootstrap_indices = rng.choice(sample_indices,
                                   size=sample_indices.shape[0],
                                   replace=True)
    X_boot = X.iloc[bootstrap_indices]
    y_boot = y.iloc[bootstrap_indices]
    # Out-of-bag rows are those whose index label was never drawn.
    in_bag_labels = X_boot.index
    X_out_boot = X[~X.index.isin(in_bag_labels)]
    y_out_boot = y[~y.index.isin(in_bag_labels)]
    if verbose:
        if iteration is None:
            # Legacy fallback: earlier callers relied on a global loop
            # counter named `i`; do not fail when it is not defined.
            iteration = globals().get("i")
        if iteration is not None:
            print(f"{iteration}-iteration")
        print(f"indexes in bootstrap: {X_boot.index.tolist()}")
        print(f"indexes in out-of-bag: {X_out_boot.index.tolist()}\n")
    return X_boot, y_boot, X_out_boot, y_out_boot

In [None]:
# Three bootstrap rounds on the toy data: fit a tree on the in-bag rows and
# score it (R^2 for a regressor) on the out-of-bag rows.
rng = np.random.RandomState(42)
# NOTE(review): standardscaler is created here but not used in this cell.
standardscaler = StandardScaler()
tree = DecisionTreeRegressor(random_state=42)
test_score_lst = []
for i in range(1, 4):
    X_boot, y_boot, X_out_boot, y_out_boot = generate_bootstrap(
    rng, toy_data, toy_labels)
    tree.fit(X_boot, y_boot)
    test_score = tree.score(X_out_boot, y_out_boot)
    test_score_lst.append(test_score)

In [None]:
print(test_score_lst)

In [None]:
mean_r2 = sum(test_score_lst) / len(test_score_lst)
print("Mean Value R2: %.3f" % mean_r2)

In [None]:
#R2 in list
import statistics
mean_r2 = statistics.fmean(test_score_lst)
print("Mean Value R2: %.3f" % mean_r2)

In [None]:

# 15 bootstrap rounds on the training split: scale, fit logistic regression on
# the in-bag rows and record the accuracy on the out-of-bag rows.
rng = np.random.RandomState(42)

standardscaler = StandardScaler()

logreg = LogisticRegression(solver='lbfgs', max_iter=200)

test_score_lst = []

for i in range(15):
    X_boot, y_boot, X_out_boot, y_out_boot = generate_bootstrap(
        rng, X_train, y_train, verbose=False)
    # The scaler is refitted on each in-bag sample and then applied to both
    # parts, so out-of-bag rows never influence the scaling parameters.
    standardscaler.fit(X_boot)
    X_boot_scaled = standardscaler.transform(X_boot)
    X_out_boot_scaled = standardscaler.transform(X_out_boot)
    logreg.fit(X_boot_scaled, y_boot)
    # Out-of-bag accuracy for this round.
    test_score = logreg.score(
        X_out_boot_scaled, y_out_boot)
    test_score_lst.append(test_score)
    

In [None]:
mean_acc = statistics.fmean(test_score_lst)
print("Mean_acc: %.3f" % mean_acc)

In [None]:
test_score_lst

In [None]:
#bias_variance

from mlxtend.data import boston_housing_data
# Load the dataset once (a second, discarded call only repeated the load).
# NOTE: this rebinds X, y and the train/test names used by earlier cells.
X, y = boston_housing_data()
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=123)

#generating bootstrap samples based on a training sample
def _draw_bootstrap_sample(rng, X, y):
    sample_indices = np.arange(X.shape[0])
    bootstrap_indices = rng.choice(sample_indices,
                                    size=sample_indices.shape[0],
                                    replace=True)
    return X[bootstrap_indices], y[bootstrap_indices]

In [None]:
#function that calculates the average expected value of the function of losses, averaged bias, averaged variance

def bias_variance_decomp(estimator, X_train, y_train, X_test, y_test,
                        num_rounds=200, random_seed=None):
    """Estimate expected squared loss, bias and variance via bootstrapping.

    Original author: Sebastian Raschka https://github.com/rasbt

    Parameters
    ----------
    estimator : object with ``fit(X, y)`` returning the fitted estimator and
        ``predict(X)``.
    X_train, y_train : arrays indexable with an integer array (rows are
        resampled with replacement in every round).
    X_test, y_test : evaluation data; ``y_test`` needs ``shape`` and ``size``.
    num_rounds : int, default 200
        Number of bootstrap rounds.
    random_seed : int or None
        Seed for the bootstrap sampler.

    Returns
    -------
    avg_expected_loss, avg_bias, avg_var
        Mean squared error averaged over rounds, squared error of the
        round-averaged prediction, and the average squared deviation of the
        per-round predictions from that averaged prediction.
    """
    rng = np.random.RandomState(random_seed)
    # Predictions are real-valued for squared-error loss; an integer buffer
    # would silently truncate them and distort all three estimates.
    all_pred = np.zeros((num_rounds, y_test.shape[0]), dtype=np.float64)

    for i in range(num_rounds):
        X_boot, y_boot = _draw_bootstrap_sample(rng, X_train, y_train)
        all_pred[i] = estimator.fit(X_boot, y_boot).predict(X_test)

    # Mean squared error of each round (one row per round), then averaged.
    avg_expected_loss = np.apply_along_axis(
        lambda x:
        ((x - y_test) ** 2).mean(),
        axis=1,
        arr=all_pred).mean()

    # "Main" prediction: average over rounds for every test sample.
    main_predictions = np.mean(all_pred, axis=0)
    avg_bias = np.sum((main_predictions - y_test) ** 2) / y_test.size
    avg_var = np.sum((main_predictions - all_pred) ** 2) / all_pred.size
    return avg_expected_loss, avg_bias, avg_var

In [None]:
#max depth tree
# No max_depth is passed, so the tree depth is not limited by this call.
tree = DecisionTreeRegressor(random_state=123)
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
    tree, X_train, y_train, X_test, y_test,
    random_seed=123)

# Format string reused by later cells when printing the expected loss.
ttl = "avg_expected_loss: %.3f"
print(ttl % avg_expected_loss)
print("avg_bias: %.3f" % avg_bias)
print("avg_var: %.3f" % avg_var)

In [None]:
#depth=1 tree
tree2 = DecisionTreeRegressor(random_state=123, max_depth=1)
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
    tree2, X_train, y_train, X_test, y_test,
    random_seed=123)

print(ttl % avg_expected_loss)
print("avg_bias: %.3f" % avg_bias)
print("avg_var: %.3f" % avg_var)

In [None]:
#what is bias_variance_decomp
# Tiny hand-made data set (3 train rows, 3 test rows) used to walk through
# the decomposition step by step.
# NOTE: this rebinds X_train / X_test / y_train / y_test from earlier cells.
X_train = np.array([[29.1, 19000.28, 15],
                    [67.3, 48800.81, 45],
                    [77.9, 89800.55, 188]])
X_test = np.array([[11.9, 89900.28, 199],
                    [37.8, 10600.82, 95],
                    [77.2, 99700.22, 87]])
y_train = np.array([22.6, 89.5, 17.3])
y_test = np.array([12.4, 96.9, 107.9])

rng = np.random.RandomState(123)

# One row of predictions per bootstrap round (3 rounds here).
# NOTE(review): dtype=int drops the fractional part of stored predictions;
# the hand calculation in a later cell uses whole-number predictions.
all_pred = np.zeros((3, y_test.shape[0]), dtype=int)
all_pred

In [None]:
for i in range(3):
    X_boot, y_boot = _draw_bootstrap_sample(rng, X_train, y_train)
    pred = tree.fit(X_boot, y_boot).predict(X_test)
    all_pred[i] = pred
    
display(all_pred)
y_test

In [None]:
mse_first_iter = (((17 - 12.4)**2) + ((89 - 96.9)**2) +
                 ((89 - 107.9)**2)) / 3
mse_first_iter

In [None]:
mse = np.apply_along_axis(
    lambda x:
    ((x - y_test)**2).mean(),
    axis=1,
    arr=all_pred)
mse

In [None]:
avg_expected_loss = np.apply_along_axis(
    lambda x:
    ((x - y_test)**2).mean(),
    axis=1,
    arr=all_pred).mean()
avg_expected_loss

In [None]:
main_predictions = np.mean(all_pred, axis=0)
main_predictions

In [None]:
avg_bias = np.sum((main_predictions - y_test)**2) / y_test.size
avg_bias

In [None]:
avg_var = np.sum((main_predictions - all_pred)**2) / all_pred.size
avg_var

In [None]:
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
    tree, X_train, y_train, X_test, y_test, num_rounds=3,
    random_seed=123)

print(ttl % avg_expected_loss)
print("avg_bias: %.3f" % avg_bias)
print("avg_var: %.3f" % avg_var)

In [None]:
#MissingIndicator SimpleImputer
from sklearn.impute import MissingIndicator, SimpleImputer

data = pd.read_csv('./HPData.csv')

data.head()

In [None]:
# 70/30 split of the house-price data; 'SalePrice' is the target and is
# removed from the feature frames.
train, test, y_train, y_test = train_test_split(
    data.drop('SalePrice', axis=1),
    data['SalePrice'],
    test_size=.3,
    random_state=100)

In [None]:
train.info()

In [None]:
train['LotFrontage'].isnull().sum()

In [None]:
train.isnull().sum()

In [None]:
miss_ind = MissingIndicator()
miss_ind.fit(train[['LotFrontage']])

In [None]:
train['miss_ind_LotFrontage'] = miss_ind.transform(train[['LotFrontage']])
test['miss_ind_LotFrontage'] = miss_ind.transform(test[['LotFrontage']])

In [None]:
train[train["miss_ind_LotFrontage"] == True]

In [None]:
train["LotFrontage"].median()

In [None]:
#For numerical
# Learn the median on the training split only, then fill both splits with it.
simp = SimpleImputer(strategy='median')
simp.fit(train[['LotFrontage']])

# The training column has to be imputed as well; transforming the test
# column twice leaves the training gaps in place.
train['LotFrontage'] = simp.transform(train[['LotFrontage']])
test['LotFrontage'] = simp.transform(test[['LotFrontage']])

In [None]:
train['LotFrontage'].isnull().sum()

In [None]:
test['LotFrontage'].isnull().sum()

In [None]:
train[train['LotFrontage'] == 69]

In [None]:
train['Alley']

In [None]:
#For categorical
cat_cols = ['Alley']
# 'most_frequent' fills gaps with the most common value seen during fit.
simp2 = SimpleImputer(strategy='most_frequent')
simp2.fit(train[cat_cols])

# Both splits are filled with the value learned from the training split.
test[cat_cols] = simp2.transform(test[cat_cols])
train[cat_cols] = simp2.transform(train[cat_cols])

In [None]:
train['Alley'].isnull().sum()

In [None]:
#Dummy with OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
# Dense output; categories not seen during fit are ignored at transform time.
# NOTE(review): newer scikit-learn releases name this argument
# `sparse_output` — confirm against the installed version.
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
ohe_train = train[['MSZoning']].copy()
ohe.fit(ohe_train)
ohe_train_transformed = ohe.transform(ohe_train)
# Show the first 100 encoded rows.
ohe_train_transformed[:100]

In [None]:
# NOTE(review): despite its name, ohe_test is built from `train`, not `test`;
# presumably only a frame to inject an unseen value into — confirm intent.
ohe_test = train[['MSZoning']].copy()
# Put a category the encoder was not fitted on into the first row.
ohe_test.iloc[0, 0] = 'new_category'
ohe_test_transformed = ohe.transform(ohe_test)
ohe_test_transformed[:3]

In [None]:
#min_frequency
train = pd.DataFrame(
    {'City': ['MSK', 'MSK', 'MSK', 'SPB',
            'EKB', 'EKB', 'EKB',
            'EKB', 'EKB']})
train

In [None]:
train['City'].value_counts()

In [None]:
# Categories seen fewer than 3 times during fit are treated as infrequent;
# with handle_unknown='infrequent_if_exist' unseen categories are presumably
# mapped to the same infrequent group — confirm against the encoder docs.
ohe = OneHotEncoder(
    min_frequency=3,
    sparse=False,
    handle_unknown='infrequent_if_exist')
ohe.fit(train)
# Lists, per feature, the categories that were grouped as infrequent.
ohe.infrequent_categories_

In [None]:
ohe.transform(train)

In [None]:
test = pd.DataFrame(
    {'City': ['NSK', 'MSK', 'NSK', 'MSK',
        'SPB', 'EKB', 'SPB',
        'EKB', 'SPB']})
test

In [None]:
#NSK = SPB = 0-0-1
ohe.transform(test)

In [None]:
train

In [None]:
# One-hot encode with pandas; note that `train` is rebound to the encoded frame.
train = pd.get_dummies(train)
# head() must be called: a bare `train.head` displays the bound method
# instead of the first rows.
train.head()

In [None]:
#CRS format - compressed row storage
from scipy.sparse import csr_matrix
A = np.array([[0, 0, 0, 0],
             [5, 8, 0, 0],
             [0, 0, 3, 0],
             [0, 6, 0, 4]])


In [None]:
S = csr_matrix(A)
print(S)

In [None]:
D = S.todense()
print(D)

In [None]:
#Example https://www.kaggle.com/competitions/cat-in-the-dat-ii/overview
data = pd.read_csv('./CFECtrain.csv')
data['target'].value_counts()
data.shape

In [None]:
# For the listed nominal columns, keep only values that occur at least 100
# times and replace every other value with the label 'Other'.
for col in ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']:
    # dropna=False: missing values are counted as their own category.
    abs_freq = data[col].value_counts(dropna=False)
    data[col] = np.where(
        data[col].isin(abs_freq[abs_freq >= 100].index.tolist()),
        data[col], 'Other')

In [None]:
data.columns

In [None]:
data.drop('id', axis=1, inplace=True)
labels = data.pop('target').values
data.info()

In [None]:
# Add a '<column>_isnan' companion column holding 'T' where the source value
# is missing and 'F' otherwise.
for col in data.columns:
    data[col + '_isnan'] = np.where(data[col].isnull(), 'T', 'F')

In [None]:
data

In [None]:
# Split 'ord_5' into its first and second character as separate features.
data['ord_5a'] = data['ord_5'].str[0]
data['ord_5b'] = data['ord_5'].str[1]

In [None]:
data['ord_5b']

In [144]:
# Snapshot of all feature column names; later cells pass it to get_dummies.
columns = [col for col in data.columns]
# Stratified 70/30 split on the target labels with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.3,
    stratify=labels,
    random_state=42)

In [145]:
#Dummy no sparse
# Dense one-hot encoding of every column; drop_first removes one level per
# feature.
# NOTE(review): train and test are encoded independently, so their dummy
# columns are not guaranteed to match by name or order — verify alignment.
X_tr_non_sparse = pd.get_dummies(
    X_train,
    columns=columns,
    drop_first=True,
    sparse=False)
X_tst_non_sparse = pd.get_dummies(
    X_test,
    columns=columns,
    drop_first=True,
    sparse=False)

In [147]:
#Dummy with sparse
# Same encoding as the dense variant, but requesting sparse-backed dummy
# columns.
# NOTE(review): train and test are encoded independently, so their dummy
# columns are not guaranteed to match by name or order — verify alignment.
X_tr_sparse = pd.get_dummies(
    X_train,
    columns=columns,
    drop_first=True,
    sparse=True)
X_tst_sparse = pd.get_dummies(
    X_test,
    columns=columns,
    drop_first=True,
    sparse=True)

In [148]:
print('non_sparse:', X_tr_non_sparse.shape, X_tst_non_sparse.shape)
print('sparse:', X_tr_sparse.shape, X_tst_sparse.shape)

non_sparse: (420000, 5026) (180000, 5026)
sparse: (420000, 5026) (180000, 5026)


In [None]:
#XX_tr_sparse = X_tr_non_sparse.iloc[:,:2500]
#XX_tst_sparse = X_tst_non_sparse.iloc[:,:2500]

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
'''
%time
logreg = LogisticRegression(solver='liblinear').fit(
    X_tr_sparse, y_train)
print("AUC на обучающей выборке: {:.3f}".format(
    roc_auc_score(y_train, logreg.predict_proba(
        X_tr_sparse)[:, 1])))
print("AUC на тестовой выборке: {:.3f}".format(
    roc_auc_score(y_test, logreg.predict_proba(
        X_tst_sparse)[:, 1])))
'''

In [None]:
%time
'''
logreg = LogisticRegression(solver='liblinear').fit(
    X_tr_non_sparse, y_train)
print("AUC на обучающей выборке: {:.3f}".format(
    roc_auc_score(y_train, logreg.predict_proba(
        X_tr_non_sparse)[:, 1])))
print("AUC на тестовой выборке: {:.3f}".format(
    roc_auc_score(y_test, logreg.predict_proba(
        X_tst_non_sparse)[:, 1])))
'''

In [None]:
display(logreg.penalty)
display(logreg.l1_ratio)

In [122]:
#PIPELINE
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

In [123]:
pipe = Pipeline([('scaler', StandardScaler()),
                 ('logreg', LogisticRegression(
                    solver='lbfgs', max_iter=200))])

In [124]:
#It gives automatic name
pipe_quick = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='lbfgs', max_iter=200))

In [125]:
pipe.steps

[('scaler', StandardScaler()), ('logreg', LogisticRegression(max_iter=200))]

In [126]:
pipe_quick.steps

[('standardscaler', StandardScaler()),
 ('logisticregression', LogisticRegression(max_iter=200))]

In [127]:
pipe.verbose

False

In [152]:
#To avoid memory error
# Keep at most the first 100000 rows of each split; features and targets are
# sliced by position with the same bounds.
XXX_tr_sparse = X_tr_sparse.iloc[:100000,:]
XXX_tst_sparse = X_tst_sparse.iloc[:100000,:]
yyy_train = y_train[0:100000]
yyy_test = y_test[0:100000]

In [158]:
pipe.fit(XXX_tr_sparse, yyy_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [165]:
# Score the fitted pipeline on the subsampled train and test data.
print("Train_score: {:.3f}".format(
    pipe.score(XXX_tr_sparse, yyy_train)))
print("Test_score: {:.3f}".format(
    pipe.score(XXX_tst_sparse, yyy_test)))
# Pull the fitted parameters out of the pipeline step named 'logreg',
# rounded to 3 decimals for display.
intercept = np.round(pipe.named_steps['logreg'].intercept_[0], 3)
coef = np.round(pipe.named_steps['logreg'].coef_, 3)
print(f'Intercept: {intercept}')
print(f'Coef: {coef}')


Train_score: 0.838
Test_score: 0.815
Intercept: -2.075
Coef: [[-0.218 -0.071  0.21  ...  0.007 -0.016  0.005]]


In [167]:
feat_labels = XXX_tr_sparse.columns
print("Intercept:", intercept)
print("Coefs:")

# Use dedicated loop names: reusing `columns` here would overwrite the list of
# column names that the get_dummies cells pass as `columns=columns`.
for coef_value, feature in zip(coef[0], feat_labels):
    print(feature, coef_value)

Intercept: -2.075
Coefs:
bin_0_1.0 -0.218
bin_1_1.0 -0.071
bin_2_1.0 0.21
bin_3_T 0.014
bin_4_Y 0.053
nom_0_Green -0.044
nom_0_Red -0.052
nom_1_Polygon 0.094
nom_1_Square -0.034
nom_1_Star -0.087
nom_1_Trapezoid 0.15
nom_1_Triangle -0.092
nom_2_Cat -0.057
nom_2_Dog -0.055
nom_2_Hamster -0.123
nom_2_Lion 0.023
nom_2_Snake -0.064
nom_3_China -0.03
nom_3_Costa Rica 0.165
nom_3_Finland 0.059
nom_3_India 0.016
nom_3_Russia 0.21
nom_4_Oboe -0.052
nom_4_Piano -0.136
nom_4_Theremin -0.123
nom_5_0054c0c3a -0.011
nom_5_00a731d2e -0.022
nom_5_014770cf0 -0.047
nom_5_0165aa0c3 -0.055
nom_5_023ed7074 -0.008
nom_5_024efa364 -0.045
nom_5_0276cf712 -0.044
nom_5_0286dc1e1 -0.015
nom_5_0289ab250 -0.005
nom_5_028a6acde -0.021
nom_5_029d67ae5 -0.036
nom_5_0388c582c -0.026
nom_5_03c739608 -0.021
nom_5_03cbd5a22 -0.022
nom_5_03ea75c83 -0.056
nom_5_03f2a3450 -0.003
nom_5_045558e43 -0.021
nom_5_053a1f28a 0.002
nom_5_0549ab935 -0.034
nom_5_0568087f1 -0.04
nom_5_05d793448 -0.023
nom_5_05eecb19a -0.023
nom_5_0618