In [25]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [16]:
data = pd.read_csv(r".\DATA\_prepared_data.csv")
data.head(3)

Unnamed: 0,SWE Store Key,Region,Distrib,Office,FFDSL,TSE/MTDE,Уровень Торг Региона 1,Уровень Торг Региона 2,Filial Name,Filial Ship To,...,Chain. Chain Sub Tier MWC,Channel Name 2018,Outlet Type 2018,Trade Structure,From Dc,Segment MWC. Segment Name,Cluster MWC,Ship-to TO BE (загружено RSS),Latitude,Longitude
0,1000201031,Siberia,ООО Гермес,MWC ООО Гермес (Новокузнецк),MWC FFDL Kuzbass (Vetrova Marina),MWC TSE Novokuznetsk (Sandler Kseniya),TL MWC: Север Кузбасса (Мокроусов Илья),Агент МарсРигли-08 (Киселевск-2)(Погребных А.)...,MWC ООО Гермес (Новокузнецк),10340352.0,...,Other,Traditional Independent Trade,BTC,TT,False,C1,Cluster 2,10340352,54.00231,86.543748
1,100050000000002,North-West,ЗАО Денди,MWC Денди ЗАО,MWC FFDL SPb (Brus Roman),MWC TSE St.Petersburg 1 (Baranov Igor),TL: 4323 - Svetlichniys (DISTR: Денди) (Svetli...,SR: 98069 - Кукко_d (Кукко Наталья_d),MWC Денди ЗАО,10407751.0,...,Other,Traditional Independent Trade,BTC,TT,True,C0,Cluster 3,10407751,59.838353,30.387724
2,100050000000031,North-West,ЗАО Денди,MWC Денди ЗАО,MWC FFDL SPb (Brus Roman),MWC TSE St.Petersburg 1 (Baranov Igor),TL: 4322 - Виртуальная территория_d (Vacant),Виртуальная территория_d (Vacant),MWC Денди ЗАО,10407751.0,...,Other,Traditional Independent Trade,BTC,TT,False,C0,Cluster 3,10407751,59.904175,30.420953


In [17]:
# X cols & Y cols

Y_COLS = ["Ship-to TO BE (загружено RSS)"]
X_COLS = [col for col in data.columns if col not in Y_COLS]

In [18]:
X = data[X_COLS]
y = data[Y_COLS]

In [19]:
cat_features = X.select_dtypes(include=['object']).columns  # Categorical
num_features = X.select_dtypes(exclude=['object']).columns  # Numeric

cat_features, num_features

(Index(['Region', 'Distrib', 'Office', 'FFDSL', 'TSE/MTDE',
        'Уровень Торг Региона 1', 'Уровень Торг Региона 2', 'Filial Name',
        'Chain Type', 'Chain Name', 'Chain. Chain Tier MWC',
        'Chain. Chain Sub Tier MWC', 'Channel Name 2018', 'Outlet Type 2018',
        'Trade Structure', 'Segment MWC. Segment Name', 'Cluster MWC'],
       dtype='object'),
 Index(['SWE Store Key', 'Filial Ship To', 'Chain Id', 'From Dc', 'Latitude',
        'Longitude'],
       dtype='object'))

In [20]:
for col in cat_features:
    X[col]=X[col].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

In [22]:
[i.shape for i in [X_train, X_test, y_train, y_test]]

[(142699, 23), (61158, 23), (142699, 1), (61158, 1)]

## LogisticRegression pipeline

In [32]:
%%time

cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

preprocessing = ColumnTransformer(
    [('cat', cat_pipe, cat_features),
     ('num', num_pipe, num_features)
    ])

lr = Pipeline([
    ('preprocess', preprocessing),
    ('classifier', LogisticRegression(multi_class='multinomial', random_state=42, n_jobs=-1))
])

# model = LogisticRegression(multi_class='multinomial',random_state=20,n_jobs=-1)

lr.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


Wall time: 1min 31s


Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('cat',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='missing',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                        

In [33]:
print("LR train accuracy: {0:.4f}".format(lr.score(X_train, y_train)))
print("LR test accuracy: {0:.4f}".format(lr.score(X_test, y_test)))

LR train accuracy: 0.9947
LR test accuracy: 0.9932


In [38]:
lr.decision_function(X_train)

array([[ 1.90042076, -0.57630778,  0.9453525 , ..., -0.72768851,
        -0.40769153,  0.50202909],
       [-1.93662165,  0.20236167, -0.63000141, ...,  0.05138858,
         0.34527157,  1.37647703],
       [ 0.66371066, -0.06391397, -1.17175594, ...,  2.67490107,
        -0.38956966,  0.05838454],
       ...,
       [-1.25921285, -0.24590526, -0.0908738 , ..., -0.39229752,
         1.77008011,  1.60177833],
       [-0.76357569, -1.6706218 , -0.48094414, ..., -1.26631821,
         3.11128282,  0.86779036],
       [-1.38467313, -1.12065354, -1.80216934, ..., -2.14843025,
        -0.8456568 , 15.11637678]])