# Logistic Regression

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv("train.csv")

In [3]:
data.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,,Brown,0.15,40.9,15,4,2.0,4
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,0.0,1


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18834 entries, 0 to 18833
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   pet_id          18834 non-null  object 
 1   issue_date      18834 non-null  object 
 2   listing_date    18834 non-null  object 
 3   condition       17357 non-null  float64
 4   color_type      18834 non-null  object 
 5   length(m)       18834 non-null  float64
 6   height(cm)      18834 non-null  float64
 7   X1              18834 non-null  int64  
 8   X2              18834 non-null  int64  
 9   breed_category  18834 non-null  float64
 10  pet_category    18834 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 1.6+ MB


## Impute Missing Data

In [5]:
data["condition"].value_counts()

1.0    6819
0.0    6281
2.0    4257
Name: condition, dtype: int64

In [6]:
from sklearn.impute import SimpleImputer

# Used most frequent value of condition to fill in NaN entries
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(data[["condition"]])
data["condition"] = imputer.transform(data[["condition"]])

In [7]:
data.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,1.0,Brown,0.15,40.9,15,4,2.0,4
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,0.0,1


In [8]:
data["condition"].value_counts()

1.0    8296
0.0    6281
2.0    4257
Name: condition, dtype: int64

## Encode Data

### Encode Independent variable

In [9]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
data["color_type"] = label_encoder.fit_transform(data["color_type"])

In [10]:
data.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,18,0.8,7.78,13,9,0.0,1
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,53,0.72,14.19,13,9,0.0,2
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,1.0,15,0.15,40.9,15,4,2.0,4
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,53,0.62,17.82,0,1,0.0,2
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,2,0.5,11.06,18,4,0.0,1


In [11]:
from sklearn.preprocessing import OneHotEncoder

color_encoder = OneHotEncoder()
condition_encoder = OneHotEncoder()
color_type_1h = color_encoder.fit_transform(data[["color_type"]])
condition_1h = condition_encoder.fit_transform(data[["condition"]])

In [12]:
color_type_1h

<18834x56 sparse matrix of type '<class 'numpy.float64'>'
	with 18834 stored elements in Compressed Sparse Row format>

In [13]:
condition_1h

<18834x3 sparse matrix of type '<class 'numpy.float64'>'
	with 18834 stored elements in Compressed Sparse Row format>

In [14]:
features = data[['length(m)', 'height(cm)', 'X1', 'X2']].values
X = np.concatenate((features, color_type_1h.toarray(), condition_1h.toarray()), axis = 1)

In [15]:
X.shape

(18834, 63)

### Encode Independent variable (Not Needed for Logistic Regression Model)

In [16]:
y = data["breed_category"].values

In [17]:
print(y)

[0. 0. 2. ... 1. 1. 1.]


## Split Train Set and Test Set

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [20]:
def fullprint(*args, **kwargs):
  from pprint import pprint
  import numpy
  opt = numpy.get_printoptions()
  numpy.set_printoptions(threshold=numpy.inf)
  pprint(*args, **kwargs)
  numpy.set_printoptions(**opt)

In [22]:
fullprint(X_test)

array([[8.100e-01, 6.300e+00, 1.800e+01, 4.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 1.000e+00, 0.000e+00],
       [2.700e-01, 2.212e+01, 0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+

In [None]:
a = [[1, 2, 3, 4, 5],
     [1, 2, 3, 4, 5],
     [1, 2, 3, 4, 5],
     [1, 2, 3, 4, 5],
     [1, 2, 3, 4, 5]]

## Scale Data

In [19]:
from sklearn.preprocessing import StandardScaler

sc_x = StandardScaler()

# Scale the first 4 columns of features ['length(m)', 'height(cm)', 'X1', 'X2']
sc_x.fit(X_train[:,0:5])
X_train[:,0:5] = sc_x.transform(X_train[:, 0:5])

## Train Model

In [20]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, multi_class='ovr')
clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression(multi_class='ovr', random_state=0)

## Test Model

In [21]:
# Scale the first 4 columns of features ['length(m)', 'height(cm)', 'X1', 'X2']
sc_x.fit(X_test[:,0:5])
X_test[:,0:5] = sc_x.transform(X_test[:, 0:5])

In [22]:
y_pred = clf.predict(X_test)
print(np.concatenate((y_pred.reshape(-1,1), y_test.reshape(-1,1)), axis=1))

[[0. 0.]
 [1. 1.]
 [0. 1.]
 ...
 [1. 1.]
 [0. 2.]
 [2. 2.]]


In [23]:
from sklearn.metrics import confusion_matrix, f1_score
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: ")
print(cm)
print("f1_score: ", f1_score(y_test, y_pred, average='weighted'))

Confusion Matrix: 
[[1523  215   39]
 [ 139 1537    0]
 [ 166    0  148]]
f1_score:  0.8467998969028573


In [26]:
import joblib
joblib.dump(clf, "log_reg_clf.pkl")
# and later...
# my_model_loaded = joblib.load("my_model.pkl")

['log_reg_clf.pkl']