## 1. Exploratory LogReg

## Dogs

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import scipy.stats as stats
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
dogs = pd.read_csv('./petfinder_data/dogs.csv')
cats = pd.read_csv('./petfinder_data/cats.csv')

In [3]:
# Columns left out of the feature matrix (the last one is the target itself)
drops = ['Name', 'RescuerID', 'Description', 'PetID', 'AdoptionSpeed']

# Target is AdoptionSpeed; features are every remaining column, used as-is
# (no dummy encoding is performed in this cell).
y = dogs['AdoptionSpeed']
X = dogs.drop(columns=drops)

# Stratified 75/25 split so both partitions keep the target's class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

In [4]:
# Standardise the features: the scaling statistics are learned from the
# training split only and then applied to both splits.
# NOTE(review): Z_train / Z_test are not referenced again in the visible cells.
ss = StandardScaler()
ss.fit(X_train)
Z_train = ss.transform(X_train)
Z_test = ss.transform(X_test)

In [5]:
%%time
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_train = lr.score(X_train, y_train)
lr_test = lr.score(X_test, y_test)

CPU times: user 701 ms, sys: 18.2 ms, total: 719 ms
Wall time: 133 ms


In [6]:
# Report accuracy on the training and held-out splits
for split_name, split_score in (('train', lr_train), ('test', lr_test)):
    print(f'{split_name} score: {split_score}')

train score: 0.32809667673716014
test score: 0.32910628019323673


In [7]:
# Baseline: share of each AdoptionSpeed class across the whole dogs frame.
# The largest share is the accuracy of always predicting the most common class.
dogs.AdoptionSpeed.value_counts(normalize=True)

4    0.281982
2    0.267935
3    0.244223
1    0.205860
Name: AdoptionSpeed, dtype: float64

In [8]:
# Predicted classes for the held-out split; display the first 20 as a quick check
y_pred = lr.predict(X_test)
y_pred[:20]

array([4, 2, 4, 2, 2, 2, 2, 2, 2, 2, 4, 2, 4, 4, 2, 4, 2, 2, 4, 2])

In [9]:
# Confusion matrix for the held-out split (rows: true class, columns: predicted class)
confusion = confusion_matrix(y_test, y_pred)
print('Confusion Matrix\n', confusion, sep='\n')

Confusion Matrix

[[  8 228   0 105]
 [  2 327   0 115]
 [  2 283   0 119]
 [  4 253   0 210]]


In [10]:
# Overall accuracy, then precision / recall / F1 under each averaging scheme.
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, y_pred)))

averagings = ('micro', 'macro', 'weighted')
for avg in averagings:
    title = avg.capitalize()
    # zero_division=0 scores a never-predicted class as 0 (the value sklearn
    # falls back to anyway) without emitting UndefinedMetricWarning.
    scores = {
        'Precision': precision_score(y_test, y_pred, average=avg, zero_division=0),
        'Recall': recall_score(y_test, y_pred, average=avg, zero_division=0),
        'F1-score': f1_score(y_test, y_pred, average=avg, zero_division=0),
    }
    for metric_name, value in scores.items():
        print('{} {}: {:.2f}'.format(title, metric_name, value))
    # Blank line between groups, but not after the last one
    if avg != averagings[-1]:
        print()

print('\nClassification Report\n')
# target_names are assigned to the sorted class labels in order
print(classification_report(
    y_test,
    y_pred,
    target_names=['Class 1', 'Class 2', 'Class 3', 'Class 4'],
    zero_division=0,
))


Accuracy: 0.33

Micro Precision: 0.33
Micro Recall: 0.33
Micro F1-score: 0.33

Macro Precision: 0.30
Macro Recall: 0.30
Macro F1-score: 0.22

Weighted Precision: 0.29
Weighted Recall: 0.33
Weighted F1-score: 0.24

Classification Report

              precision    recall  f1-score   support

     Class 1       0.50      0.02      0.04       341
     Class 2       0.30      0.74      0.43       444
     Class 3       0.00      0.00      0.00       404
     Class 4       0.38      0.45      0.41       467

    accuracy                           0.33      1656
   macro avg       0.30      0.30      0.22      1656
weighted avg       0.29      0.33      0.24      1656



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Cats

In [11]:
# Columns left out of the feature matrix (the last one is the target itself)
drops = ['Name', 'RescuerID', 'Description', 'PetID', 'AdoptionSpeed']

# Target is AdoptionSpeed; features are every remaining column, used as-is
# (no dummy encoding is performed in this cell).
y = cats['AdoptionSpeed']
X = cats.drop(columns=drops)

# Stratified 75/25 split so both partitions keep the target's class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

In [12]:
# Standardise the features: the scaling statistics are learned from the
# training split only and then applied to both splits.
# NOTE(review): Z_train / Z_test are not referenced again in the visible cells.
ss = StandardScaler()
ss.fit(X_train)
Z_train = ss.transform(X_train)
Z_test = ss.transform(X_test)

In [13]:
%%time
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_train = lr.score(X_train, y_train)
lr_test = lr.score(X_test, y_test)

CPU times: user 738 ms, sys: 14 ms, total: 752 ms
Wall time: 127 ms


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
# Report accuracy on the training and held-out splits
for split_name, split_score in (('train', lr_train), ('test', lr_test)):
    print(f'{split_name} score: {split_score}')

train score: 0.34206971088894894
test score: 0.33630470016207453


In [15]:
# Baseline: share of each AdoptionSpeed class across the whole cats frame.
# The largest share is the accuracy of always predicting the most common class.
cats.AdoptionSpeed.value_counts(normalize=True)

1    0.284701
2    0.280648
4    0.245187
3    0.189463
Name: AdoptionSpeed, dtype: float64

In [16]:
# Predicted classes for the held-out split; display the first 20 as a quick check
y_pred = lr.predict(X_test)
y_pred[:20]

array([1, 1, 1, 2, 4, 1, 2, 1, 1, 4, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1])

In [17]:
# Confusion matrix for the held-out split (rows: true class, columns: predicted class)
confusion = confusion_matrix(y_test, y_pred)
print('Confusion Matrix\n', confusion, sep='\n')

Confusion Matrix

[[270  42   0  39]
 [254  45   0  47]
 [157  38   0  39]
 [142  61   0 100]]


In [18]:
# Confusion matrix for the held-out split (rows: true class, columns: predicted class)
confusion = confusion_matrix(y_test, y_pred)
print('Confusion Matrix\n')
print(confusion)

# Overall accuracy, then precision / recall / F1 under each averaging scheme.
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, y_pred)))

averagings = ('micro', 'macro', 'weighted')
for avg in averagings:
    title = avg.capitalize()
    # zero_division=0 scores a never-predicted class as 0 (the value sklearn
    # falls back to anyway) without emitting UndefinedMetricWarning.
    scores = {
        'Precision': precision_score(y_test, y_pred, average=avg, zero_division=0),
        'Recall': recall_score(y_test, y_pred, average=avg, zero_division=0),
        'F1-score': f1_score(y_test, y_pred, average=avg, zero_division=0),
    }
    for metric_name, value in scores.items():
        print('{} {}: {:.2f}'.format(title, metric_name, value))
    # Blank line between groups, but not after the last one
    if avg != averagings[-1]:
        print()

print('\nClassification Report\n')
# target_names are assigned to the sorted class labels in order
print(classification_report(
    y_test,
    y_pred,
    target_names=['Class 1', 'Class 2', 'Class 3', 'Class 4'],
    zero_division=0,
))

Confusion Matrix

[[270  42   0  39]
 [254  45   0  47]
 [157  38   0  39]
 [142  61   0 100]]

Accuracy: 0.34

Micro Precision: 0.34
Micro Recall: 0.34
Micro F1-score: 0.34

Macro Precision: 0.25
Macro Recall: 0.31
Macro F1-score: 0.25

Weighted Precision: 0.27
Weighted Recall: 0.34
Weighted F1-score: 0.27

Classification Report

              precision    recall  f1-score   support

     Class 1       0.33      0.77      0.46       351
     Class 2       0.24      0.13      0.17       346
     Class 3       0.00      0.00      0.00       234
     Class 4       0.44      0.33      0.38       303

    accuracy                           0.34      1234
   macro avg       0.25      0.31      0.25      1234
weighted avg       0.27      0.34      0.27      1234



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
