In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import scipy.stats as stats
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin # this allows us to create a custom transformer

In [2]:
dogs = pd.read_csv('./petfinder_data/dogs.csv')
cats = pd.read_csv('./petfinder_data/cats.csv')

In [3]:
# ADDED DESCRIPTION LENGTH COLUMN!
dogs['desc_len'] = [len(x) for x in dogs['Description']]

In [4]:
dogs['AdoptionSpeed']=dogs['AdoptionSpeed'].replace(2,1)

In [5]:
dogs['AdoptionSpeed']=dogs['AdoptionSpeed'].replace(4,2)

In [6]:
dogs['AdoptionSpeed']=dogs['AdoptionSpeed'].replace(3,2)

In [7]:
dogs['AdoptionSpeed']=dogs['AdoptionSpeed'].replace(1,0)

In [8]:
dogs['AdoptionSpeed']=dogs['AdoptionSpeed'].replace(2,1)

In [9]:
dogs.AdoptionSpeed.value_counts()

1    3484
0    3137
Name: AdoptionSpeed, dtype: int64

In [10]:
dogs.AdoptionSpeed.value_counts()

1    3484
0    3137
Name: AdoptionSpeed, dtype: int64

In [11]:
# Creating list of columns to drop
drops = ['Name', 'RescuerID', 'Description', 'PetID', 'AdoptionSpeed']

# Dropping columns from data frame and dummifying categorical columns
X = dogs.drop(columns=drops)
y = dogs['AdoptionSpeed']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42)

In [12]:
ss =StandardScaler()
Z_train = ss.fit_transform(X_train)
Z_test = ss.transform(X_test)

In [13]:
%%time
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_train = lr.score(X_train, y_train)
lr_test = lr.score(X_test, y_test)

CPU times: user 240 ms, sys: 8.72 ms, total: 249 ms
Wall time: 51.4 ms


In [14]:
print(f'train score: {lr_train}')
print(f'test score: {lr_test}')

train score: 0.5280966767371601
test score: 0.538647342995169


In [15]:
# Baseline
dogs['AdoptionSpeed'].value_counts(normalize=True)

1    0.526205
0    0.473795
Name: AdoptionSpeed, dtype: float64

In [16]:
y_pred=lr.predict(X_test)
y_pred[0:20]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [17]:
print(f'Area Under Curve: {metrics.roc_auc_score(y_test, y_pred)}')

Area Under Curve: 0.5170862980540707


In [18]:
confusion = confusion_matrix(y_test, y_pred)
print('Confusion Matrix\n')
print(confusion)

print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, y_pred)))

print('Micro Precision: {:.2f}'.format(precision_score(y_test, y_pred, average='micro')))
print('Micro Recall: {:.2f}'.format(recall_score(y_test, y_pred, average='micro')))
print('Micro F1-score: {:.2f}\n'.format(f1_score(y_test, y_pred, average='micro')))

print('Macro Precision: {:.2f}'.format(precision_score(y_test, y_pred, average='macro')))
print('Macro Recall: {:.2f}'.format(recall_score(y_test, y_pred, average='macro')))
print('Macro F1-score: {:.2f}\n'.format(f1_score(y_test, y_pred, average='macro')))

print('Weighted Precision: {:.2f}'.format(precision_score(y_test, y_pred, average='weighted')))
print('Weighted Recall: {:.2f}'.format(recall_score(y_test, y_pred, average='weighted')))
print('Weighted F1-score: {:.2f}'.format(f1_score(y_test, y_pred, average='weighted')))

print('\nClassification Report\n')
print(classification_report(y_test, y_pred, target_names=['Class 1', 'Class 2']))

Confusion Matrix

[[ 80 705]
 [ 59 812]]

Accuracy: 0.54

Micro Precision: 0.54
Micro Recall: 0.54
Micro F1-score: 0.54

Macro Precision: 0.56
Macro Recall: 0.52
Macro F1-score: 0.43

Weighted Precision: 0.55
Weighted Recall: 0.54
Weighted F1-score: 0.44

Classification Report

              precision    recall  f1-score   support

     Class 1       0.58      0.10      0.17       785
     Class 2       0.54      0.93      0.68       871

    accuracy                           0.54      1656
   macro avg       0.56      0.52      0.43      1656
weighted avg       0.55      0.54      0.44      1656



## Another attempt

Pipeline and gridsearch

In [19]:
dogs.head()

Unnamed: 0,Type,Name,Age,Gender,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,...,Breed_82,Breed_83,Breed_85,Breed_88,Breed_93,Breed_96,Breed_97,Breed_98,Breed_99,desc_len
0,1,Brisco,1,1,2,2,1,1,2,1,...,0,0,0,0,0,0,0,0,0,393
1,1,Miko,4,2,2,1,1,1,2,1,...,0,0,0,0,0,0,0,0,0,146
2,1,Hunter,1,1,2,1,2,2,2,1,...,0,0,0,0,0,0,0,0,0,390
3,1,Bear,2,1,2,1,2,1,2,1,...,0,0,0,0,0,0,0,0,0,68
4,1,Peanut,2,1,2,3,1,1,2,1,...,0,0,0,0,0,0,0,0,0,345


In [20]:
# Creating list of columns to drop
drops = ['Name', 'RescuerID', 'PetID', 'AdoptionSpeed']

# Dropping columns from data frame and dummifying categorical columns
X = dogs.drop(columns=drops)
y = dogs['AdoptionSpeed']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    stratify=y,
                                                    random_state=42)

In [22]:
tvec = TfidfVectorizer()
tvec.fit(X_train)
X_train = tvec.transform(X_train).todense()   # error in this cell????
X_test = tvec.transform(X_test).todense()

In [23]:
# look into svd for dimensionality reduction?

In [25]:
# Instantiate pipeline.
pipe_tf = Pipeline([
    ('tf', TfidfVectorizer()),
    ('lr', LogisticRegression(solver = 'saga'))
])

# Define grid of parameters to GridSearch over.
params_grid = {
    'tf__max_features': [100, 500],
    'tf__stop_words': ['english', None],
    'tf__ngram_range': [(1,1), (1,2)]
}

# GridSearch over pipeline with given grid of parameters.
gs_tf = GridSearchCV(pipe_tf, params_grid, cv=5)

# Fit model.
gs_tf.fit(X_train, y_train)


ValueError: Found input variables with inconsistent numbers of samples: [144, 4965]

In [26]:
X_train.shape

(144, 144)

In [27]:
X_train.head()

AttributeError: 'matrix' object has no attribute 'head'

In [None]:
y_train.shape

In [None]:
y_train.head()