In [1]:
import math
from scipy import stats
import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd
from pydataset import data
import statistics
import seaborn as sns

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report


**2. Given the following confusion matrix, evaluate (by hand) the model's performance.**



|               | pred dog   | pred cat   |
|:------------  |-----------:|-----------:|
| actual dog    |         46 |         7  |
| actual cat    |         13 |         34 |

In [37]:
# Hand evaluation of the confusion matrix, with "dog" as the positive class.
# Row "actual dog": 46 predicted dog (TP), 7 predicted cat (FN).
# Row "actual cat": 13 predicted dog (FP), 34 predicted cat (TN).
tp, fn = 46, 7
fp, tn = 13, 34

total = tp + tn + fp + fn

accuracy = (tp + tn) / total   # share of all predictions that were right
precision = tp / (tp + fp)     # share of "dog" predictions that were right
recall = tp / (tp + fn)        # share of actual dogs that were found

for metric in (accuracy, precision, recall):
    print(metric)

0.8
0.7796610169491526
0.8679245283018868


- In the context of this problem, what is a false positive?

Depends on what we label as positive and negative -- I will assume predicting a dog is positive case and a cat is negative case.
A false positive would be predicting a dog when it's actually a cat.

- In the context of this problem, what is a false negative?

A false negative would be predicting a cat when it's actually a dog.

- How would you describe this model?

A fairly accurate model (80% accuracy) that is optimized for recall: its recall (0.87) is higher than its precision (0.78).

**3. You are working as a data scientist for Codeup Cody Creator (C3 for short), a rubber-duck manufacturing plant.**

Unfortunately, some of the rubber ducks that are produced will have defects. Your team has built several models that try to predict those defects, and the data from their predictions can be found here.

Use the predictions dataset and pandas to help answer the following questions:

- An internal team wants to investigate the cause of the manufacturing defects. They tell you that they want to identify as many of the ducks that have a defect as possible. Which evaluation metric would be appropriate here? Which model would be the best fit for this use case?

In [6]:
# Read the C3 predictions file into a DataFrame and preview the first rows
c3_csv = 'c3.csv'
df = pd.read_csv(c3_csv)
df.head()

Unnamed: 0,actual,model1,model2,model3
0,No Defect,No Defect,Defect,No Defect
1,No Defect,No Defect,Defect,Defect
2,No Defect,No Defect,Defect,No Defect
3,No Defect,Defect,Defect,Defect
4,No Defect,No Defect,Defect,No Defect


Best evaluation metric: I think recall is most appropriate here, since we are trying to capture as many positive cases (defective ducks) as possible.

In [7]:
# Model 1: confusion matrix (rows = actual label, columns = model 1 prediction)
pd.crosstab(index=df['actual'], columns=df['model1'])

model1,Defect,No Defect
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Defect,8,8
No Defect,2,182


In [9]:
# Overall accuracy of model 1 against the actual labels
accuracy_score(y_true=df['actual'], y_pred=df['model1'])

0.95

In [18]:
# Recall for model 1: among the ducks that actually have a defect,
# the share whose model 1 prediction matches the actual label.
subset = df.loc[df['actual'] == 'Defect']
subset['model1'].eq(subset['actual']).mean()

0.5

In [20]:
# Model 2: confusion matrix (rows = actual label, columns = model 2 prediction)
pd.crosstab(index=df['actual'], columns=df['model2'])

model2,Defect,No Defect
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Defect,9,7
No Defect,81,103


In [21]:
# Overall accuracy of model 2 against the actual labels
accuracy_score(y_true=df['actual'], y_pred=df['model2'])

0.56

In [22]:
# Recall for model 2: among the ducks that actually have a defect,
# the share whose model 2 prediction matches the actual label.
subset = df.loc[df['actual'] == 'Defect']
subset['model2'].eq(subset['actual']).mean()

0.5625

In [23]:
# Model 3: confusion matrix (rows = actual label, columns = model 3 prediction)
pd.crosstab(index=df['actual'], columns=df['model3'])

model3,Defect,No Defect
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Defect,13,3
No Defect,86,98


In [24]:
# Overall accuracy of model 3 against the actual labels
accuracy_score(y_true=df['actual'], y_pred=df['model3'])

0.555

In [25]:
# Recall for model 3: among the ducks that actually have a defect,
# the share whose model 3 prediction matches the actual label.
subset = df.loc[df['actual'] == 'Defect']
subset['model3'].eq(subset['actual']).mean()

0.8125

*Model 3 has the best recall score (.81) and therefore best fits this use case.*

- Recently several stories in the local news have come out highlighting customers who received a rubber duck with a defect, and portraying C3 in a bad light. The PR team has decided to launch a program that gives customers with a defective duck a vacation to Hawaii. They need you to predict which ducks will have defects, but tell you they really don't want to accidentally give out a vacation package when the duck really doesn't have a defect. Which evaluation metric would be appropriate here? Which model would be the best fit for this use case?

Best evaluation metric: Precision would be most appropriate, since the cost of a false positive is high and we want to optimize for correct positive predictions.

In [32]:
# Precision per model: restrict to the rows a model predicted as 'Defect',
# then take the share of those rows where the prediction matches the actual label.
subset1, subset2, subset3 = (
    df.loc[df[model] == 'Defect'] for model in ('model1', 'model2', 'model3')
)

m1_precision = subset1['model1'].eq(subset1['actual']).mean()
m2_precision = subset2['model2'].eq(subset2['actual']).mean()
m3_precision = subset3['model3'].eq(subset3['actual']).mean()

m1_precision, m2_precision, m3_precision

(0.8, 0.1, 0.13131313131313133)

*Model 1 has the highest precision by far (.8) and is therefore the best for this use case.*

**4. You are working as a data scientist for Gives You Paws ™, a subscription based service that shows you cute pictures of dogs or cats (or both for an additional fee).**

At Gives You Paws, anyone can upload pictures of their cats or dogs. The photos are then put through a two step process. First an automated algorithm tags pictures as either a cat or a dog (Phase I). Next, the photos that have been initially identified are put through another round of review, possibly with some human oversight, before being presented to the users (Phase II).

Several models have already been developed with the data, and you can find their results here.

Given this dataset, use pandas to create a baseline model (i.e. a model that just predicts the most common class) and answer the following questions:

- In terms of accuracy, how do the various models compare to the baseline model? Are any of the models better than the baseline?

In [33]:
# Load the Gives You Paws predictions (replaces the C3 frame held in `df`) and preview
paws_csv = 'gives_you_paws.csv'
df = pd.read_csv(paws_csv)
df.head()

Unnamed: 0,actual,model1,model2,model3,model4
0,cat,cat,dog,cat,dog
1,dog,dog,cat,cat,dog
2,dog,cat,cat,cat,dog
3,dog,dog,dog,cat,dog
4,cat,cat,cat,dog,dog


In [35]:
# Count each actual class to find the most common one (dogs)
df['actual'].value_counts()

dog    3254
cat    1746
Name: actual, dtype: int64

In [34]:
# Create a baseline: a "model" that always predicts the most common actual class.
# The class is derived from the data rather than hard-coded, so the baseline
# stays valid if the class balance of the input file changes.
most_common_class = df['actual'].value_counts().idxmax()
df['baseline'] = most_common_class
df.head()

Unnamed: 0,actual,model1,model2,model3,model4,baseline
0,cat,cat,dog,cat,dog,dog
1,dog,dog,cat,cat,dog,dog
2,dog,cat,cat,cat,dog,dog
3,dog,dog,dog,cat,dog,dog
4,cat,cat,cat,dog,dog,dog


In [38]:
# Accuracy = share of rows where a column's prediction equals the actual label.
# Computed for the four models and the baseline, in that order.
m1_accuracy, m2_accuracy, m3_accuracy, m4_accuracy, baseline_accuracy = (
    df[col].eq(df['actual']).mean()
    for col in ('model1', 'model2', 'model3', 'model4', 'baseline')
)

m1_accuracy, m2_accuracy, m3_accuracy, m4_accuracy, baseline_accuracy

(0.8074, 0.6304, 0.5096, 0.7426, 0.6508)

*Two of our models (m1 at 0.81 and m4 at 0.74) are better than the baseline accuracy (0.65); m2 (0.63) and m3 (0.51) are not.*

- Suppose you are working on a team that solely deals with dog pictures. Which of these models would you recommend for Phase I? For Phase II?

In [41]:
# Phase I: we should choose a model optimized for recall.
# Keep only the rows whose actual label is 'dog'; `subset` is read by the recall cell below.
subset = df.loc[df['actual'] == 'dog']
subset.head()

Unnamed: 0,actual,model1,model2,model3,model4,baseline
1,dog,dog,cat,cat,dog,dog
2,dog,cat,cat,cat,dog,dog
3,dog,dog,dog,cat,dog,dog
5,dog,dog,dog,dog,dog,dog
8,dog,dog,cat,dog,dog,dog


In [43]:
# Recall per model over the rows in `subset`
# (expected to be the actual-dog rows built in the preceding cell):
# the share of those rows where the model's prediction equals the actual label.
m1_recall, m2_recall, m3_recall, m4_recall = (
    subset[col].eq(subset['actual']).mean()
    for col in ('model1', 'model2', 'model3', 'model4')
)

m1_recall, m2_recall, m3_recall, m4_recall

(0.803318992009834,
 0.49078057775046097,
 0.5086047940995697,
 0.9557467732022127)

*Model 4 has highest recall and is best for Phase I.*

In [46]:
# Phase II: use precision as the metric, since we're trying to minimize false positives.
# For each model, keep the rows it predicted as 'dog', then measure how often
# that prediction matches the actual label.
subset1, subset2, subset3, subset4 = (
    df.loc[df[model] == 'dog'] for model in ('model1', 'model2', 'model3', 'model4')
)

m1_precision = subset1['model1'].eq(subset1['actual']).mean()
m2_precision = subset2['model2'].eq(subset2['actual']).mean()
m3_precision = subset3['model3'].eq(subset3['actual']).mean()
m4_precision = subset4['model4'].eq(subset4['actual']).mean()

m1_precision, m2_precision, m3_precision, m4_precision

(0.8900238338440586,
 0.8931767337807607,
 0.6598883572567783,
 0.7312485304490948)

*Model 2 has the highest precision though model 1 is very close. We should choose model 2 for this use case in phase II.*

- Suppose you are working on a team that solely deals with cat pictures. Which of these models would you recommend for Phase I? For Phase II?

In [49]:
# Phase I (cats): recall per model, measured over the rows whose actual label is 'cat'
subset = df.loc[df['actual'] == 'cat']
m1_recall, m2_recall, m3_recall, m4_recall = (
    subset[col].eq(subset['actual']).mean()
    for col in ('model1', 'model2', 'model3', 'model4')
)

m1_recall, m2_recall, m3_recall, m4_recall

(0.8150057273768614,
 0.8906071019473081,
 0.5114547537227949,
 0.34536082474226804)

*In this case, model 2 is the best with a recall of .89.*

In [50]:
# Phase II (cats): precision per model.
# For each model, keep the rows it predicted as 'cat', then measure how often
# that prediction matches the actual label.
subset1, subset2, subset3, subset4 = (
    df.loc[df[model] == 'cat'] for model in ('model1', 'model2', 'model3', 'model4')
)

m1_precision = subset1['model1'].eq(subset1['actual']).mean()
m2_precision = subset2['model2'].eq(subset2['actual']).mean()
m3_precision = subset3['model3'].eq(subset3['actual']).mean()
m4_precision = subset4['model4'].eq(subset4['actual']).mean()

m1_precision, m2_precision, m3_precision, m4_precision

(0.6897721764420747, 0.4841220423412204, 0.358346709470305, 0.8072289156626506)

*In this case, model 4 has the highest precision (.81) and is the best for this use case.* 

**5. Follow the links below to read the documentation about each function, then apply those functions to the data from the previous problem.**

In [48]:
# Model 1: sklearn classification report, returned as a dict and shown as a table
# with one row per class / summary entry.
x = classification_report(y_true=df['actual'], y_pred=df['model1'], output_dict=True)
pd.DataFrame(x).T

Unnamed: 0,precision,recall,f1-score,support
cat,0.689772,0.815006,0.747178,1746.0
dog,0.890024,0.803319,0.844452,3254.0
accuracy,0.8074,0.8074,0.8074,0.8074
macro avg,0.789898,0.809162,0.795815,5000.0
weighted avg,0.820096,0.8074,0.810484,5000.0


In [51]:
# Model 2: sklearn classification report, returned as a dict and shown as a table
# with one row per class / summary entry.
x = classification_report(y_true=df['actual'], y_pred=df['model2'], output_dict=True)
pd.DataFrame(x).T

Unnamed: 0,precision,recall,f1-score,support
cat,0.484122,0.890607,0.627269,1746.0
dog,0.893177,0.490781,0.633479,3254.0
accuracy,0.6304,0.6304,0.6304,0.6304
macro avg,0.688649,0.690694,0.630374,5000.0
weighted avg,0.750335,0.6304,0.63131,5000.0


In [52]:
# Model 3: sklearn classification report, returned as a dict and shown as a table
# with one row per class / summary entry.
x = classification_report(y_true=df['actual'], y_pred=df['model3'], output_dict=True)
pd.DataFrame(x).T

Unnamed: 0,precision,recall,f1-score,support
cat,0.358347,0.511455,0.421425,1746.0
dog,0.659888,0.508605,0.574453,3254.0
accuracy,0.5096,0.5096,0.5096,0.5096
macro avg,0.509118,0.51003,0.497939,5000.0
weighted avg,0.55459,0.5096,0.521016,5000.0


In [53]:
# Model 4: sklearn classification report, returned as a dict and shown as a table
# with one row per class / summary entry.
x = classification_report(y_true=df['actual'], y_pred=df['model4'], output_dict=True)
pd.DataFrame(x).T

Unnamed: 0,precision,recall,f1-score,support
cat,0.807229,0.345361,0.483755,1746.0
dog,0.731249,0.955747,0.82856,3254.0
accuracy,0.7426,0.7426,0.7426,0.7426
macro avg,0.769239,0.650554,0.656157,5000.0
weighted avg,0.757781,0.7426,0.708154,5000.0


In [None]:
# Comparing accuracy scores with sklearn function
#
# Apply accuracy_score to each model column (model1 through model4, inclusive
# label slice) against the actual labels. Accuracy counts matches across all
# classes, so unlike precision/recall there is no positive class to specify
# and accuracy_score takes no `pos_label` argument.
df.loc[:, 'model1':'model4'].apply(lambda pred: accuracy_score(df.actual, pred))

In [54]:
# Precision of each model column via sklearn, treating 'dog' as the positive class
positive = 'dog'
model_preds = df.loc[:, 'model1':'model4']
model_preds.apply(
    lambda pred: precision_score(y_true=df['actual'], y_pred=pred, pos_label=positive)
)

model1    0.890024
model2    0.893177
model3    0.659888
model4    0.731249
dtype: float64

In [55]:
# Recall of each model column via sklearn, treating 'dog' as the positive class
positive = 'dog'
model_preds = df.loc[:, 'model1':'model4']
model_preds.apply(
    lambda pred: recall_score(y_true=df['actual'], y_pred=pred, pos_label=positive)
)

model1    0.803319
model2    0.490781
model3    0.508605
model4    0.955747
dtype: float64