# Logistic Regression with Titanic Dataset

In [184]:
from acquire import get_titanic_data
from prepare import prep_titanic

In [241]:
df = prep_titanic(get_titanic_data())

In [243]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
passenger_id       891 non-null int64
survived           891 non-null int64
pclass             891 non-null int64
sex                891 non-null object
age                714 non-null float64
sibsp              891 non-null int64
parch              891 non-null int64
fare               891 non-null float64
embarked           891 non-null object
class              891 non-null object
embark_town        889 non-null object
alone              891 non-null int64
embarked_town      891 non-null object
embarked_encode    891 non-null int64
dtypes: float64(2), int64(7), object(5)
memory usage: 97.5+ KB


In [244]:
df.dropna(inplace=True)

In [246]:
df.isnull().sum()

passenger_id       0
survived           0
pclass             0
sex                0
age                0
sibsp              0
parch              0
fare               0
embarked           0
class              0
embark_town        0
alone              0
embarked_town      0
embarked_encode    0
dtype: int64

In [247]:
from sklearn import preprocessing

encoder = preprocessing.LabelEncoder()

df.embarked.fillna('Unknown', inplace=True)

encoder.fit(df.embarked)
df.embarked = encoder.transform(df.embarked)

df.embarked.head(10)


0     2
1     0
2     2
3     2
4     2
6     2
7     2
8     2
9     0
10    2
Name: embarked, dtype: int64

In [248]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df)

In [249]:
train.isnull().sum()

passenger_id       0
survived           0
pclass             0
sex                0
age                0
sibsp              0
parch              0
fare               0
embarked           0
class              0
embark_town        0
alone              0
embarked_town      0
embarked_encode    0
dtype: int64

In [250]:
from sklearn.preprocessing import MinMaxScaler
#make the thing
scaler = MinMaxScaler()

#train the thing
scaler.fit(train[['fare','age']])
#use the thing
train[['fare','age']] = scaler.transform(train[['fare','age']])
test[['fare','age']] = scaler.transform(test[['fare','age']])


train[['age', 'fare']].head(10)

Unnamed: 0,age,fare
172,0.00416,0.021731
757,0.218455,0.022447
13,0.483172,0.061045
762,0.243666,0.01411
135,0.281482,0.029367
56,0.256271,0.020495
310,0.294088,0.162314
333,0.193243,0.035134
693,0.306694,0.014102
86,0.193243,0.067096


In [251]:
train.isnull().sum()

passenger_id       0
survived           0
pclass             0
sex                0
age                0
sibsp              0
parch              0
fare               0
embarked           0
class              0
embark_town        0
alone              0
embarked_town      0
embarked_encode    0
dtype: int64

In [252]:
from sklearn.linear_model import LogisticRegression

logit = LogisticRegression()

logit.fit(train[['pclass','age','fare','sibsp','parch']], train.survived)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

## Fit the logistic regression classifier to your training sample and transform, i.e. make predictions on the training sample.

In [253]:
y_pred = logit.predict(train[['pclass','age','fare','sibsp','parch']])

In [254]:
y_pred

array([0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,

In [255]:
y_pred_proba = logit.predict_proba(train[['pclass','age','fare','sibsp','parch']])

In [256]:
y_pred_proba

array([[0.62865672, 0.37134328],
       [0.49600132, 0.50399868],
       [0.60797702, 0.39202298],
       ...,
       [0.57220402, 0.42779598],
       [0.73921603, 0.26078397],
       [0.77949163, 0.22050837]])

## Evaluate your in-sample results using the model score, confusion matrix, and classification report.


In [257]:
train['prediction'] = y_pred

In [258]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,embarked_town,embarked_encode,prediction
172,172,1,3,female,0.00416,1,1,0.021731,2,Third,Southampton,0,Southampton,3,0
757,757,0,2,male,0.218455,0,0,0.022447,2,Second,Southampton,1,Southampton,3,1
13,13,0,3,male,0.483172,1,5,0.061045,2,Third,Southampton,0,Southampton,3,0
762,762,1,3,male,0.243666,0,0,0.01411,0,Third,Cherbourg,1,Cherbourg,1,0
135,135,0,2,male,0.281482,0,0,0.029367,0,Second,Cherbourg,1,Cherbourg,1,0


In [259]:
train.shape

(534, 15)

## Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.


## Accuracy

In [260]:
print('The accuracy of the model is: ',logit.score(train[['pclass','age','fare','sibsp','parch']], train.survived))

The accuracy of the model is:  0.7284644194756554


In [261]:
from sklearn.metrics import confusion_matrix

In [262]:
print(confusion_matrix(train.survived, y_pred))


[[275  46]
 [ 99 114]]


## True Positive Rate

In [264]:
true_pos = 114
true_pos_rate = 114/(114+99+46+275)

print('True positive rate is', true_pos_rate)

True positive rate is 0.21348314606741572


## False Positive Rate

In [265]:
false_pos = 46
false_pos_rate =46/(114+99+46+275)
print('False positive rate is', false_pos_rate)

False positive rate is 0.08614232209737828


## True Negative Rate

In [267]:
true_neg = 275
true_neg_rate = 275/(114+99+46+275)
print('True negative rate is', true_neg_rate)

True negative rate is 0.5149812734082397


## False negative rate


In [269]:
false_neg = 99
false_neg_rate = 99/(114+99+46+275)
print('False negative rate is', false_neg_rate)

False negative rate is 0.1853932584269663


## Classification Report with precision, recall, f1-score and support.

In [270]:
from sklearn.metrics import classification_report


In [271]:
print(classification_report(train.survived, y_pred))


              precision    recall  f1-score   support

           0       0.74      0.86      0.79       321
           1       0.71      0.54      0.61       213

   micro avg       0.73      0.73      0.73       534
   macro avg       0.72      0.70      0.70       534
weighted avg       0.73      0.73      0.72       534



## Look in the scikit-learn documentation to research the solver parameter. What is your best option(s) for the particular problem you are trying to solve and the data to be used?
### The solver parameter is the optimization method (algorithm) used to find the best function. Liblinear is the default algorithm. I will use the 'sag' solver since it is faster for large datasets.


## Run through steps 2-4 using another solver (from question 5)

In [276]:
from sklearn.model_selection import train_test_split

train2, test2 = train_test_split(df)

## Sag solver

In [277]:
from sklearn.linear_model import LogisticRegression

logit_2 = LogisticRegression(solver='sag')

logit_2.fit(train2[['pclass','age','fare','sibsp','parch']], train2.survived)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='sag',
          tol=0.0001, verbose=0, warm_start=False)

In [278]:
y_pred_2 = logit_2.predict(train2[['pclass','age','fare','sibsp','parch']])

In [279]:
y_pred_proba_2 = logit_2.predict_proba(train2[['pclass','age','fare','sibsp','parch']])

In [280]:
print('The accuracy of the model is: ',logit_2.score(train2[['pclass','age','fare','sibsp','parch']], train2.survived))

The accuracy of the model is:  0.6835205992509363


## lbfgs Solver

In [281]:
from sklearn.linear_model import LogisticRegression

logit_3 = LogisticRegression(solver='lbfgs')

logit_3.fit(train2[['pclass','age','fare','sibsp','parch']], train2.survived)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [282]:
y_pred_2 = logit_3.predict(train2[['pclass','age','fare','sibsp','parch']])

In [283]:
y_pred_proba_2 = logit_3.predict_proba(train2[['pclass','age','fare','sibsp','parch']])

In [284]:
print('The accuracy of the model is: ',logit_3.score(train2[['pclass','age','fare','sibsp','parch']], train2.survived))

The accuracy of the model is:  0.704119850187266


## Saga Solver

In [285]:
from sklearn.linear_model import LogisticRegression

logit_4 = LogisticRegression(solver='saga')

logit_4.fit(train2[['pclass','age','fare','sibsp','parch']], train2.survived)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

In [286]:
y_pred_2 = logit_4.predict(train2[['pclass','age','fare','sibsp','parch']])

In [287]:
y_pred_proba_2 = logit_4.predict_proba(train2[['pclass','age','fare','sibsp','parch']])

In [288]:
print('The accuracy of the model is: ',logit_4.score(train2[['pclass','age','fare','sibsp','parch']], train2.survived))

The accuracy of the model is:  0.6722846441947565


## Newton-cg Solver

In [289]:
from sklearn.linear_model import LogisticRegression

logit_5 = LogisticRegression(solver='saga')

logit_5.fit(train2[['pclass','age','fare','sibsp','parch']], train2.survived)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

In [290]:
y_pred_2 = logit_5.predict(train2[['pclass','age','fare','sibsp','parch']])

In [291]:
y_pred_proba_2 = logit_5.predict_proba(train2[['pclass','age','fare','sibsp','parch']])

In [292]:
print('The accuracy of the model is: ',logit_5.score(train2[['pclass','age','fare','sibsp','parch']], train2.survived))

The accuracy of the model is:  0.6722846441947565


## Which performs better on your in-sample data?
## Save the best model in logit_fit

### lbfgs performed best. It handles multinomial loss, meaning it can handle multiclassification problems.
### Saga has L1 penalty, meaning it might have more than one solution. It performed a little worse than lbfgs.
### Newton-cg also performed a little worse than lbfgs

In [293]:
logit_fit_titanic = logit_3

# Decision Tree with Titanic Dataset

In [294]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 14 columns):
passenger_id       712 non-null int64
survived           712 non-null int64
pclass             712 non-null int64
sex                712 non-null object
age                712 non-null float64
sibsp              712 non-null int64
parch              712 non-null int64
fare               712 non-null float64
embarked           712 non-null int64
class              712 non-null object
embark_town        712 non-null object
alone              712 non-null int64
embarked_town      712 non-null object
embarked_encode    712 non-null int64
dtypes: float64(2), int64(8), object(4)
memory usage: 83.4+ KB


In [298]:
X = df[['pclass','age','fare','sibsp','parch']]
y = df[['survived']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

X_train.shape

(498, 5)

## Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)


In [299]:
from sklearn.tree import DecisionTreeClassifier  
clf = DecisionTreeClassifier()  
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [300]:
y_pred = clf.predict(X_train)


In [301]:
y_pred_proba = clf.predict_proba(X_train)


## Evaluate your in-sample results using the model score, confusion matrix, and classification report.

### Accuracy

In [302]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.98


### Confusion Matrix

In [237]:
confusion_matrix(y_train, y_pred)


array([[46,  0],
       [ 0, 81]])

### Classification Report

In [238]:
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        46
           1       1.00      1.00      1.00        81

   micro avg       1.00      1.00      1.00       127
   macro avg       1.00      1.00      1.00       127
weighted avg       1.00      1.00      1.00       127



## Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

### True Positive Rate

In [56]:
true_pos = 81
true_pos_rate = 81/(81+46)
print('The true positive rate is', true_pos_rate)


The true positive rate is 0.6377952755905512


### False Positive Rate

In [57]:
print('The false positive rate is 0.')

The false positive rate is 0.


### False Negative Rate

In [58]:
print('The false negative rate is 0.')

The false negative rate is 0.


### Precision

In [59]:
print('Precision is 1.')

Precision is 1.


## Run through steps 2-4 using entropy as your measure of impurity.

In [60]:
clf_2 = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=123)


### Fit the model

In [61]:
clf_2.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best')

In [62]:
y_pred_2 = clf_2.predict(X_train)


In [63]:
y_pred_proba_2 = clf_2.predict_proba(X_train)


## Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [64]:
print('Accuracy of Second Decision Tree classifier on training set: {:.2f}'
     .format(clf_2.score(X_train, y_train)))

Accuracy of Second Decision Tree classifier on training set: 0.74


In [65]:
confusion_matrix(y_train, y_pred_2)

array([[28, 18],
       [15, 66]])

In [66]:
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        46
           1       1.00      1.00      1.00        81

   micro avg       1.00      1.00      1.00       127
   macro avg       1.00      1.00      1.00       127
weighted avg       1.00      1.00      1.00       127



## Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

### True Positive Rate

In [67]:
true_pos = 66
true_pos_rate = 66/(127)
print('The true positive rate is', true_pos_rate)


The true positive rate is 0.5196850393700787


### False Positive Rate

In [68]:
false_pos = 18
false_pos_rate = 18/127

print('The false positive rate is', false_pos_rate)

The false positive rate is 0.14173228346456693


### False Negative Rate

In [69]:
false_neg = 15
false_neg_rate = 15/127
print('The false negative rate is',false_neg_rate)

The false negative rate is 0.11811023622047244


### Precision

In [70]:
precision = true_pos/(true_pos+false_pos)
print('The precision is', precision)

The precision is 0.7857142857142857


In [177]:
tree_fit_titanic = clf

## Which performs better on your in-sample data? Save the best model in tree_fit

### When comparing the different types of solver options, the lbfgs (no 'entropy' as the gini impurity) method works best.

### The first (no 'entropy' as the gini impurity) model has accuracy of 72.7%. The second model (Decision Tree with Titanic Dataset) has accuracy of 78.5%. So the Decision Tree model works better.

# Decision Tree with Iris Dataset

## Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [72]:
from pydataset import data
iris = data('iris')
iris.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [73]:
iris.columns = [col.lower().replace('.', '_') for col in iris]

In [74]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [75]:
X = iris.drop(['species'],axis=1)
y = iris[['species']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .40, random_state = 12)

X_train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
114,5.7,2.5,5.0,2.0
6,5.4,3.9,1.7,0.4
145,6.7,3.3,5.7,2.5
107,4.9,2.5,4.5,1.7
115,5.8,2.8,5.1,2.4


In [76]:
clf = DecisionTreeClassifier()

In [77]:
clf.fit(X_train, y_train)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [78]:
y_pred = clf.predict(X_train)
y_pred[0:5]

array(['virginica', 'setosa', 'virginica', 'virginica', 'virginica'],
      dtype=object)

In [79]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [80]:
import pandas as pd
predictions = pd.DataFrame(y_pred_proba)

In [81]:
predictions.sample(10)

Unnamed: 0,0,1,2
17,0.0,0.0,1.0
19,1.0,0.0,0.0
54,1.0,0.0,0.0
35,0.0,1.0,0.0
6,0.0,1.0,0.0
64,1.0,0.0,0.0
1,1.0,0.0,0.0
14,1.0,0.0,0.0
7,0.0,1.0,0.0
27,0.0,1.0,0.0


## Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [82]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 1.00


### Confusion Matrix

In [83]:
confusion_matrix(y_train, y_pred)

array([[24,  0,  0],
       [ 0, 31,  0],
       [ 0,  0, 35]])

In [84]:
sorted(y_train.species.unique())


['setosa', 'versicolor', 'virginica']

In [85]:
y_train.species.value_counts()


virginica     35
versicolor    31
setosa        24
Name: species, dtype: int64

In [86]:
import pandas as pd

labels = sorted(y_train.species.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,setosa,versicolor,virginica
setosa,24,0,0
versicolor,0,31,0
virginica,0,0,35


In [87]:
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        24
  versicolor       1.00      1.00      1.00        31
   virginica       1.00      1.00      1.00        35

   micro avg       1.00      1.00      1.00        90
   macro avg       1.00      1.00      1.00        90
weighted avg       1.00      1.00      1.00        90



## Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

### Accuracy

In [88]:
accuracy = (105)/105
print('The accuracy is', accuracy)

The accuracy is 1.0


### True Positive Rate

In [89]:
true_pos_rate = 105/105
print('The true positive rate is', true_pos_rate)

The true positive rate is 1.0


### False Positive Rate

In [90]:
false_pos_rate = 0/105
print('The false positive rate is', false_pos_rate)

The false positive rate is 0.0


### True Negative Rate

In [91]:
true_neg_rate = 0/105
print('The true negative rate is', true_neg_rate)

The true negative rate is 0.0


### False Negative Rate

In [92]:
false_neg_rate = 0/105
print('The true negative rate is', false_neg_rate)

The true negative rate is 0.0


## Run through steps 2-4 using entropy as your measure of impurity.

# Second run through with 'entropy' as measure of gini impurity

In [93]:
clf_2 = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=12)

In [94]:
clf_2.fit(X_train, y_train)


DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=12,
            splitter='best')

In [95]:
y_pred_2 = clf_2.predict(X_train)


In [96]:
y_pred_proba_2 = clf_2.predict_proba(X_train)


## Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [97]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf_2.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.98


In [98]:
confusion_matrix(y_train, y_pred_2)


array([[24,  0,  0],
       [ 0, 29,  2],
       [ 0,  0, 35]])

In [99]:
print(classification_report(y_train, y_pred_2))


              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        24
  versicolor       1.00      0.94      0.97        31
   virginica       0.95      1.00      0.97        35

   micro avg       0.98      0.98      0.98        90
   macro avg       0.98      0.98      0.98        90
weighted avg       0.98      0.98      0.98        90



## Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

### Accuracy

In [100]:
accuracy = (105-2)/105
print('The accuracy is', accuracy)

The accuracy is 0.9809523809523809


### True Positive Rate

In [101]:
true_pos_rate = 103/105
print('The true positive rate is', true_pos_rate)

The true positive rate is 0.9809523809523809


### False Positive Rate

In [102]:
false_pos_rate = 2/105
print('The false positive rate is', false_pos_rate)

The false positive rate is 0.01904761904761905


### True Negative Rate

In [103]:
print('The true negative rate is 0.')

The true negative rate is 0.


### False Negative Rate

In [104]:
print('The false negative rate is 0.')

The false negative rate is 0.


## Which performs better on your in-sample data?
## Save the best model in tree_fit

In [178]:
tree_fit_iris = clf

## The first model works best. But it is overfit because it has 100% accuracy.

## Second run through with 'entropy' as measure of gini impurity has accuracy of 98%.


In [105]:
from sklearn.datasets import load_iris
from sklearn import tree

iris = load_iris()
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)

import graphviz

from graphviz import Graph

dot_data = tree.export_graphviz(clf, out_file=None) 
graph = graphviz.Source(dot_data) 

graph.render('iris_decision_tree2', view=True)

'iris_decision_tree2.pdf'

# Random Forests

# Titanic Dataset

## Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 20.


In [113]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from acquire import get_titanic_data
from prepare import prep_titanic

titanic_df = prep_titanic(get_titanic_data())
titanic_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
passenger_id       891 non-null int64
survived           891 non-null int64
pclass             891 non-null int64
sex                891 non-null object
age                714 non-null float64
sibsp              891 non-null int64
parch              891 non-null int64
fare               891 non-null float64
embarked           891 non-null object
class              891 non-null object
embark_town        889 non-null object
alone              891 non-null int64
embarked_town      891 non-null object
embarked_encode    891 non-null int64
dtypes: float64(2), int64(7), object(5)
memory usage: 97.5+ KB


In [109]:
titanic_df.sample(20)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,embarked_town,embarked_encode
125,125,1,3,male,12.0,1,0,11.2417,C,Third,Cherbourg,0,Cherbourg,1
476,476,0,2,male,34.0,1,0,21.0,S,Second,Southampton,0,Southampton,3
752,752,0,3,male,33.0,0,0,9.5,S,Third,Southampton,1,Southampton,3
246,246,0,3,female,25.0,0,0,7.775,S,Third,Southampton,1,Southampton,3
710,710,1,1,female,24.0,0,0,49.5042,C,First,Cherbourg,1,Cherbourg,1
799,799,0,3,female,30.0,1,1,24.15,S,Third,Southampton,0,Southampton,3
729,729,0,3,female,25.0,1,0,7.925,S,Third,Southampton,0,Southampton,3
463,463,0,2,male,48.0,0,0,13.0,S,Second,Southampton,1,Southampton,3
59,59,0,3,male,11.0,5,2,46.9,S,Third,Southampton,0,Southampton,3
187,187,1,1,male,45.0,0,0,26.55,S,First,Southampton,1,Southampton,3


In [110]:
# Handle missing age values
titanic_df.dropna(inplace=True)

X = titanic_df[['pclass','age','fare','sibsp','parch']]
y = titanic_df.survived

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

X_train.head()

Unnamed: 0,pclass,age,fare,sibsp,parch
605,3,36.0,15.55,1,0
197,3,42.0,8.4042,0,1
56,2,21.0,10.5,0,0
645,1,48.0,76.7292,1,0
356,1,22.0,55.0,0,1


In [111]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=20, 
                            random_state=123)

In [112]:
rf.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=123, verbose=0, warm_start=False)

## Evaluate your results using the model score, confusion matrix, and classification report.

In [114]:
y_pred = rf.predict(X_train)


In [115]:
y_pred_proba = rf.predict_proba(X_train)


In [116]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.98


In [117]:
print(confusion_matrix(y_train, y_pred))


[[296   3]
 [  6 193]]


In [118]:
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       0.98      0.99      0.99       299
           1       0.98      0.97      0.98       199

   micro avg       0.98      0.98      0.98       498
   macro avg       0.98      0.98      0.98       498
weighted avg       0.98      0.98      0.98       498



## Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

### True Positive Rate

In [119]:
true_pos = 193
true_pos_rate = 193/(193+9+296)
print('The true positive rate is', true_pos_rate)


The true positive rate is 0.38755020080321284


### False Positive Rate

In [120]:
false_pos = 3
false_pos_rate = 3/(193+9+296)
print('The false positive rate is', false_pos_rate)

The false positive rate is 0.006024096385542169


### True Negative Rate

In [121]:
true_neg = 296
true_neg_rate = 296/(193+9+296)
print('The true negative rate is', true_neg_rate)

The true negative rate is 0.5943775100401606


### False Negative Rate

In [122]:
false_neg = 6
false_neg_rate = 6/(193+9+296)
print('The false positive rate is', false_neg_rate)

The false positive rate is 0.012048192771084338


### Precision

In [123]:
precision = true_pos/(true_pos + false_pos)

In [124]:
print('The precision is', precision)

The precision is 0.9846938775510204


## Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.

In [125]:
rf_2 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)

In [126]:
rf_2.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=123, verbose=0, warm_start=False)

## Evaluate your results using the model score, confusion matrix, and classification report.

In [127]:
y_pred_2 = rf_2.predict(X_train)


In [128]:
y_pred_proba_2 = rf_2.predict_proba(X_train)


In [129]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf_2.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.75


In [130]:
print(confusion_matrix(y_train, y_pred_2))


[[269  30]
 [ 93 106]]


In [131]:
print(classification_report(y_train, y_pred_2))


              precision    recall  f1-score   support

           0       0.74      0.90      0.81       299
           1       0.78      0.53      0.63       199

   micro avg       0.75      0.75      0.75       498
   macro avg       0.76      0.72      0.72       498
weighted avg       0.76      0.75      0.74       498



## Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

### True Positive Rate

In [132]:
true_pos = 106
true_pos_rate = 106/(106+93+30+269)
print('The true positive rate is', true_pos_rate)


The true positive rate is 0.21285140562248997


### False Positive Rate

In [133]:
false_pos = 30
false_pos_rate = 30/(106+93+30+269)
print('The true positive rate is', false_pos_rate)


The true positive rate is 0.060240963855421686


### True Negative Rate

In [134]:
true_neg = 269
true_neg_rate = 269/(106+93+30+269)
print('The true positive rate is', true_neg_rate)


The true positive rate is 0.5401606425702812


### False Negative Rate

In [135]:
false_neg = 93
false_neg_rate = 93/(106+93+30+269)
print('The true positive rate is', false_neg_rate)


The true positive rate is 0.18674698795180722


### Precision

In [136]:
precision = true_pos/(true_pos + false_pos)

In [137]:
print('The precision is', precision)

The precision is 0.7794117647058824


## What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?
## Save the best model in forest_fit

## The first model where min_samples_leaf = 1 and max_depth = 20 was more accurate with a precision of 98.5%.

## Decreasing the min_samples_leaf and increating the max_depth also dimished accuracy to 77.9%.

In [179]:
forest_fit_titanic = rf

# Random Forests

# Iris Dataset

## Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 20.


In [139]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from acquire import get_iris_data
from prepare import prep_iris

iris = prep_iris(get_iris_data())
iris.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
species           150 non-null object
sepal_length      150 non-null float64
sepal_width       150 non-null float64
petal_length      150 non-null float64
petal_width       150 non-null float64
species_encode    150 non-null int64
dtypes: float64(4), int64(1), object(1)
memory usage: 7.1+ KB


In [141]:
iris.sample(20)

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width,species_encode
109,virginica,7.2,3.6,6.1,2.5,2
85,versicolor,6.0,3.4,4.5,1.6,1
73,versicolor,6.1,2.8,4.7,1.2,1
145,virginica,6.7,3.0,5.2,2.3,2
140,virginica,6.7,3.1,5.6,2.4,2
63,versicolor,6.1,2.9,4.7,1.4,1
103,virginica,6.3,2.9,5.6,1.8,2
92,versicolor,5.8,2.6,4.0,1.2,1
126,virginica,6.2,2.8,4.8,1.8,2
34,setosa,4.9,3.1,1.5,0.2,0


In [143]:
iris.isnull().sum()

species           0
sepal_length      0
sepal_width       0
petal_length      0
petal_width       0
species_encode    0
dtype: int64

In [145]:
X = iris[['sepal_length','sepal_width','petal_length','petal_width']]
y = iris.species

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

X_train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
114,5.8,2.8,5.1,2.4
136,6.3,3.4,5.6,2.4
53,5.5,2.3,4.0,1.3
19,5.1,3.8,1.5,0.3
38,4.4,3.0,1.3,0.2


In [146]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=20, 
                            random_state=123)

In [147]:
rf.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=123, verbose=0, warm_start=False)

## Evaluate your results using the model score, confusion matrix, and classification report.

In [148]:
y_pred = rf.predict(X_train)


In [149]:
y_pred_proba = rf.predict_proba(X_train)


In [150]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 1.00


In [151]:
print(confusion_matrix(y_train, y_pred))


[[32  0  0]
 [ 0 40  0]
 [ 0  0 33]]


In [152]:
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        32
  versicolor       1.00      1.00      1.00        40
   virginica       1.00      1.00      1.00        33

   micro avg       1.00      1.00      1.00       105
   macro avg       1.00      1.00      1.00       105
weighted avg       1.00      1.00      1.00       105



## Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [154]:
accuracy = (32+40+33)/(32+40+33)
print('The accuracy is', accuracy)


The accuracy is 1.0


### Precision

In [156]:
precision = 32/32

In [180]:
print('The precision for all species is', precision)

The precision for all species is 1.0


## Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.

In [158]:
rf_2 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)

In [159]:
rf_2.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=123, verbose=0, warm_start=False)

## Evaluate your results using the model score, confusion matrix, and classification report.

In [160]:
y_pred_2 = rf_2.predict(X_train)


In [161]:
y_pred_proba_2 = rf_2.predict_proba(X_train)


In [162]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf_2.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.96


In [163]:
print(confusion_matrix(y_train, y_pred_2))


[[32  0  0]
 [ 0 37  3]
 [ 0  1 32]]


In [166]:
sorted(y_train.unique())


['setosa', 'versicolor', 'virginica']

In [164]:
print(classification_report(y_train, y_pred_2))


              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        32
  versicolor       0.97      0.93      0.95        40
   virginica       0.91      0.97      0.94        33

   micro avg       0.96      0.96      0.96       105
   macro avg       0.96      0.96      0.96       105
weighted avg       0.96      0.96      0.96       105



## Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

### Precision

In [171]:
corr0 = 32
wrong0 = 0
precision_0 = corr0/(corr0 + wrong0)

print('The precision for setosa is', precision_0)

The precision for setosa is 1.0


In [172]:
corr1 = 37
wrong1 = 1
precision_1 = corr1/(corr1 + wrong1)

print('The precision for versicolor is', precision_1)

The precision for versicolor is 0.9736842105263158


In [174]:
corr2 = 32
wrong2 = 3
precision_2 = corr2/(corr2 + wrong2)

print('The precision for versicolor is', precision_1)

The precision for versicolor is 0.9736842105263158


### Accuracy

In [175]:
acc = (32+32+37)/(32+32+37+3+1)
print("The accuracy of the model is", acc)

The accuracy of the model is 0.9619047619047619


## What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?
## Save the best model in forest_fit

## The first model (where min_samples_leaf = 1 and max_depth = 20) was more accurate with a precision of 100%.

## Decreasing the min_samples_leaf and increating the max_depth also dimished accuracy to 96.2%.

In [142]:
forest_fit_iris = rf

## Iris Dataset

## Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 20.


In [151]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from acquire import get_iris_data
from prepare import prep_iris

iris_df = prep_iris(get_iris_data())
iris_df.info

TypeError: prep_iris() takes 0 positional arguments but 1 was given

In [117]:
df.sample(20)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,embarked_town,embarked_encode
603,603,0,3,male,44.0,0,0,8.05,S,Third,Southampton,1,Southampton,3
157,157,0,3,male,30.0,0,0,8.05,S,Third,Southampton,1,Southampton,3
535,535,1,2,female,7.0,0,2,26.25,S,Second,Southampton,0,Southampton,3
296,296,0,3,male,23.5,0,0,7.2292,C,Third,Cherbourg,1,Cherbourg,1
836,836,0,3,male,21.0,0,0,8.6625,S,Third,Southampton,1,Southampton,3
812,812,0,2,male,35.0,0,0,10.5,S,Second,Southampton,1,Southampton,3
292,292,0,2,male,36.0,0,0,12.875,C,Second,Cherbourg,1,Cherbourg,1
379,379,0,3,male,19.0,0,0,7.775,S,Third,Southampton,1,Southampton,3
835,835,1,1,female,39.0,1,1,83.1583,C,First,Cherbourg,0,Cherbourg,1
218,218,1,1,female,32.0,0,0,76.2917,C,First,Cherbourg,1,Cherbourg,1


In [108]:
# Handle missing age values
df.dropna(inplace=True)

X = df[['pclass','age','fare','sibsp','parch']]
y = df.survived

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

X_train.head()

Unnamed: 0,pclass,age,fare,sibsp,parch
605,3,36.0,15.55,1,0
197,3,42.0,8.4042,0,1
56,2,21.0,10.5,0,0
645,1,48.0,76.7292,1,0
356,1,22.0,55.0,0,1


In [112]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=20, 
                            random_state=123)

In [113]:
rf.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=123, verbose=0, warm_start=False)

## Evaluate your results using the model score, confusion matrix, and classification report.

In [114]:
y_pred = rf.predict(X_train)


In [115]:
y_pred_proba = rf.predict_proba(X_train)


In [116]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.98


In [118]:
print(confusion_matrix(y_train, y_pred))


[[296   3]
 [  6 193]]


In [119]:
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       0.98      0.99      0.99       299
           1       0.98      0.97      0.98       199

   micro avg       0.98      0.98      0.98       498
   macro avg       0.98      0.98      0.98       498
weighted avg       0.98      0.98      0.98       498



## Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

### True Positive Rate

In [122]:
true_pos = 193
true_pos_rate = 193/(193+9+296)
print('The true positive rate is', true_pos_rate)


The true positive rate is 0.38755020080321284


### False Positive Rate

In [123]:
false_pos = 3
false_pos_rate = 3/(193+9+296)
print('The false positive rate is', false_pos_rate)

The false positive rate is 0.006024096385542169


### True Negative Rate

In [124]:
true_neg = 296
true_neg_rate = 296/(193+9+296)
print('The true negative rate is', true_neg_rate)

The true negative rate is 0.5943775100401606


### False Negative Rate

In [125]:
false_neg = 6
false_neg_rate = 6/(193+9+296)
print('The false positive rate is', false_neg_rate)

The false positive rate is 0.012048192771084338


### Precision

In [126]:
precision = true_pos/(true_pos + false_pos)

In [127]:
print('The precision is', precision)

The precision is 0.9846938775510204


## Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.

In [128]:
rf_2 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)

In [129]:
rf_2.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=123, verbose=0, warm_start=False)

## Evaluate your results using the model score, confusion matrix, and classification report.

In [130]:
y_pred_2 = rf_2.predict(X_train)


In [131]:
y_pred_proba_2 = rf_2.predict_proba(X_train)


In [132]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf_2.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.75


In [134]:
print(confusion_matrix(y_train, y_pred_2))


[[269  30]
 [ 93 106]]


In [135]:
print(classification_report(y_train, y_pred_2))


              precision    recall  f1-score   support

           0       0.74      0.90      0.81       299
           1       0.78      0.53      0.63       199

   micro avg       0.75      0.75      0.75       498
   macro avg       0.76      0.72      0.72       498
weighted avg       0.76      0.75      0.74       498



## Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

### True Positive Rate

In [136]:
true_pos = 106
true_pos_rate = 106/(106+93+30+269)
print('The true positive rate is', true_pos_rate)


The true positive rate is 0.21285140562248997


### False Positive Rate

In [137]:
false_pos = 30
false_pos_rate = 30/(106+93+30+269)
print('The true positive rate is', false_pos_rate)


The true positive rate is 0.060240963855421686


### True Negative Rate

In [139]:
true_neg = 269
true_neg_rate = 269/(106+93+30+269)
print('The true positive rate is', true_neg_rate)


The true positive rate is 0.5401606425702812


### False Negative Rate

In [138]:
false_neg = 93
false_neg_rate = 93/(106+93+30+269)
print('The true positive rate is', false_neg_rate)


The true positive rate is 0.18674698795180722


### Precision

In [140]:
precision = true_pos/(true_pos + false_pos)

In [141]:
print('The precision is', precision)

The precision is 0.7794117647058824


## What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?
## Save the best model in forest_fit

## The first model where min_samples_leaf = 1 and max_depth = 20 was more accurate with a precision of 98.5%.

## Decreasing the min_samples_leaf and increating the max_depth also dimished accuracy to 77.9%.

In [142]:
forest_fit = rf