# intro_to_models example

In [2]:

# We'll use sklearn's DummyClassifier as a stand-in for other classification algorithms.
# It exposes the same fit / predict / score API as our "real" models.
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report
import acquire
import prepare

import warnings
# Install a catch-all "ignore" filter: every warning category is silenced for the
# rest of the session (this includes any warnings raised by pandas or sklearn).
warnings.simplefilter('ignore')

In [3]:
# prepare.prep_titanic returns three frames, unpacked here as the
# train / validate / test splits of the data from acquire.get_titanic_data().
train, validate, test = prepare.prep_titanic(acquire.get_titanic_data())
# Only a cell's final expression is rendered, so a bare
# `train.shape, validate.shape, test.shape` placed before `train.head()` is
# evaluated and discarded. To see the split sizes, make that tuple the last
# line of its own cell.
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S
583,583,0,1,male,36.0,0,0,40.125,C,First,Cherbourg,1,0,0
337,337,1,1,female,41.0,0,0,134.5,C,First,Cherbourg,1,0,0
50,50,0,3,male,7.0,4,1,39.6875,S,Third,Southampton,0,0,1
218,218,1,1,female,32.0,0,0,76.2917,C,First,Cherbourg,1,0,0
31,31,1,1,female,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0


In [4]:

# `train` was already acquired and split by the previous cell; calling
# acquire + prepare again here would only repeat that work, so this cell
# just previews the first rows of the training split.
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S
583,583,0,1,male,36.0,0,0,40.125,C,First,Cherbourg,1,0,0
337,337,1,1,female,41.0,0,0,134.5,C,First,Cherbourg,1,0,0
50,50,0,3,male,7.0,4,1,39.6875,S,Third,Southampton,0,0,1
218,218,1,1,female,32.0,0,0,76.2917,C,First,Cherbourg,1,0,0
31,31,1,1,female,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0


In [5]:

# For each split, X holds every column except the `survived` target and
# y holds the `survived` column on its own.
X_train = train.drop(columns='survived')
y_train = train['survived']

X_validate = validate.drop(columns='survived')
y_validate = validate['survived']

X_test = test.drop(columns='survived')
y_test = test['survived']

In [6]:
# Step 1 - build the estimator. With strategy='constant' and constant=1 the
# model predicts class 1 for every row, i.e. "everyone survives".
model = DummyClassifier(
    strategy='constant',
    constant=1,
)
# Step 2 - fit on the training split. fit() is the cell's last expression,
# so the fitted estimator is echoed as the cell output.
model.fit(X=X_train, y=y_train)

DummyClassifier(constant=1, strategy='constant')

In [7]:
# Score the model on the training split and keep two decimals for display.
accuracy = round(model.score(X=X_train, y=y_train), ndigits=2)
print(f'Training accuracy: {accuracy}')

Training accuracy: 0.38


In [8]:
# Accuracy on the validate split, left unrounded and shown as the cell output.
model.score(X=X_validate, y=y_validate)

0.38317757009345793

In [9]:

# Peek at the first five feature rows of the validate split (no `survived` column).
X_validate.head(n=5)

Unnamed: 0,passenger_id,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S
610,610,3,female,39.0,1,5,31.275,S,Third,Southampton,0,0,1
424,424,3,male,18.0,1,1,20.2125,S,Third,Southampton,0,0,1
568,568,3,male,29.916875,0,0,7.2292,C,Third,Cherbourg,1,0,0
701,701,1,male,35.0,0,0,26.2875,S,First,Southampton,1,0,1
101,101,3,male,29.916875,0,0,7.8958,S,Third,Southampton,1,0,1


In [10]:
# One prediction per validate row; the constant strategy makes every one of them 1.
model.predict(X=X_validate)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [11]:
# Attach the model's predictions for the training rows as a `prediction` column.
# assign() builds a new frame rather than writing a column into `train` in place;
# if the split handed back a slice of a larger frame, an in-place write would be
# a chained assignment, and the blanket warnings filter would hide pandas'
# SettingWithCopyWarning about it.
train = train.assign(prediction=model.predict(X_train))

In [12]:
# First five training rows, now including the `prediction` column.
train.head(n=5)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S,prediction
583,583,0,1,male,36.0,0,0,40.125,C,First,Cherbourg,1,0,0,1
337,337,1,1,female,41.0,0,0,134.5,C,First,Cherbourg,1,0,0,1
50,50,0,3,male,7.0,4,1,39.6875,S,Third,Southampton,0,0,1,1
218,218,1,1,female,32.0,0,0,76.2917,C,First,Cherbourg,1,0,0,1
31,31,1,1,female,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0,1


In [13]:
# use the column you just created and the actual values in the survived column
# to generate a classification report
print(classification_report(train.survived, train.prediction, zero_division=True))

              precision    recall  f1-score   support

           0       1.00      0.00      0.00       307
           1       0.38      1.00      0.55       190

    accuracy                           0.38       497
   macro avg       0.69      0.50      0.28       497
weighted avg       0.76      0.38      0.21       497



In [14]:
# Same report as a DataFrame: output_dict=True returns nested dicts, and
# transpose() puts one class / average per row with the metrics as columns.
# zero_division=0 is passed explicitly: class 0 is never predicted, so its
# precision is undefined, and the default ('warn') would fall back to 0 while
# raising an UndefinedMetricWarning that the blanket warnings filter hides.
# Note: the 'accuracy' row repeats the single accuracy value in every column,
# including `support`.
pd.DataFrame(
    classification_report(
        train.survived,
        train.prediction,
        zero_division=0,
        output_dict=True,
    )
).transpose()

Unnamed: 0,precision,recall,f1-score,support
0,0.0,0.0,0.0,307.0
1,0.382294,1.0,0.55313,190.0
accuracy,0.382294,0.382294,0.382294,0.382294
macro avg,0.191147,0.5,0.276565,497.0
weighted avg,0.146149,0.382294,0.211458,497.0
