In [50]:
import pandas as pd

In [None]:
# read the csv files and convert them into python data frames that we can work with
data = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# test_ids is used at the very end of the exercise once we have a prediction
# for each row. we use the passenger ids to indicate the results of the model
test_passenger_ids = test["PassengerId"]

In [None]:
# clean the data and limit to the "features" (column names) we want to use
# in the Logistic Regression classifier below.
def clean(data):
    # drop the following columns, as they will not be used in classifying
    # whether a passenger lived or died. cabin would actually be a good
    # thing to consider, if we could know where the cabin was located on
    # the ship. for example, do cabins closer to lifeboats, or further away
    # from point of impact (etc) correlate with survival rates?
    data = data.drop(["Ticket", "Cabin", "Name", "PassengerId"], axis=1)
    
    # these columns have some missing values. let's call them out
    # and act on them in the loop below
    cols = ["SibSp", "Parch", "Fare", "Age"]

    # if the cell value is empty, fill it with that column's median
    # value. i have mixed feelings about doing this. i guess it 
    # wouldn't throw off or skew the Logistic Regression classifier,
    # significantly, but it seems like it would reduce the variability 
    # of the data. is it enough to be a problem?
    for col in cols:
        data.fillna({col: data[col].median()}, inplace=True)
    
    # replace the Embarked column with a single U value if it is unknown
    # could this be done in the for loop? or is it better to do all the
    # other data cleaning steps outside of a for loop? just seems weird
    # to have some clean steps in a for loop, and others outside the loop
    data['Embarked'] = data["Embarked"].fillna("U")
    return data

# use and reuse the clean function to clean the data for both sets
data = clean(data)
test = clean(test)

# preview the cleaned data
data.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [None]:
from sklearn import preprocessing

# get an instance of LabelEncoder to use on Sex and Embarked
# this will assign 0 or 1 to male and female. why do we 
# need to do this? is it a performance thing?
le = preprocessing.LabelEncoder()

cols = ["Sex", "Embarked"]

for col in cols:
    data[col] = le.fit_transform(data[col])
    test[col] = le.fit_transform(test[col])
    # le.classes_ is the unique values that the encoder has 
    # identified and will transform into more readable values
    # that will be used in the modeling
    print(le.classes_)

# check out the first 10 rows of the transformed data
data.head(10)


['female' 'male']
['C' 'Q' 'S']


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2
5,0,3,1,28.0,0,0,8.4583,1
6,0,1,1,54.0,0,0,51.8625,2
7,0,3,1,2.0,3,1,21.075,2
8,1,3,0,27.0,0,2,11.1333,2
9,1,2,0,14.0,1,0,30.0708,0


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# make a copy of the survived column off of the data frame
y = data["Survived"]
# once we have a copy, drop the column before we pass it into the training
X = data.drop("Survived", axis=1)

# i don't really understand what is happening here. 
# what does train_test_split do??
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# this seems to be the real heart of the model. LogisticRegression takes in
# the training data, which includes the `Survived` column. this is important
# because we need to start to draw conclusions from the characteristics of 
# those who survived and those who did not. use `clf` as a convention that
# is an abbreviation for "classifier"

# todo: figure out what .fit does under the hood
clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, y_train)

# printing the feature coefficients shows that the strongest indicator of
# survival was "Sex", which had a coefficient of 2.6. 
print("Feature Coefficients:", dict(zip(X.columns, clf.coef_[0])))

# todo: look into what intercept is
print("Intercept:", clf.intercept_)

Feature Coefficients: {'Pclass': np.float64(-0.9314377443658908), 'Sex': np.float64(-2.5987985416779416), 'Age': np.float64(-0.0304056126976557), 'SibSp': np.float64(-0.2946358116063582), 'Parch': np.float64(-0.11276422694591964), 'Fare': np.float64(0.0025629336135515324), 'Embarked': np.float64(-0.21266513877158785)}
Intercept: [4.54950335]


In [None]:
# todo: figure out what this is doing under the hood
predictions = clf.predict(X_val)
from sklearn.metrics import accuracy_score

# i am guessing this is just a direct comparison of the correct
# values against the model's predictions.
accuracy_score(y_val, predictions)

0.8100558659217877

In [None]:
# todo: how does submission_preds know that it should be a 0 or a 1?
submission_preds = clf.predict(test)

In [None]:
# create a data frame with two columns, PassengerId and Survived
# test_passenger_ids.values is a list of passenger ids that was
# declared in the first codeblock at the top of this notebook.
# submission_preds is a list of 0s and 1s indicating the results of
# the predictive model
df = pd.DataFrame({ "PassengerId": test_passenger_ids.values, "Survived": submission_preds })

In [None]:
# export the data frame into a csv. 
# index=False excludes index values from being added
# to each row
df.to_csv("submission.csv", index=False)