In [3]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# Set Up Data Frame

In [4]:
df = pd.read_csv('/Users/jenny/Downloads/Titanic-Dataset.csv')
pd.set_option("display.max_columns", 12)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Clean Dataset

## Irrelevant Variables

In [5]:
df["Ticket"].value_counts()

Ticket
347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: count, Length: 681, dtype: int64

Variables like "PassengerId", "Name", and "Ticket" have unique values for each passenger or less than 10 frequencies per count with the majority being unique; thus, they don't serve to provide significance in determining whether a passenger would survive or not.

In [6]:
df.drop(columns = ["PassengerId", "Name", "Ticket"], inplace = True)
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.2500,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.9250,,S
3,1,1,female,35.0,1,0,53.1000,C123,S
4,0,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,,S
887,1,1,female,19.0,0,0,30.0000,B42,S
888,0,3,female,,1,2,23.4500,,S
889,1,1,male,26.0,0,0,30.0000,C148,C


## Missing Values

In [7]:
df.dropna(how= 'all', inplace = True)
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.2500,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.9250,,S
3,1,1,female,35.0,1,0,53.1000,C123,S
4,0,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,,S
887,1,1,female,19.0,0,0,30.0000,B42,S
888,0,3,female,,1,2,23.4500,,S
889,1,1,male,26.0,0,0,30.0000,C148,C


Given that the resulting dataframe has the same number of rows before and after running the code, there is no observation / row that consists of missing values for all columns.

In [1]:
df.info()

NameError: name 'df' is not defined

The code shows that "Age", "Cabin", and "Embarked" columns have missing values.
"Age" has 177 null values, "Cabin" has 687 null values, and "Embarked" has 2 null values. 
Because there is a significantly large amount of null values for "Age" and "Cabin", there must be additional analyses to whether having a null value affects survival rate or not. In the case of "Embarked", there are only two null values, which can be removed.

In [9]:
df = df.dropna(subset=['Embarked'])
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.2500,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.9250,,S
3,1,1,female,35.0,1,0,53.1000,C123,S
4,0,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,,S
887,1,1,female,19.0,0,0,30.0000,B42,S
888,0,3,female,,1,2,23.4500,,S
889,1,1,male,26.0,0,0,30.0000,C148,C


# Data Analysis

## Pclass (Ticket Class)

In [10]:
pclassda = df[["Pclass", "Survived"]]
pc = pclassda.groupby(["Pclass", "Survived"])
pclassda = pc.value_counts().reset_index()
pctotal = pclassda.groupby("Pclass")["count"].transform("sum")
pclassda["Percentage"] = pclassda["count"].div(pctotal).mul(100)
pclassdayes = (pclassda["Survived"] == 1)
pclassda[pclassdayes].sort_values(by = "Percentage", ascending = False)

Unnamed: 0,Pclass,Survived,count,Percentage
1,1,1,134,62.616822
3,2,1,87,47.282609
5,3,1,119,24.236253


There is an evident pattern where the higher the Pclass, 1 as the highest, the higher the survival rate. Thus, there is a direct relationship between Pclass and survival rate. This can spur explanations such as giving priority or access to safety methods to richer people, receiving closer rooms to survival boats and escape, etc.

## Sex

In [11]:
sexda = df[["Sex", "Survived"]]
sex = sexda.groupby(["Sex", "Survived"])
sexda = sex.value_counts().reset_index()
sextotal = sexda.groupby("Sex")["count"].transform("sum")
sexda["Percentage"] = sexda["count"].div(sextotal).mul(100)
sexdayes = (sexda["Survived"] == 1)
sexda[sexdayes].sort_values(by = "Percentage", ascending = False)

Unnamed: 0,Sex,Survived,count,Percentage
1,female,1,231,74.038462
3,male,1,109,18.890815


There is statistically significant difference between the survival rate of two genders—this is not surprising given that women and children were given priority to the safety boats. 

## Age

In [12]:
agenull = (df["Age"].isnull())
agenulldf = df[agenull]
agenulldf.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,177.0,177.0,0.0,177.0,177.0,177.0
mean,0.293785,2.59887,,0.564972,0.180791,22.158567
std,0.456787,0.763216,,1.626316,0.534145,31.874608
min,0.0,1.0,,0.0,0.0,0.0
25%,0.0,3.0,,0.0,0.0,7.75
50%,0.0,3.0,,0.0,0.0,8.05
75%,1.0,3.0,,0.0,0.0,24.15
max,1.0,3.0,,8.0,2.0,227.525


In [13]:
agepresent = (df["Age"].isnull() == False)
agepresentdf = df[agepresent]
agepresentdf.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,712.0,712.0,712.0,712.0,712.0,712.0
mean,0.404494,2.240169,29.642093,0.514045,0.432584,34.567251
std,0.491139,0.836854,14.492933,0.930692,0.854181,52.938648
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,1.0,20.0,0.0,0.0,8.05
50%,0.0,2.0,28.0,0.0,0.0,15.64585
75%,1.0,3.0,38.0,1.0,1.0,33.0
max,1.0,3.0,80.0,5.0,6.0,512.3292


As the code above shows, analyzing survival rates based on if the age was a null value or not showed large differences: only 29% of no-age observations survived, while 40% of yes-age observations survived. Thus, it is unreasonable to elimintae "Age" as one of the explanatory factors to survival rate.

In [14]:
ageda = df[["Age", "Survived"]]
ageda.loc[(ageda["Age"] <= 10), "Age"] = 1
ageda.loc[(ageda["Age"] <= 20) & (ageda["Age"] > 10), "Age"] = 2
ageda.loc[(ageda["Age"] <= 30) & (ageda["Age"] > 20), "Age"] = 3
ageda.loc[(ageda["Age"] <= 40) & (ageda["Age"] > 30), "Age"] = 4
ageda.loc[(ageda["Age"] <= 50) & (ageda["Age"] > 40), "Age"] = 5
ageda.loc[(ageda["Age"] <= 60) & (ageda["Age"] > 50), "Age"] = 6
ageda.loc[(ageda["Age"] <= 70) & (ageda["Age"] > 60), "Age"] = 7
ageda.loc[(ageda["Age"] <= 80) & (ageda["Age"] > 70), "Age"] = 8

In [15]:
age = ageda.groupby(["Age", "Survived"])
ageda = age.value_counts().reset_index()
agetotal = ageda.groupby("Age")["count"].transform("sum")
ageda["Percentage"] = ageda["count"].div(agetotal).mul(100)
agedayes = (ageda["Survived"] == 1)
ageda[agedayes].sort_values(by = "Percentage", ascending = False)

Unnamed: 0,Age,Survived,count,Percentage
1,1.0,1,38,59.375
7,4.0,1,68,44.155844
11,6.0,1,17,40.47619
9,5.0,1,33,38.372093
3,2.0,1,44,38.26087
5,3.0,1,84,36.521739
15,8.0,1,1,20.0
13,7.0,1,3,18.75


The ages were initially broken up into intervals of 10 years of age to facilitate the analysis process. Though there isn't a linear pattern, the older groups had a relatively small survival rate, and the kids (group 1) had the highest survival rate. As mentioned above, kids and women were given the highest priority; thus, the youngest group of kids and group 4, presumably average ages of mothers, would've had the highest survival rate.

## Siblings and Spouses

In [16]:
sibda = df[["SibSp", "Survived"]]
sib = sibda.groupby(["SibSp", "Survived"])
sibda = sib.value_counts().reset_index()
sibtotal = sibda.groupby("SibSp")["count"].transform("sum")
sibda["Percentage"] = sibda["count"].div(sibtotal).mul(100)
sibdayes = (sibda["Survived"] == 1)
sibda[sibdayes].sort_values(by = "Percentage", ascending = False)

Unnamed: 0,SibSp,Survived,count,Percentage
3,1,1,112,53.588517
5,2,1,13,46.428571
1,0,1,208,34.323432
7,3,1,4,25.0
9,4,1,3,16.666667


Generally, those with more members in the family had a lower suvival rate, and those with less or no family members had a higher survival rate. It could be assumed that the more family members one had, the more they had to spend time and energy in helping others, which could've led to one's own death.

# Parents and Children

In [17]:
pchda = df[["Parch", "Survived"]]
pch = pchda.groupby(["Parch", "Survived"])
pchda = pch.value_counts().reset_index()
pchtotal = pchda.groupby("Parch")["count"].transform("sum")
pchda["Percentage"] = pchda["count"].div(pchtotal).mul(100)
pchdayes = (pchda["Survived"] == 1)
pchda[pchdayes].sort_values(by = "Percentage", ascending = False)

Unnamed: 0,Parch,Survived,count,Percentage
7,3,1,3,60.0
3,1,1,65,55.084746
5,2,1,40,50.0
1,0,1,231,34.171598
10,5,1,1,20.0


There is a relatively inconsistent pattern as the survival rate highest to lowest goes from 3, 1, 2, 0, and 5. Additionally, the data sets are concentrated around 0, 1, 2 parents or children, making the survival rate for 3 or 5 parents / children to be relatively more extreme. 

## Cabin

In [18]:
cabinnull = (df["Cabin"].isnull())
cabinnulldf = df[cabinnull]
cabinnulldf.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,687.0,687.0,529.0,687.0,687.0,687.0
mean,0.299854,2.63901,27.555293,0.547307,0.365357,19.157325
std,0.458528,0.589602,13.472634,1.207492,0.827106,28.663343
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,19.0,0.0,0.0,7.8771
50%,0.0,3.0,26.0,0.0,0.0,10.5
75%,1.0,3.0,35.0,1.0,0.0,23.0
max,1.0,3.0,74.0,8.0,6.0,512.3292


In [19]:
cabinpresent = (df["Cabin"].isnull() == False)
cabinpresentdf = df[cabinpresent]
cabinpresentdf.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,202.0,202.0,183.0,202.0,202.0,202.0
mean,0.663366,1.19802,35.674426,0.445545,0.440594,76.103301
std,0.473732,0.528205,15.643866,0.63049,0.732294,74.759941
min,0.0,1.0,0.92,0.0,0.0,0.0
25%,0.0,1.0,24.0,0.0,0.0,28.959375
50%,1.0,1.0,36.0,0.0,0.0,55.0
75%,1.0,1.0,47.5,1.0,1.0,89.77605
max,1.0,3.0,80.0,3.0,4.0,512.3292


Survival rate of null cabin values are 30%, while that of non-null cabin values are 66%, showing the significance of whether there exists a cabin value or not. Thus, null values shouldn't be eliminated and kept with "Null" being one value for cabin.

In [20]:
df["Cabin"] = df["Cabin"].astype(str).str[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Cabin"] = df["Cabin"].astype(str).str[0]


In [21]:
cabda = df[["Cabin", "Survived"]]
cab = cabda.groupby(["Cabin", "Survived"])
cabda = cab.value_counts().reset_index()
cabtotal = cabda.groupby("Cabin")["count"].transform("sum")
cabda["Percentage"] = cabda["count"].div(cabtotal).mul(100)
cabdayes = (cabda["Survived"] == 1)
cabda[cabdayes].sort_values(by = "Percentage", ascending = False)

Unnamed: 0,Cabin,Survived,count,Percentage
7,D,1,25,75.757576
9,E,1,24,75.0
3,B,1,33,73.333333
11,F,1,8,61.538462
5,C,1,35,59.322034
13,G,1,2,50.0
1,A,1,7,46.666667
16,n,1,206,29.985444


Because there is no information regarding what each letter for the cabin represents, there is a significant difference between the survival rates, even excluding the null survival rates.

## Embarked

In [22]:
embda = df[["Embarked", "Survived"]]
emb = embda.groupby(["Embarked", "Survived"])
embda = emb.value_counts().reset_index()
embtotal = embda.groupby("Embarked")["count"].transform("sum")
embda["Percentage"] = embda["count"].div(embtotal).mul(100)
embdayes = (embda["Survived"] == 1)
embda[embdayes].sort_values(by = "Percentage", ascending = False)

Unnamed: 0,Embarked,Survived,count,Percentage
1,C,1,93,55.357143
3,Q,1,30,38.961039
5,S,1,217,33.695652


Logically speaking, it doesn't seem like destination would matter to the survival as it's more a matter of either they're on the boat or not. However, survival rate for Cherbourg is larger than those of the other two, which gives a reason to focus on this variable as well.

# Predictive Modeling

In [23]:
df.loc[(df["Age"].isna()), "Age"] = 0
df.loc[(df["Sex"] == "female"), "Sex"] = 1
df.loc[(df["Sex"] == "male"), "Sex"] = 2
df.loc[(df["Cabin"] == "A"), "Cabin"] = 1
df.loc[(df["Cabin"] == "B"), "Cabin"] = 2
df.loc[(df["Cabin"] == "C"), "Cabin"] = 3
df.loc[(df["Cabin"] == "D"), "Cabin"] = 4
df.loc[(df["Cabin"] == "E"), "Cabin"] = 5
df.loc[(df["Cabin"] == "F"), "Cabin"] = 6
df.loc[(df["Cabin"] == "G"), "Cabin"] = 7
df.loc[(df["Cabin"] == "T"), "Cabin"] = 8
df.loc[(df["Cabin"] == "n"), "Cabin"] = 9
df.loc[(df["Embarked"] == "C"), "Embarked"] = 1
df.loc[(df["Embarked"] == "Q"), "Embarked"] = 2
df.loc[(df["Embarked"] == "S"), "Embarked"] = 3

In [24]:
lgrg = LogisticRegression()

In [25]:
outcome = df.Survived
predictors = df.drop("Survived", axis = 1)

xtrain, xtest, ytrain, ytest = train_test_split(predictors, outcome, train_size = 0.8, random_state = 1)

lgrg.fit(xtrain, ytrain)
y_preds = lgrg.predict(xtest)
accuracy = accuracy_score(ytest, y_preds)
accuracy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.848314606741573

# Creating Inputs

In [26]:
knn = KNeighborsClassifier(n_neighbors = 5)

In [27]:
knn.fit(xtrain, ytrain)

In [28]:
# Person = [[Pclass, Sex, Age, SibSp, Parch, Fare, Cabin, Embarked]]
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,2,22.0,1,0,7.2500,9,3
1,1,1,1,38.0,1,0,71.2833,3,1
2,1,3,1,26.0,0,0,7.9250,9,3
3,1,1,1,35.0,1,0,53.1000,3,3
4,0,3,2,35.0,0,0,8.0500,9,3
...,...,...,...,...,...,...,...,...,...
886,0,2,2,27.0,0,0,13.0000,9,3
887,1,1,1,19.0,0,0,30.0000,2,3
888,0,3,1,0.0,1,2,23.4500,9,3
889,1,1,2,26.0,0,0,30.0000,3,1


In [29]:
person1 = np.array([[3, 1, 38, 2, 1, 62, 3, 1]])

In [30]:
prediction = knn.predict(person1)



In [31]:
print("Prediction: {}".format(prediction))

Prediction: [1]
