In [223]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

In [224]:
# Shared cross-validation splitter: 10 stratified folds, shuffled with a fixed
# seed so that repeated runs produce the same splits.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [225]:
# Load the labelled training rows and keep the target column aside.
data = pd.read_csv("train.csv")
y = data["Survived"]

# Stack the training rows on top of the test rows so feature engineering is
# applied to both at once. ignore_index=True gives every row a unique label
# (0..n-1); without it the test rows reuse the labels of the first training
# rows, and label-based operations such as DataFrame.drop(label) can then hit
# two rows at once.
test_data = pd.read_csv("test.csv")
data_df = pd.concat([data, test_data], axis=0, ignore_index=True)

data_df.shape

(1309, 12)

In [226]:
# Preview the first rows of the combined train+test frame.
data_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [227]:
# Names in the preview follow the pattern "Surname, Title. Given names".
# Title  = text between the first ", " and the next "."
# Family = text before the first ","
passenger_names = data_df["Name"]
data_df["Title"] = [full_name.split(", ")[1].split(".")[0] for full_name in passenger_names]
data_df["Family"] = [full_name.split(",")[0] for full_name in passenger_names]

data_df.head()

In [229]:
# adult males

# PassengerId of the first test row.
# NOTE(review): assumes PassengerIds are consecutive, start at 1 and list the
# training rows first -- confirm against the input files.
test_data_start_id = len(y) + 1

is_male = data_df["Sex"] == "male"
is_not_master = data_df["Title"] != "Master"
is_test_row = data_df["PassengerId"] >= test_data_start_id

adult_male_test_ids = data_df.loc[is_male & is_not_master & is_test_row, "PassengerId"]

# Convert PassengerIds into 0-based offsets from the first test PassengerId.
ids_of_adult_males_test_set = (adult_male_test_ids - test_data_start_id).tolist()

In [230]:
# Collapse rare honorifics into the common titles before using Title as the
# grouping key for age imputation.
# The Title extraction keeps everything between ", " and the first ".", so a
# name written as "X, the Countess. ..." yields 'the Countess' rather than
# 'Countess'; both spellings are mapped so that title is normalised either way.
mapping = {'Mlle': 'Miss', 'Major': 'Mr', 'Col': 'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Mme': 'Miss',
           'Jonkheer': 'Mr', 'Lady': 'Mrs', 'Capt': 'Mr', 'Countess': 'Mrs', 'the Countess': 'Mrs',
           'Ms': 'Miss', 'Dona': 'Mrs'}
# Explicit column assignment instead of an in-place replace on the whole frame.
data_df['Title'] = data_df['Title'].replace(mapping)
# Not referenced by any other visible cell; kept in case a later cell uses it.
titles = ['Dr', 'Master', 'Miss', 'Mr', 'Mrs', 'Rev']

# Fill missing ages with the median age of passengers sharing the same title.
data_df["Age"] = data_df["Age"].fillna(data_df.groupby(["Title"])["Age"].transform("median"))

# Title was only needed as the imputation key.
data_df = data_df.drop(columns='Title')
data_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Braund
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Heikkinen
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Futrelle
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Allen


In [231]:
data_df['Family_Size'] = data_df['Parch'] + data_df['SibSp']
# Assign the filled column back. Calling fillna(..., inplace=True) on the
# selected column is a chained in-place update that does not reach data_df
# when pandas Copy-on-Write is active.
data_df['Fare'] = data_df['Fare'].fillna(data_df['Fare'].mean())

# 0.5 means "no information"; it is overwritten with 1 or 0 below when another
# member of the same (Family, Fare) group has a known outcome.
data_df['Family_Survival'] = 0.5

for _, grp_df in data_df.groupby(['Family', 'Fare']):
    if len(grp_df) == 1:
        # A lone passenger carries no family information.
        continue

    # A Family group is found.
    for _, row in grp_df.iterrows():
        pass_id = row['PassengerId']
        # Outcomes of the *other* group members. Selecting by PassengerId
        # rather than dropping the row label keeps this correct even when row
        # labels are duplicated (presumably PassengerId is unique -- the
        # write-back below relies on the same assumption).
        others_survived = grp_df.loc[grp_df['PassengerId'] != pass_id, 'Survived']
        # max()/min() skip missing values; if every other member is unlabeled
        # both are NaN and the 0.5 default is kept.
        if others_survived.max() == 1.0:
            data_df.loc[data_df['PassengerId'] == pass_id, 'Family_Survival'] = 1
        elif others_survived.min() == 0.0:
            data_df.loc[data_df['PassengerId'] == pass_id, 'Family_Survival'] = 0

print("Number of passengers with family survival information:",
      data_df.loc[data_df['Family_Survival']!=0.5].shape[0])

Number of passengers with family survival information: 420


In [232]:
# Second pass: passengers sharing a ticket are treated as a travelling group.
# Only rows that are still 0 or 0.5 are revisited, so an existing 1 is never
# downgraded.
for _, grp_df in data_df.groupby('Ticket'):
    if len(grp_df) == 1:
        continue

    for _, row in grp_df.iterrows():
        if row['Family_Survival'] not in (0, 0.5):
            continue

        pass_id = row['PassengerId']
        # Outcomes of the other ticket holders. Selecting by PassengerId
        # rather than dropping the row label keeps this correct even when row
        # labels are duplicated (presumably PassengerId is unique -- the
        # write-back below relies on the same assumption).
        others_survived = grp_df.loc[grp_df['PassengerId'] != pass_id, 'Survived']
        if others_survived.max() == 1.0:
            data_df.loc[data_df['PassengerId'] == pass_id, 'Family_Survival'] = 1
        elif others_survived.min() == 0.0:
            data_df.loc[data_df['PassengerId'] == pass_id, 'Family_Survival'] = 0

print("Number of passenger with family/group survival information: "
      +str(data_df[data_df['Family_Survival']!=0.5].shape[0]))

Number of passenger with family/group survival information: 546


In [233]:
# Quantile-bin the continuous columns into ordinal codes
# (5 fare bins, 4 age bins; labels=False returns the bin number).
data_df['FareBin'] = pd.qcut(data_df['Fare'], 5, labels=False)
data_df['AgeBin'] = pd.qcut(data_df['Age'], 4, labels=False)

# Encode Sex numerically (male -> 0, female -> 1). The result is assigned back
# to the column: replace(..., inplace=True) on the selected column is a chained
# in-place update that does not reach data_df when pandas Copy-on-Write is
# active. Any value other than 'male'/'female' becomes NaN here.
data_df['Sex'] = data_df['Sex'].map({'male': 0, 'female': 1})

# Drop the raw columns that have been replaced by engineered features or are
# not used by the model.
data_df = data_df.drop(columns=['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin',
                                'Embarked', "Age", "Fare", "Family"])

data_df.head()

Unnamed: 0,Survived,Pclass,Sex,Family_Size,Family_Survival,FareBin,AgeBin
0,0.0,3,0,1,0.5,0,0
1,1.0,1,1,1,0.5,4,3
2,1.0,3,1,0,0.5,1,1
3,1.0,1,1,1,0.0,4,2
4,0.0,3,0,0,0.5,1,2


In [234]:
# Split the combined frame back into its two parts by position: the first
# len(y) rows are the labelled training rows, the remainder are the test rows.
n_train_rows = len(y)
X = data_df.iloc[:n_train_rows].drop("Survived", axis=1)
test_data = data_df.iloc[n_train_rows:].drop("Survived", axis=1)

(X.shape, test_data.shape)

((891, 6), (418, 6))

In [235]:
# Standardise the features, then classify with k-nearest neighbours.
knn_classifier = KNeighborsClassifier(n_neighbors=6, leaf_size=26)
model_roc = Pipeline(steps=[("scaler", StandardScaler()), ("model", knn_classifier)])

# Mean score over the folds of the shared splitter. No `scoring` argument is
# passed, so the pipeline's own default score method is what gets averaged.
fold_scores = cross_val_score(model_roc, X, y, cv=kfold)
fold_scores.mean()

0.8271161048689137

In [236]:
# Refit on the full training set and predict the test rows.
model_roc.fit(X, y)
y_pred = model_roc.predict(test_data)

# Rule-based override: every adult male in the test set is predicted as 0.
# The positions are put in a set so each membership check is O(1) instead of
# scanning the whole list once per prediction.
adult_male_positions = set(ids_of_adult_males_test_set)
y_pred = [0 if position in adult_male_positions else prediction
          for position, prediction in enumerate(y_pred)]

In [237]:
def make_submission(predictions, start_id=892, path="submission.csv"):
    """Write predictions to a two-column submission CSV.

    Parameters
    ----------
    predictions : sequence
        One predicted ``Survived`` value per test row, in test-set order.
    start_id : int, default 892
        PassengerId assigned to the first prediction; the following rows get
        consecutive ids.
    path : str, default "submission.csv"
        Destination file. An existing file is overwritten.

    The file has the columns ``PassengerId`` and ``Survived`` and no index
    column.
    """
    passenger_ids = range(start_id, start_id + len(predictions))
    predictions_df = pd.DataFrame(data={"PassengerId": passenger_ids, "Survived": predictions})
    predictions_df.to_csv(path, index=False)

# Write the final test-set predictions (after the adult-male override) to the submission file.
make_submission(y_pred)