In [888]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_log_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

train_data = pd.read_csv("train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [889]:
titles = []
for name in train_data["Name"]:
  txt1 = name.split(",")[1]
  txt2 = txt1.split(".")[0]
  txt2 = txt2.strip()
  titles.append(txt2)

titles

['Mr',
 'Mrs',
 'Miss',
 'Mrs',
 'Mr',
 'Mr',
 'Mr',
 'Master',
 'Mrs',
 'Mrs',
 'Miss',
 'Miss',
 'Mr',
 'Mr',
 'Miss',
 'Mrs',
 'Master',
 'Mr',
 'Mrs',
 'Mrs',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Miss',
 'Mrs',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Don',
 'Mrs',
 'Miss',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Miss',
 'Miss',
 'Mrs',
 'Mrs',
 'Mr',
 'Miss',
 'Miss',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Mrs',
 'Master',
 'Mr',
 'Mrs',
 'Mrs',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Miss',
 'Master',
 'Mr',
 'Miss',
 'Mr',
 'Master',
 'Mr',
 'Master',
 'Mrs',
 'Mr',
 'Miss',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Master',
 'Miss',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Miss',
 'Mrs',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mrs',
 'Mr',
 'Miss',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Miss',
 'Mr',
 'Miss',
 'Miss',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Master',
 'Mr',
 

In [890]:
X = train_data.drop(columns=["PassengerId", "Survived", "Name", "Ticket", "Embarked"])
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin
0,3,male,22.0,1,0,7.25,
1,1,female,38.0,1,0,71.2833,C85
2,3,female,26.0,0,0,7.925,
3,1,female,35.0,1,0,53.1,C123
4,3,male,35.0,0,0,8.05,


In [891]:
y = train_data["Survived"]
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [892]:
X["Title"] = pd.DataFrame(titles)
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Title
0,3,male,22.0,1,0,7.25,,Mr
1,1,female,38.0,1,0,71.2833,C85,Mrs
2,3,female,26.0,0,0,7.925,,Miss
3,1,female,35.0,1,0,53.1,C123,Mrs
4,3,male,35.0,0,0,8.05,,Mr


In [893]:
X["Title"].value_counts()

Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: Title, dtype: int64

In [894]:
common_titles = ["Mr", "Miss", "Mrs", "Master"]

for index in X.index:
  if X.loc[index, "Title"] not in common_titles:
    X.loc[index, "Title"] = "Other"

X["Title"].value_counts()

Mr        517
Miss      182
Mrs       125
Master     40
Other      27
Name: Title, dtype: int64

In [895]:
def print_nans(data):
  for col in data.columns:
    nans = data[col].isna().sum()
    if nans > 0:
      print(col + " " + str(nans))

print_nans(X)

Age 177
Cabin 687


In [896]:
X.groupby(['Sex','Title'])["Age"].mean()

Sex     Title 
female  Miss      21.773973
        Mrs       35.898148
        Other     32.857143
male    Master     4.574167
        Mr        32.368090
        Other     45.894737
Name: Age, dtype: float64

In [897]:
X.loc[5]

Pclass         3
Sex         male
Age          NaN
SibSp          0
Parch          0
Fare      8.4583
Cabin        NaN
Title         Mr
Name: 5, dtype: object

In [898]:
X["Age"] = X["Age"].fillna(X.groupby(['Sex','Title'])["Age"].transform("mean"))
X.loc[5]

Pclass           3
Sex           male
Age       32.36809
SibSp            0
Parch            0
Fare        8.4583
Cabin          NaN
Title           Mr
Name: 5, dtype: object

In [899]:
cabin_nans = X["Cabin"].isna()
cabin_nans = cabin_nans.map({True: 0, False: 1})
X["Cabin"] = cabin_nans
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Title
0,3,male,22.0,1,0,7.25,0,Mr
1,1,female,38.0,1,0,71.2833,1,Mrs
2,3,female,26.0,0,0,7.925,0,Miss
3,1,female,35.0,1,0,53.1,1,Mrs
4,3,male,35.0,0,0,8.05,0,Mr


In [900]:
categorical_cols = [col for col in X.columns if X[col].dtype == "object"]
polynomial_features = ["Fare"]

pol_features_transformer = Pipeline(
  steps=[
    ("imputer", SimpleImputer()),
    ("poly", PolynomialFeatures())
  ]
)

categorical_transformer = Pipeline(
  steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
  ]
)

preprocessor = ColumnTransformer(
  transformers=[
    ("num_poly", pol_features_transformer, polynomial_features),
    ("cat", categorical_transformer, categorical_cols)
  ],
  remainder="passthrough"
)

In [901]:
processed_data = pd.DataFrame(preprocessor.fit_transform(X))
processed_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1.0,7.25,52.5625,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,22.0,1.0,0.0,0.0
1,1.0,71.2833,5081.308859,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,38.0,1.0,0.0,1.0
2,1.0,7.925,62.805625,1.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,26.0,0.0,0.0,0.0
3,1.0,53.1,2819.61,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,35.0,1.0,0.0,1.0
4,1.0,8.05,64.8025,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,35.0,0.0,0.0,0.0


In [902]:
processed_data[5].sum()

40.0

In [903]:
processed_data.columns = ["fare^0", "fare", "fare^2", "is_female", "is_male", "is_master", "is_miss", "is_mr", "is_mrs", "is_other", "passenger_class", "age", "siblings", "parch", "had_cabin"]

In [904]:
# swap passenger class 1 with 3 so that the order is: higher = better
processed_data["passenger_class"] = processed_data["passenger_class"].map({3: 1, 1: 3, 2: 2})

processed_data.head()

Unnamed: 0,fare^0,fare,fare^2,is_female,is_male,is_master,is_miss,is_mr,is_mrs,is_other,passenger_class,age,siblings,parch,had_cabin
0,1.0,7.25,52.5625,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,22.0,1.0,0.0,0.0
1,1.0,71.2833,5081.308859,1.0,0.0,0.0,0.0,0.0,1.0,0.0,3,38.0,1.0,0.0,1.0
2,1.0,7.925,62.805625,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1,26.0,0.0,0.0,0.0
3,1.0,53.1,2819.61,1.0,0.0,0.0,0.0,0.0,1.0,0.0,3,35.0,1.0,0.0,1.0
4,1.0,8.05,64.8025,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,35.0,0.0,0.0,0.0


In [905]:
print(processed_data["age"].min())
processed_data["age"] = np.ceil(processed_data["age"])
print(processed_data["age"].min())

0.42
1.0


In [906]:
processed_data["binned_age"] = np.floor(processed_data["age"] / 10)
processed_data.head()

# this raised the score in both models

Unnamed: 0,fare^0,fare,fare^2,is_female,is_male,is_master,is_miss,is_mr,is_mrs,is_other,passenger_class,age,siblings,parch,had_cabin,binned_age
0,1.0,7.25,52.5625,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,22.0,1.0,0.0,0.0,2.0
1,1.0,71.2833,5081.308859,1.0,0.0,0.0,0.0,0.0,1.0,0.0,3,38.0,1.0,0.0,1.0,3.0
2,1.0,7.925,62.805625,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1,26.0,0.0,0.0,0.0,2.0
3,1.0,53.1,2819.61,1.0,0.0,0.0,0.0,0.0,1.0,0.0,3,35.0,1.0,0.0,1.0,3.0
4,1.0,8.05,64.8025,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,35.0,0.0,0.0,0.0,3.0


In [907]:
columns_to_scale = ["age", "fare", "fare^2"]
for col in columns_to_scale:
  scaled = StandardScaler().fit_transform(processed_data[[col]])
  processed_data[col] = pd.DataFrame(scaled)

processed_data.head()

Unnamed: 0,fare^0,fare,fare^2,is_female,is_male,is_master,is_miss,is_mr,is_mrs,is_other,passenger_class,age,siblings,parch,had_cabin,binned_age
0,1.0,-0.502445,-0.199305,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,-0.592321,1.0,0.0,0.0,2.0
1,1.0,0.786845,0.091101,1.0,0.0,0.0,0.0,0.0,1.0,0.0,3,0.612091,1.0,0.0,1.0,3.0
2,1.0,-0.488854,-0.198713,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1,-0.291218,0.0,0.0,0.0,2.0
3,1.0,0.42073,-0.03951,1.0,0.0,0.0,0.0,0.0,1.0,0.0,3,0.386263,1.0,0.0,1.0,3.0
4,1.0,-0.486337,-0.198598,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,0.386263,0.0,0.0,0.0,3.0


In [908]:
cols_to_drop = ["fare^0"]
processed_data.drop(columns=cols_to_drop, inplace=True)
processed_data.head()

Unnamed: 0,fare,fare^2,is_female,is_male,is_master,is_miss,is_mr,is_mrs,is_other,passenger_class,age,siblings,parch,had_cabin,binned_age
0,-0.502445,-0.199305,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,-0.592321,1.0,0.0,0.0,2.0
1,0.786845,0.091101,1.0,0.0,0.0,0.0,0.0,1.0,0.0,3,0.612091,1.0,0.0,1.0,3.0
2,-0.488854,-0.198713,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1,-0.291218,0.0,0.0,0.0,2.0
3,0.42073,-0.03951,1.0,0.0,0.0,0.0,0.0,1.0,0.0,3,0.386263,1.0,0.0,1.0,3.0
4,-0.486337,-0.198598,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,0.386263,0.0,0.0,0.0,3.0


In [909]:
#model = XGBClassifier(n_estimators=40, learning_rate=0.11, max_depth=10)
model = LogisticRegression(random_state=0, max_iter=1000, solver="liblinear")

means = []
for i in range(2, 11):
  scores = cross_val_score(model, processed_data, y, cv=i, scoring="accuracy")
  means.append(scores.mean())
  print(scores.mean())

print()
print(sum(means) / len(means))

0.8282838716178768
0.8271604938271605
0.824935361370339
0.8260059004456719
0.8248987242275834
0.8226588863892015
0.8237914253539254
0.823793490460157
0.8248813982521848

0.8251566168826777


In [910]:
model.fit(processed_data, y)

In [911]:
test_data = pd.read_csv("test.csv")
test_data.drop(columns=["PassengerId", "Ticket", "Embarked"], inplace=True)

test_data.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin
0,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,
3,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,


In [912]:
titles = []
for name in test_data["Name"]:
  txt1 = name.split(",")[1]
  txt2 = txt1.split(".")[0]
  txt2 = txt2.strip()
  titles.append(txt2)
test_data["Title"] = pd.DataFrame(titles)
common_titles = ["Mr", "Miss", "Mrs", "Master"]

for index in test_data.index:
  if test_data.loc[index, "Title"] not in common_titles:
    test_data.loc[index, "Title"] = "Other"

test_data["Age"] = test_data["Age"].fillna(test_data.groupby(['Sex','Title'])["Age"].transform("mean"))

test_data.drop(columns="Name", inplace=True)

cabin_nans = test_data["Cabin"].isna()
cabin_nans = cabin_nans.map({True: 0, False: 1})
test_data["Cabin"] = cabin_nans

test_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Title
0,3,male,34.5,0,0,7.8292,0,Mr
1,3,female,47.0,1,0,7.0,0,Mrs
2,2,male,62.0,0,0,9.6875,0,Mr
3,3,male,27.0,0,0,8.6625,0,Mr
4,3,female,22.0,1,1,12.2875,0,Mrs


In [913]:
print_nans(test_data)

Fare 1


In [914]:
test_data = pd.DataFrame(preprocessor.transform(test_data))
test_data[10] = test_data[10].map({3: 1, 1: 3, 2: 2})
test_data[11] = np.ceil(test_data[11])
test_data["binned_age"] = np.floor(test_data[11] / 10)

test_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,binned_age
0,1.0,7.8292,61.296373,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,35.0,0.0,0.0,0.0,3.0
1,1.0,7.0,49.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,47.0,1.0,0.0,0.0,4.0
2,1.0,9.6875,93.847656,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2,62.0,0.0,0.0,0.0,6.0
3,1.0,8.6625,75.038906,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,27.0,0.0,0.0,0.0,2.0
4,1.0,12.2875,150.982656,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,22.0,1.0,1.0,0.0,2.0


In [915]:
test_data.drop(columns=0, inplace=True)
test_data.columns = ["fare", "fare^2", "is_female", "is_male", "is_master", "is_miss", "is_mr", "is_mrs", "is_other", "passenger_class", "age", "siblings", "parch", "had_cabin", "binned_age"]

for col in columns_to_scale:
  scaled = StandardScaler().fit_transform(test_data[[col]])
  test_data[col] = pd.DataFrame(scaled)

test_data.head()

Unnamed: 0,fare,fare^2,is_female,is_male,is_master,is_miss,is_mr,is_mrs,is_other,passenger_class,age,siblings,parch,had_cabin,binned_age
0,-0.498258,-0.250235,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,0.364491,0.0,0.0,0.0,3.0
1,-0.513125,-0.250948,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1.289268,1.0,0.0,0.0,4.0
2,-0.46494,-0.248349,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2,2.44524,0.0,0.0,0.0,6.0
3,-0.483317,-0.249439,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,-0.252028,0.0,0.0,0.0,2.0
4,-0.418323,-0.245038,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,-0.637352,1.0,1.0,0.0,2.0


In [916]:
predictions = model.predict(test_data)
predictions

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [917]:
def make_submission(predictions):
  predictions_df = pd.DataFrame(data={"PassengerId": range(892, 892 + len(predictions)), "Survived": predictions})
  predictions_df.to_csv("submission.csv", index=False)

make_submission(predictions)