In [1]:
import numpy as np 
import pandas as pd 
import warnings


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


warnings.filterwarnings('ignore')

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
train = pd.read_csv('/kaggle/input/titanic/train.csv')
train.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
281,282,0,3,"Olsson, Mr. Nils Johan Goransson",male,28.0,0,0,347464,7.8542,,S
214,215,0,3,"Kiernan, Mr. Philip",male,,1,0,367229,7.75,,Q
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
724,725,1,1,"Chambers, Mr. Norman Campbell",male,27.0,1,0,113806,53.1,E8,S
410,411,0,3,"Sdycoff, Mr. Todor",male,,0,0,349222,7.8958,,S


In [3]:
test = pd.read_csv('/kaggle/input/titanic/test.csv')
test_pids = test["PassengerId"]

In [4]:
train['Title'] = train['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
test['Title'] = test['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

In [5]:
title_mapping = {
    
    "Mr": "Mr",
    "Miss": "Miss",
    "Mrs": "Mrs",
    "Master": "Master",
    
    "Dr": "Professional",
    "Rev": "Professional",
    "Col": "Professional",
    "Major": "Professional",
    "Capt": "Professional",
    
    "Countess": "Nobility",
    "Lady": "Nobility",
    "Sir": "Nobility",
    "Don": "Nobility",
    "Jonkheer": "Nobility",
    
    "Mlle": "Miss",  
    "Mme": "Mrs",    
    "Ms": "Mrs",     
    "Dona": "Nobility"  
}


train['Title'] = train['Title'].map(title_mapping)
test['Title'] = test['Title'].map(title_mapping)

print(train['Title'].value_counts())
print(test['Title'].value_counts())

Title
Mr              517
Miss            184
Mrs             127
Master           40
Professional     18
Nobility          5
Name: count, dtype: int64
Title
Mr              240
Miss             78
Mrs              73
Master           21
Professional      5
Nobility          1
Name: count, dtype: int64


In [6]:
def clean(data):
    data = data.drop(["Ticket", "PassengerId", "Name", "Cabin"], axis=1)
    
    cols = ["SibSp", "Parch", "Fare", "Age"]
    for col in cols:
        data[col].fillna(data[col].mean(), inplace=True)
        
    data['Embarked'].fillna("U", inplace=True)
    return data

train = clean(train)
test = clean(test)

In [7]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
cols = ["Sex", "Title", "Embarked"]

for col in cols:
    train[col] = label_encoder.fit_transform(train[col])
    test[col] = label_encoder.fit_transform(test[col])
    print(label_encoder.classes_)

train.sample(5)

['female' 'male']
['Master' 'Miss' 'Mr' 'Mrs' 'Nobility' 'Professional']
['C' 'Q' 'S']


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
822,0,1,1,38.0,0,0,0.0,2,4
530,1,2,0,2.0,1,1,26.0,2,1
551,0,2,1,27.0,0,0,26.0,2,2
75,0,3,1,25.0,0,0,7.65,2,2
655,0,2,1,24.0,2,0,73.5,2,2


In [8]:
train.loc[:, 'Pclass':'Title']

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,3,1,22.000000,1,0,7.2500,2,2
1,1,0,38.000000,1,0,71.2833,0,3
2,3,0,26.000000,0,0,7.9250,2,1
3,1,0,35.000000,1,0,53.1000,2,3
4,3,1,35.000000,0,0,8.0500,2,2
...,...,...,...,...,...,...,...,...
886,2,1,27.000000,0,0,13.0000,2,5
887,1,0,19.000000,0,0,30.0000,2,1
888,3,0,29.699118,1,2,23.4500,2,1
889,1,1,26.000000,0,0,30.0000,0,2


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

y = train["Survived"]
X = train.drop("Survived", axis=1)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
lr = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, y_train)

In [11]:
print(lr.intercept_)
print(lr.coef_)

[4.63699095]
[[-9.34444069e-01 -2.59652160e+00 -2.97138313e-02 -2.98634230e-01
  -1.12800769e-01  2.49026278e-03 -2.17958904e-01 -4.12244515e-02]]


In [12]:
predictions = lr.predict(X_val)
from sklearn.metrics import accuracy_score
accuracy_score(y_val, predictions)

0.8100558659217877

In [13]:
submission = lr.predict(test)

In [14]:
submission_df = pd.DataFrame({"PassengerId": test_pids.values,
                   "Survived": submission,
                  })

In [15]:
submission_df.to_csv("/kaggle/working/submission.csv", index=False)