<a href="https://www.kaggle.com/code/jibonk/titanic-logistic-regrssion?scriptVersionId=242874463" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np 
import pandas as pd 
import warnings


import os
# Collect every file path under the Kaggle input directory, then echo them so
# the available datasets are visible in the cell output.
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for input_path in input_paths:
    print(input_path)


# Silence all library warnings for the remainder of the notebook.
warnings.filterwarnings('ignore')

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Read the labelled training split and show five randomly chosen rows.
train_csv = '/kaggle/input/titanic/train.csv'
train = pd.read_csv(train_csv)
train.sample(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
801,802,1,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31.0,1,1,C.A. 31921,26.25,,S
590,591,0,3,"Rintamaki, Mr. Matti",male,35.0,0,0,STON/O 2. 3101273,7.125,,S
592,593,0,3,"Elsbury, Mr. William James",male,47.0,0,0,A/5 3902,7.25,,S
15,16,1,2,"Hewlett, Mrs. (Mary D Kingcome)",female,55.0,0,0,248706,16.0,,S
512,513,1,1,"McGough, Mr. James Robert",male,36.0,0,0,PC 17473,26.2875,E25,S


In [3]:
# Read the unlabelled test split. PassengerId is set aside now because it is
# needed again later when the submission frame is assembled.
test_csv = '/kaggle/input/titanic/test.csv'
test = pd.read_csv(test_csv)
test_pids = test.loc[:, "PassengerId"]

In [4]:
# Pull the honorific out of each name: the run of letters that follows a space
# and ends with a period (e.g. "Mr" in "Rintamaki, Mr. Matti").
# The pattern is a raw string so `\.` reaches the regex engine as an escaped dot
# rather than being parsed as an (invalid) Python string escape sequence.
TITLE_PATTERN = r' ([A-Za-z]+)\.'
train['Title'] = train['Name'].str.extract(TITLE_PATTERN, expand=False)
test['Title'] = test['Name'].str.extract(TITLE_PATTERN, expand=False)

In [5]:
# Each coarse title group and the raw honorifics that are folded into it.
title_groups = {
    "Mr": ("Mr",),
    "Miss": ("Miss", "Mlle"),
    "Mrs": ("Mrs", "Mme", "Ms"),
    "Master": ("Master",),
    "Professional": ("Dr", "Rev", "Col", "Major", "Capt"),
    "Nobility": ("Countess", "Lady", "Sir", "Don", "Jonkheer", "Dona"),
}

# Invert the table into a flat raw-title -> group lookup.
title_mapping = {
    raw_title: group
    for group, raw_titles in title_groups.items()
    for raw_title in raw_titles
}

# Replace the raw title with its group in both frames; a title that is not in
# the lookup becomes NaN.
train['Title'] = train['Title'].map(title_mapping)
test['Title'] = test['Title'].map(title_mapping)

# Show how many passengers fall into each group, train first and then test.
for frame in (train, test):
    print(frame['Title'].value_counts())

Title
Mr              517
Miss            184
Mrs             127
Master           40
Professional     18
Nobility          5
Name: count, dtype: int64
Title
Mr              240
Miss             78
Mrs              73
Master           21
Professional      5
Nobility          1
Name: count, dtype: int64


In [6]:
def clean(data):
    """Drop unused columns and fill missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns "Ticket", "PassengerId", "Name",
        "Cabin", "SibSp", "Parch", "Fare", "Age" and "Embarked".

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) without "Ticket",
        "PassengerId", "Name" and "Cabin", in which missing "SibSp", "Parch",
        "Fare" and "Age" values are replaced by that column's median computed
        from ``data`` itself, and missing "Embarked" values are replaced by "U".

    Raises
    ------
    KeyError
        If any of the columns to drop or fill is absent from ``data``.
    """
    data = data.drop(["Ticket", "PassengerId", "Name", "Cabin"], axis=1)

    cols = ["SibSp", "Parch", "Fare", "Age"]
    for col in cols:
        # Assign the filled column back instead of calling
        # `data[col].fillna(..., inplace=True)`: the in-place form operates on a
        # temporary Series and does not update `data` under pandas Copy-on-Write.
        data[col] = data[col].fillna(data[col].median())

    data['Embarked'] = data['Embarked'].fillna("U")
    return data

# Rebind both names to their cleaned frames. Each frame is cleaned on its own,
# so the medians used for filling come from that frame's own values.
# NOTE: not idempotent. clean() drops columns by name, so running this cell a
# second time on the already-cleaned frames raises a KeyError.
train = clean(train)
test = clean(test)

In [7]:
from sklearn import preprocessing

# Integer-encode the categorical columns.
# One encoder is fitted per column on the training frame only and then reused to
# transform the test frame, so a category always gets the same integer code in
# both frames. Fitting separately on each frame only gives matching codes when
# both happen to contain the same set of categories.
# `transform` raises ValueError if the test frame holds a category that never
# occurs in the training frame, instead of silently assigning it a shifted code.
cols = ["Sex", "Title", "Embarked"]
label_encoders = {}  # column name -> fitted encoder, kept for later inspection

for col in cols:
    label_encoder = preprocessing.LabelEncoder()
    train[col] = label_encoder.fit_transform(train[col])
    test[col] = label_encoder.transform(test[col])
    label_encoders[col] = label_encoder
    # Position in classes_ is the integer code assigned to that category.
    print(label_encoder.classes_)

train.sample(5)

['female' 'male']
['Master' 'Miss' 'Mr' 'Mrs' 'Nobility' 'Professional']
['C' 'Q' 'S']


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
188,0,3,1,40.0,1,1,15.5,1,2
732,0,2,1,28.0,0,0,0.0,2,2
721,0,3,1,17.0,1,0,7.0542,2,2
498,0,1,0,25.0,1,2,151.55,2,3
213,0,2,1,30.0,0,0,13.0,2,2


In [8]:
# Display the label-sliced column range from 'Pclass' through 'Title' (inclusive).
feature_preview = train.loc[:, 'Pclass':'Title']
feature_preview

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,3,1,22.0,1,0,7.2500,2,2
1,1,0,38.0,1,0,71.2833,0,3
2,3,0,26.0,0,0,7.9250,2,1
3,1,0,35.0,1,0,53.1000,2,3
4,3,1,35.0,0,0,8.0500,2,2
...,...,...,...,...,...,...,...,...
886,2,1,27.0,0,0,13.0000,2,5
887,1,0,19.0,0,0,30.0000,2,1
888,3,0,28.0,1,2,23.4500,2,1
889,1,1,26.0,0,0,30.0000,0,2


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is "Survived".
X = train.drop(columns="Survived")
y = train["Survived"]

# Hold out 20% of the rows for validation, with a fixed seed for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Fit a logistic regression on the training portion of the split.
lr_params = {"random_state": 0, "max_iter": 1000}
lr = LogisticRegression(**lr_params).fit(X_train, y_train)

In [11]:
# Print the fitted intercept followed by the per-feature coefficients.
for fitted_values in (lr.intercept_, lr.coef_):
    print(fitted_values)

[4.60276931]
[[-9.35239812e-01 -2.59545131e+00 -2.88505982e-02 -2.96988685e-01
  -1.08264035e-01  2.49938959e-03 -2.13889045e-01 -4.59006401e-02]]


In [12]:
from sklearn.metrics import accuracy_score

# Score the model on the held-out validation rows; the accuracy is the cell output.
predictions = lr.predict(X_val)
accuracy_score(y_true=y_val, y_pred=predictions)

0.8100558659217877

In [13]:
# Predict on the test frame, selecting the training feature columns explicitly
# (same names, same order as X) so the prediction does not depend on how the
# columns of `test` happen to be laid out. A missing column raises a KeyError.
submission = lr.predict(test[X.columns])

In [14]:
# Pair each saved passenger id with its predicted label.
submission_columns = {
    "PassengerId": test_pids.to_numpy(),
    "Survived": submission,
}
submission_df = pd.DataFrame(submission_columns)

In [15]:
# Write the submission file without the index column.
submission_path = "/kaggle/working/submission.csv"
submission_df.to_csv(submission_path, index=False)