In [525]:
import pandas as panda
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import OrdinalEncoder

In [526]:
train_file = panda.read_csv("../input/titanic/train.csv")
train_file.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [527]:
train_file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [528]:
train_file2 = train_file.drop(['Name', 'Ticket', 'Cabin'], axis=1)
train_file2.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [529]:
impu = SimpleImputer(strategy='mean')
titanic_numerical = train_file2.drop(['Embarked', 'Sex'], axis=1)
impu.fit(titanic_numerical)

SimpleImputer()

In [530]:
impu.statistics_

array([4.46000000e+02, 3.83838384e-01, 2.30864198e+00, 2.96991176e+01,
       5.23007856e-01, 3.81593715e-01, 3.22042080e+01])

In [531]:
d_f = impu.transform(titanic_numerical)
d_f[:5, :]

array([[ 1.    ,  0.    ,  3.    , 22.    ,  1.    ,  0.    ,  7.25  ],
       [ 2.    ,  1.    ,  1.    , 38.    ,  1.    ,  0.    , 71.2833],
       [ 3.    ,  1.    ,  3.    , 26.    ,  0.    ,  0.    ,  7.925 ],
       [ 4.    ,  1.    ,  1.    , 35.    ,  1.    ,  0.    , 53.1   ],
       [ 5.    ,  0.    ,  3.    , 35.    ,  0.    ,  0.    ,  8.05  ]])

In [532]:
t_transform = panda.DataFrame(d_f, columns=titanic_numerical.columns, index=titanic_numerical.index)
t_transform.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
0,1.0,0.0,3.0,22.0,1.0,0.0,7.25
1,2.0,1.0,1.0,38.0,1.0,0.0,71.2833
2,3.0,1.0,3.0,26.0,0.0,0.0,7.925
3,4.0,1.0,1.0,35.0,1.0,0.0,53.1
4,5.0,0.0,3.0,35.0,0.0,0.0,8.05


In [533]:
t_transform.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    float64
 1   Survived     891 non-null    float64
 2   Pclass       891 non-null    float64
 3   Age          891 non-null    float64
 4   SibSp        891 non-null    float64
 5   Parch        891 non-null    float64
 6   Fare         891 non-null    float64
dtypes: float64(7)
memory usage: 48.9 KB


In [534]:
d_f = t_transform.drop('Survived', axis=1)
d_f2 = t_transform['Survived'].copy()
d_f.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
0,1.0,3.0,22.0,1.0,0.0,7.25
1,2.0,1.0,38.0,1.0,0.0,71.2833
2,3.0,3.0,26.0,0.0,0.0,7.925
3,4.0,1.0,35.0,1.0,0.0,53.1
4,5.0,3.0,35.0,0.0,0.0,8.05


In [535]:
d_f2

0      0.0
1      1.0
2      1.0
3      1.0
4      0.0
      ... 
886    0.0
887    1.0
888    0.0
889    1.0
890    0.0
Name: Survived, Length: 891, dtype: float64

In [536]:
d_f_train, d_f_test, d_f2_train, d_f2_test = train_test_split(d_f, d_f2, test_size=0.20, random_state=42)
LR = LogisticRegression(max_iter=200,random_state=42).fit(d_f_train, d_f2_train)
LR.predict(d_f_test)

array([0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 1.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       1., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0.,
       0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0.,
       0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 1., 1., 0., 0., 0., 0.,
       0., 0., 1., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0.,
       1., 1., 1., 0., 0., 0., 0., 1., 0.])

In [537]:
LR_CV = LogisticRegressionCV(cv=5, max_iter=300).fit(d_f_train, d_f2_train)
LR_CV.predict(d_f_test)

array([0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       1., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0.,
       0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 1., 0., 0., 0., 0.,
       0., 0., 1., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0.,
       1., 1., 1., 0., 0., 0., 0., 0., 0.])

In [538]:
LR_CV.score(d_f_test, d_f2_test) 

0.7430167597765364

In [539]:
o_encoder = OrdinalEncoder()
titanic_genre = np.c_[train_file2['Sex']]
genre_encoded = o_encoder.fit_transform(titanic_genre)
genre_encoded[:5]

array([[1.],
       [0.],
       [0.],
       [0.],
       [1.]])

In [540]:
t_transform['Sex'] = genre_encoded 
t_transform.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex
0,1.0,0.0,3.0,22.0,1.0,0.0,7.25,1.0
1,2.0,1.0,1.0,38.0,1.0,0.0,71.2833,0.0
2,3.0,1.0,3.0,26.0,0.0,0.0,7.925,0.0
3,4.0,1.0,1.0,35.0,1.0,0.0,53.1,0.0
4,5.0,0.0,3.0,35.0,0.0,0.0,8.05,1.0


In [541]:
d_f_new = t_transform.drop(['Survived'], axis=1)
d_f2_new = t_transform['Survived'].copy()
d_f_train_new, d_f_test_new, d_f2_train_new, d_f2_test_new = train_test_split(d_f_new, d_f2_new, test_size=0.30, random_state=42)
d_f_train_new.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex
445,446.0,1.0,4.0,0.0,2.0,81.8583,1.0
650,651.0,3.0,29.699118,0.0,0.0,7.8958,1.0
172,173.0,3.0,1.0,1.0,1.0,11.1333,0.0
450,451.0,2.0,36.0,1.0,2.0,27.75,1.0
314,315.0,2.0,43.0,1.0,1.0,26.25,1.0


In [542]:
LR = LogisticRegression(max_iter=200).fit(d_f_train_new, d_f2_train_new)
LR.predict(d_f_test_new)

array([0., 0., 0., 1., 1., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1.,
       0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 0., 1., 0., 1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 1., 0., 0.,
       0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 1., 0., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1.,
       0., 0., 1., 1., 0., 0., 0., 1., 1., 1., 0., 1., 0., 0., 1., 0., 1.,
       1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0.,
       1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
       0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 1., 0., 0., 1., 1., 0.,
       0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0.,
       1., 0., 0., 0., 1.

In [543]:
LR.score(d_f_test_new, d_f2_test_new)

0.8097014925373134

In [544]:
test_file = panda.read_csv('/kaggle/input/titanic/test.csv')
test_file.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [545]:
test_file = test_file.drop(['Name', 'Ticket','Cabin', 'Embarked'], axis=1)
genre_cat_test_data = np.c_[test_file['Sex']]
test_data_genre_encoded = o_encoder.fit_transform(genre_cat_test_data)
test_data_genre_encoded[:5]

array([[1.],
       [0.],
       [1.],
       [1.],
       [0.]])

In [546]:
test_file_num_attr = test_file.drop(['Sex', 'PassengerId'], axis=1)
test_PId = test_file['PassengerId'].copy()
test_file_num_attr

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3,34.5,0,0,7.8292
1,3,47.0,1,0,7.0000
2,2,62.0,0,0,9.6875
3,3,27.0,0,0,8.6625
4,3,22.0,1,1,12.2875
...,...,...,...,...,...
413,3,,0,0,8.0500
414,1,39.0,0,0,108.9000
415,3,38.5,0,0,7.2500
416,3,,0,0,8.0500


In [547]:
test_data_impu = impu.fit_transform(test_file_num_attr)
test_data_impu

array([[ 3.        , 34.5       ,  0.        ,  0.        ,  7.8292    ],
       [ 3.        , 47.        ,  1.        ,  0.        ,  7.        ],
       [ 2.        , 62.        ,  0.        ,  0.        ,  9.6875    ],
       ...,
       [ 3.        , 38.5       ,  0.        ,  0.        ,  7.25      ],
       [ 3.        , 30.27259036,  0.        ,  0.        ,  8.05      ],
       [ 3.        , 30.27259036,  1.        ,  1.        , 22.3583    ]])

In [548]:
test_data_impu = panda.DataFrame(test_data_impu, columns=test_file_num_attr.columns, index=test_file_num_attr.index)
test_data_impu.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3.0,34.5,0.0,0.0,7.8292
1,3.0,47.0,1.0,0.0,7.0
2,2.0,62.0,0.0,0.0,9.6875
3,3.0,27.0,0.0,0.0,8.6625
4,3.0,22.0,1.0,1.0,12.2875


In [549]:
test_data_impu['Sex'] = test_data_genre_encoded
test_data_impu['PassengerId'] = test_PId
test_data_impu = test_data_impu[['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex']]
test_data_impu.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex
0,892,3.0,34.5,0.0,0.0,7.8292,1.0
1,893,3.0,47.0,1.0,0.0,7.0,0.0
2,894,2.0,62.0,0.0,0.0,9.6875,1.0
3,895,3.0,27.0,0.0,0.0,8.6625,1.0
4,896,3.0,22.0,1.0,1.0,12.2875,0.0


In [550]:
t_survived = LR.predict(test_data_impu)
t_survived[:20]

array([0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 1., 1.])

In [551]:
test_data_impu['Survived'] = t_survived
test_data_impu['Survived'] = test_data_impu['Survived'].astype(int)
test_data_impu.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex,Survived
0,892,3.0,34.5,0.0,0.0,7.8292,1.0,0
1,893,3.0,47.0,1.0,0.0,7.0,0.0,0
2,894,2.0,62.0,0.0,0.0,9.6875,1.0,0
3,895,3.0,27.0,0.0,0.0,8.6625,1.0,0
4,896,3.0,22.0,1.0,1.0,12.2875,0.0,1


In [552]:
last_test_data = test_data_impu.drop(['Pclass', 'Age', 'SibSp', 'Parch', 'Sex', 'Fare'], axis=1)
last_test_data.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [553]:
last_test_data.to_csv('final.csv', index=False)