In [326]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split

In [327]:
titanic = sns.load_dataset('titanic')

In [328]:
titanic.to_csv("./titanic.csv",index=False)

In [329]:
titanic_copy = titanic.copy()

In [330]:
titanic_copy

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [331]:
titanic.shape

(891, 15)

In [332]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [333]:
# checking for null values
titanic.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [334]:
#checking for duplicated rows ! 
titanic.duplicated().sum()

107

In [335]:
#removing duplicated rows 
titanic.drop_duplicates(inplace=True)

In [336]:
titanic.duplicated().sum()

0

In [337]:

titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


<p>Duplicated data's has been removed, now lets see for the null values and what can we do about it ! </p>

In [338]:
(titanic.isna().sum()/titanic.shape[0]) * 100

survived        0.000000
pclass          0.000000
sex             0.000000
age            13.520408
sibsp           0.000000
parch           0.000000
fare            0.000000
embarked        0.255102
class           0.000000
who             0.000000
adult_male      0.000000
deck           74.234694
embark_town     0.255102
alive           0.000000
alone           0.000000
dtype: float64

##### we can remove the embarked and embark_town null values  , since to mee it seems like the 0.22 null values can be ignored

In [339]:
# removing embarked and embark_town columns ! 
titanic.dropna(subset=["embarked","embark_town"],inplace=True)

##### predicting age using regression method ! , thanks to deepseek ! :)


In [340]:
from sklearn.ensemble import RandomForestRegressor

# Split into known and unknown age data
known_age = titanic[titanic['age'].notna()]
unknown_age = titanic[titanic['age'].isna()]

# Train a model (example)
model = RandomForestRegressor()
X_train = known_age[['pclass', 'fare', 'sibsp', 'parch']]
y_train = known_age['age']
model.fit(X_train, y_train)

# Predict missing ages
X_test = unknown_age[['pclass', 'fare', 'sibsp', 'parch']]
predicted_ages = model.predict(X_test)
titanic.loc[titanic['age'].isna(), 'age'] = predicted_ages

In [341]:
titanic.isna().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         0
class            0
who              0
adult_male       0
deck           582
embark_town      0
alive            0
alone            0
dtype: int64

##### Now, let's use classification algorithm to mess up with this deck column !  But before that we need to check on some of the data's and carry on with preprocessing ! 


In [342]:
titanic['deck'].value_counts()

deck
C    59
B    44
D    33
E    32
A    15
F    13
G     4
Name: count, dtype: int64

In [343]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [344]:
titanic["class"].value_counts(),titanic["who"].value_counts(),titanic["sex"].value_counts(),titanic["who"].value_counts()

(class
 Third     405
 First     212
 Second    165
 Name: count, dtype: int64,
 who
 man      451
 woman    249
 child     82
 Name: count, dtype: int64,
 sex
 male      491
 female    291
 Name: count, dtype: int64,
 who
 man      451
 woman    249
 child     82
 Name: count, dtype: int64)

In [345]:
# creating a new column name adult_female based on her age , returns True if age >= 18 else, return false 
titanic['adult_female'] = (titanic['sex'] == 'female') & (titanic['age'] >= 18)

In [346]:
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,adult_female
0,0,3,male,22.000000,1,0,7.2500,S,Third,man,True,,Southampton,no,False,False
1,1,1,female,38.000000,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,True
2,1,3,female,26.000000,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True,True
3,1,1,female,35.000000,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False,True
4,0,3,male,35.000000,0,0,8.0500,S,Third,man,True,,Southampton,no,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,female,39.000000,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False,True
887,1,1,female,19.000000,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True,True
888,0,3,female,17.329417,1,2,23.4500,S,Third,woman,False,,Southampton,no,False,False
889,1,1,male,26.000000,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True,False


In [347]:
titanic["family_size"] = titanic["sibsp"] + titanic["parch"]

In [348]:
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,adult_female,family_size
0,0,3,male,22.000000,1,0,7.2500,S,Third,man,True,,Southampton,no,False,False,1
1,1,1,female,38.000000,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,True,1
2,1,3,female,26.000000,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True,True,0
3,1,1,female,35.000000,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False,True,1
4,0,3,male,35.000000,0,0,8.0500,S,Third,man,True,,Southampton,no,True,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,female,39.000000,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False,True,5
887,1,1,female,19.000000,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True,True,0
888,0,3,female,17.329417,1,2,23.4500,S,Third,woman,False,,Southampton,no,False,False,3
889,1,1,male,26.000000,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True,False,0


In [349]:
temp_df = titanic.copy()

#### we will be working with the copy of original data now, though i should have done this from the beginning ! 

In [350]:
temp_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,adult_female,family_size
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,False,1
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,True,1
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,True,0
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,True,1
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,False,0


In [351]:
temp_df["who"].value_counts()

who
man      451
woman    249
child     82
Name: count, dtype: int64

<p>We are changing categorical values into numerical values ! Using mapping,ordinalEncoder,OneHotEncoder whichever gives us the best options </p>

In [352]:
temp_df["sex_en"] = temp_df["sex"].map({'male' : 0,'female' : 1})
temp_df["who_en"] = temp_df["who"].map({'man' : 0,'woman' : 1,'child':2})
temp_df["adult_male_en"] = temp_df["adult_male"].astype(int)
temp_df["alive_en"] = temp_df["alive"].map({'no' : 0,'yes' : 1})
temp_df["alone_en"] = temp_df["alone"].astype(int)
temp_df["adult_female_en"] = temp_df["adult_female"].astype(int)
# temp_df["sex_en"] = temp_df["sex"].map({'male' : 0,'female' : 1})

In [353]:
temp_df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone', 'adult_female', 'family_size', 'sex_en', 'who_en',
       'adult_male_en', 'alive_en', 'alone_en', 'adult_female_en'],
      dtype='object')

##### USING ORDINAL ENCODING FOR  CLASS BECAUSE PRIORITY OF THIS COLUMN MATTERS 

In [354]:
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,LabelEncoder

In [355]:
temp_df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,alive,alone,adult_female,family_size,sex_en,who_en,adult_male_en,alive_en,alone_en,adult_female_en
885,0,3,female,39.0,0,5,29.125,Q,Third,woman,...,no,False,True,5,1,1,0,0,0,1
887,1,1,female,19.0,0,0,30.0,S,First,woman,...,yes,True,True,0,1,1,0,1,1,1
888,0,3,female,17.329417,1,2,23.45,S,Third,woman,...,no,False,False,3,1,1,0,0,0,0
889,1,1,male,26.0,0,0,30.0,C,First,man,...,yes,True,False,0,0,0,1,1,1,0
890,0,3,male,32.0,0,0,7.75,Q,Third,man,...,no,True,False,0,0,0,1,0,1,0


In [356]:
### using ordinalEncoder 
oe = OrdinalEncoder()
temp_df["class_en"] = oe.fit_transform(temp_df[['class']])

In [357]:
temp_df.sample(5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,alone,adult_female,family_size,sex_en,who_en,adult_male_en,alive_en,alone_en,adult_female_en,class_en
572,1,1,male,36.0,0,0,26.3875,S,First,man,...,True,False,0,0,0,1,1,1,0,0.0
190,1,2,female,32.0,0,0,13.0,S,Second,woman,...,True,True,0,1,1,0,1,1,1,1.0
310,1,1,female,24.0,0,0,83.1583,C,First,woman,...,True,True,0,1,1,0,1,1,1,0.0
852,0,3,female,9.0,1,1,15.2458,C,Third,child,...,False,False,2,1,2,0,0,0,0,2.0
801,1,2,female,31.0,1,1,26.25,S,Second,woman,...,False,True,2,1,1,0,1,0,1,1.0


In [358]:
temp_df["embarked"].value_counts()

embarked
S    568
C    155
Q     59
Name: count, dtype: int64

In [359]:
temp_df["embark_town"].value_counts()

embark_town
Southampton    568
Cherbourg      155
Queenstown      59
Name: count, dtype: int64

In [360]:
temp_df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,alone,adult_female,family_size,sex_en,who_en,adult_male_en,alive_en,alone_en,adult_female_en,class_en
0,0,3,male,22.000000,1,0,7.2500,S,Third,man,...,False,False,1,0,0,1,0,0,0,2.0
1,1,1,female,38.000000,1,0,71.2833,C,First,woman,...,False,True,1,1,1,0,1,0,1,0.0
2,1,3,female,26.000000,0,0,7.9250,S,Third,woman,...,True,True,0,1,1,0,1,1,1,2.0
3,1,1,female,35.000000,1,0,53.1000,S,First,woman,...,False,True,1,1,1,0,1,0,1,0.0
4,0,3,male,35.000000,0,0,8.0500,S,Third,man,...,True,False,0,0,0,1,0,1,0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,female,39.000000,0,5,29.1250,Q,Third,woman,...,False,True,5,1,1,0,0,0,1,2.0
887,1,1,female,19.000000,0,0,30.0000,S,First,woman,...,True,True,0,1,1,0,1,1,1,0.0
888,0,3,female,17.329417,1,2,23.4500,S,Third,woman,...,False,False,3,1,1,0,0,0,0,2.0
889,1,1,male,26.000000,0,0,30.0000,C,First,man,...,True,False,0,0,0,1,1,1,0,0.0


In [361]:
### USING LABEL FOR EMBARKED and EMBARK TOWN 

In [362]:

encoder = LabelEncoder()
temp_df["embarked_en"] = encoder.fit_transform(temp_df["embarked"])
temp_df["embark_town_en"] = encoder.fit_transform(temp_df["embark_town"])

In [363]:
temp_df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,family_size,sex_en,who_en,adult_male_en,alive_en,alone_en,adult_female_en,class_en,embarked_en,embark_town_en
885,0,3,female,39.0,0,5,29.125,Q,Third,woman,...,5,1,1,0,0,0,1,2.0,1,1
887,1,1,female,19.0,0,0,30.0,S,First,woman,...,0,1,1,0,1,1,1,0.0,2,2
888,0,3,female,17.329417,1,2,23.45,S,Third,woman,...,3,1,1,0,0,0,0,2.0,2,2
889,1,1,male,26.0,0,0,30.0,C,First,man,...,0,0,0,1,1,1,0,0.0,0,0
890,0,3,male,32.0,0,0,7.75,Q,Third,man,...,0,0,0,1,0,1,0,2.0,1,1


##### now let's create another df for just numerical values 

In [364]:
temp_df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone', 'adult_female', 'family_size', 'sex_en', 'who_en',
       'adult_male_en', 'alive_en', 'alone_en', 'adult_female_en', 'class_en',
       'embarked_en', 'embark_town_en'],
      dtype='object')

In [365]:
temp_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 782 entries, 0 to 890
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   survived         782 non-null    int64   
 1   pclass           782 non-null    int64   
 2   sex              782 non-null    object  
 3   age              782 non-null    float64 
 4   sibsp            782 non-null    int64   
 5   parch            782 non-null    int64   
 6   fare             782 non-null    float64 
 7   embarked         782 non-null    object  
 8   class            782 non-null    category
 9   who              782 non-null    object  
 10  adult_male       782 non-null    bool    
 11  deck             200 non-null    category
 12  embark_town      782 non-null    object  
 13  alive            782 non-null    object  
 14  alone            782 non-null    bool    
 15  adult_female     782 non-null    bool    
 16  family_size      782 non-null    int64   
 17  se

In [366]:
df_num = temp_df.select_dtypes(exclude=["bool",'object'])

In [367]:
df_num.drop(columns=["class"],inplace=True)

In [405]:
titanic.isna().sum()

survived          0
pclass            0
sex               0
age               0
sibsp             0
parch             0
fare              0
embarked          0
class             0
who               0
adult_male        0
deck            582
embark_town       0
alive             0
alone             0
adult_female      0
family_size       0
dtype: int64

In [406]:
df_num

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,deck,family_size,sex_en,who_en,adult_male_en,alive_en,alone_en,adult_female_en,class_en,embarked_en,embark_town_en
0,0,3,22.000000,1,0,7.2500,,1,0,0,1,0,0,0,2.0,2,2
1,1,1,38.000000,1,0,71.2833,C,1,1,1,0,1,0,1,0.0,0,0
2,1,3,26.000000,0,0,7.9250,,0,1,1,0,1,1,1,2.0,2,2
3,1,1,35.000000,1,0,53.1000,C,1,1,1,0,1,0,1,0.0,2,2
4,0,3,35.000000,0,0,8.0500,,0,0,0,1,0,1,0,2.0,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,39.000000,0,5,29.1250,,5,1,1,0,0,0,1,2.0,1,1
887,1,1,19.000000,0,0,30.0000,B,0,1,1,0,1,1,1,0.0,2,2
888,0,3,17.329417,1,2,23.4500,,3,1,1,0,0,0,0,2.0,2,2
889,1,1,26.000000,0,0,30.0000,C,0,0,0,1,1,1,0,0.0,0,0


In [407]:
from sklearn.feature_selection import SelectKBest,chi2

##### now lets separate the data into known and unknown colun

In [380]:
known_data = df_num[df_num["deck"].notna()]
unknown_data = df_num[df_num["deck"].isna()]

In [381]:
known_data.columns

Index(['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'deck',
       'family_size', 'sex_en', 'who_en', 'adult_male_en', 'alive_en',
       'alone_en', 'adult_female_en', 'class_en', 'embarked_en',
       'embark_town_en'],
      dtype='object')

In [408]:
known_data_x = known_data.loc[:,['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare',
       'family_size', 'sex_en', 'who_en', 'adult_male_en', 'alive_en',
       'alone_en', 'adult_female_en', 'class_en', 'embarked_en',
       'embark_town_en']]
known_data_y = known_data.loc[:,'deck']

In [417]:
selector = SelectKBest(score_func=chi2, k=5)
x_new = selector.fit_transform(known_data_x, known_data_y)

# Get selected feature names
selected_features = known_data_x.columns[selector.get_support()]
print("Top features:", selected_features)

Top features: Index(['pclass', 'age', 'fare', 'family_size', 'class_en'], dtype='object')


In [None]:
known_data_x = pd.DataFrame(known_data_x,columns=selected_features)

In [421]:
known_data_x["class_en"].value_counts()

class_en
0.0    172
1.0     16
2.0     12
Name: count, dtype: int64

In [424]:
known_data_y.value_counts()

deck
C    59
B    44
D    33
E    32
A    15
F    13
G     4
Name: count, dtype: int64

In [426]:
from imblearn.over_sampling import RandomOverSampler

In [428]:
ros = RandomOverSampler()
x,y = ros.fit_resample(known_data_x,known_data_y)

In [430]:
y.value_counts()

deck
A    59
B    59
C    59
D    59
E    59
F    59
G    59
Name: count, dtype: int64

In [465]:
x

Unnamed: 0,pclass,age,fare,family_size,class_en
0,1,38.0,71.2833,1,0.0
1,1,35.0,53.1000,1,0.0
2,1,54.0,51.8625,0,0.0
3,3,4.0,16.7000,2,2.0
4,1,58.0,26.5500,0,0.0
...,...,...,...,...,...
408,3,2.0,10.4625,1,2.0
409,3,4.0,16.7000,2,2.0
410,3,24.0,16.7000,2,2.0
411,3,4.0,16.7000,2,2.0


In [466]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=50,test_size=0.2)

In [476]:
for i in range(0,100):
    x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=i,test_size=0.1)
    model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    class_weight='balanced', 
    )
    model.fit(x_train,y_train)

    print(i,model.score(x_test,y_test) * 100,model.score(x_train,y_train) * 100)
    



0 80.95238095238095 99.73045822102425
1 71.42857142857143 99.46091644204851
2 85.71428571428571 99.46091644204851
3 88.09523809523809 99.73045822102425
4 90.47619047619048 99.73045822102425
5 76.19047619047619 99.73045822102425
6 80.95238095238095 100.0
7 90.47619047619048 99.73045822102425
8 85.71428571428571 99.73045822102425
9 85.71428571428571 99.73045822102425
10 76.19047619047619 100.0
11 85.71428571428571 100.0
12 83.33333333333334 100.0
13 88.09523809523809 99.73045822102425
14 88.09523809523809 99.46091644204851
15 88.09523809523809 99.46091644204851
16 92.85714285714286 100.0
17 85.71428571428571 99.73045822102425
18 80.95238095238095 99.73045822102425
19 71.42857142857143 99.73045822102425
20 88.09523809523809 99.73045822102425
21 85.71428571428571 100.0
22 83.33333333333334 99.46091644204851
23 83.33333333333334 99.46091644204851
24 73.80952380952381 99.73045822102425
25 92.85714285714286 99.19137466307278
26 92.85714285714286 99.46091644204851
27 88.09523809523809 99.73045

<p>The best bet seems to be at 33 , so lets use that </p>

In [477]:
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=33,test_size=0.1)

In [None]:


model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    class_weight='balanced',   
    
)
model.fit(x_train,y_train)

In [482]:
model.score(x_test,y_test) * 100,model.score(x_train,y_train)

(95.23809523809523, 0.9973045822102425)

In [483]:
x_predict = pd.DataFrame(unknown_data,columns=selected_features)

In [484]:
x_predict

Unnamed: 0,pclass,age,fare,family_size,class_en
0,3,22.000000,7.2500,1,2.0
2,3,26.000000,7.9250,0,2.0
4,3,35.000000,8.0500,0,2.0
5,3,23.950000,8.4583,0,2.0
7,3,2.000000,21.0750,4,2.0
...,...,...,...,...,...
882,3,22.000000,10.5167,0,2.0
883,2,28.000000,10.5000,0,1.0
885,3,39.000000,29.1250,5,2.0
888,3,17.329417,23.4500,3,2.0


In [485]:
predicted_decks = model.predict(x_predict)

In [486]:
predicted_decks

array(['G', 'F', 'F', 'F', 'F', 'G', 'F', 'F', 'F', 'F', 'E', 'F', 'D',
       'E', 'F', 'D', 'F', 'F', 'F', 'F', 'F', 'F', 'B', 'F', 'E', 'C',
       'D', 'F', 'F', 'G', 'G', 'G', 'F', 'F', 'F', 'F', 'E', 'E', 'F',
       'E', 'F', 'F', 'F', 'E', 'F', 'F', 'F', 'F', 'F', 'B', 'G', 'F',
       'G', 'G', 'F', 'F', 'F', 'E', 'F', 'F', 'E', 'F', 'E', 'F', 'A',
       'E', 'G', 'F', 'F', 'E', 'F', 'F', 'F', 'F', 'F', 'F', 'E', 'G',
       'F', 'F', 'F', 'F', 'F', 'E', 'F', 'G', 'F', 'F', 'F', 'F', 'F',
       'F', 'F', 'G', 'F', 'F', 'F', 'F', 'F', 'E', 'F', 'D', 'F', 'G',
       'F', 'E', 'F', 'E', 'F', 'F', 'F', 'D', 'E', 'F', 'G', 'F', 'E',
       'F', 'E', 'F', 'F', 'E', 'D', 'F', 'F', 'F', 'F', 'F', 'D', 'F',
       'F', 'G', 'G', 'F', 'E', 'F', 'F', 'D', 'F', 'F', 'E', 'B', 'G',
       'F', 'E', 'F', 'G', 'G', 'F', 'E', 'F', 'F', 'F', 'E', 'F', 'F',
       'F', 'D', 'F', 'G', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'E', 'F',
       'F', 'F', 'F', 'E', 'F', 'F', 'F', 'F', 'F', 'E', 'E', 'E

In [487]:
df_num.loc[df_num['deck'].isna(), 'deck'] = predicted_decks

In [488]:
df_num

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,deck,family_size,sex_en,who_en,adult_male_en,alive_en,alone_en,adult_female_en,class_en,embarked_en,embark_town_en
0,0,3,22.000000,1,0,7.2500,G,1,0,0,1,0,0,0,2.0,2,2
1,1,1,38.000000,1,0,71.2833,C,1,1,1,0,1,0,1,0.0,0,0
2,1,3,26.000000,0,0,7.9250,F,0,1,1,0,1,1,1,2.0,2,2
3,1,1,35.000000,1,0,53.1000,C,1,1,1,0,1,0,1,0.0,2,2
4,0,3,35.000000,0,0,8.0500,F,0,0,0,1,0,1,0,2.0,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,39.000000,0,5,29.1250,F,5,1,1,0,0,0,1,2.0,1,1
887,1,1,19.000000,0,0,30.0000,B,0,1,1,0,1,1,1,0.0,2,2
888,0,3,17.329417,1,2,23.4500,F,3,1,1,0,0,0,0,2.0,2,2
889,1,1,26.000000,0,0,30.0000,C,0,0,0,1,1,1,0,0.0,0,0
