In [None]:
#Importing Libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
import os
from scipy.stats import mode
import string
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
#Loading data 
# Read the training and test CSV files; the paths are relative to the
# notebook's working directory.
df_train = pd.read_csv('../input/titanic/train.csv')
df_test = pd.read_csv('../input/titanic/test.csv')
# Bare last expression: the notebook renders the training frame as cell output.
df_train

In [None]:
# Report the dimensions of the training frame.
rows, columns = df_train.shape

print("Number of Rows:", rows)
print("Number of Columns:", columns)

# Per-column dtypes and non-null counts.
df_train.info()


    The output shows that all columns currently have the correct data types. Having checked the data types and the number of rows and columns, we can now proceed to cleaning and exploring the data. 

In [None]:
# Count the missing values in each column of the training data.
missing_values_count = df_train.isna().sum()
missing_values_count


The output shows 177 missing values out of 891 in the Age column, 687 out of 891 in the Cabin column, and 2 out of 891 in the Embarked column. We can replace the missing values in Age with the column mean and the missing values in Embarked with the column mode. For the Cabin column, we extract the deck letter contained in the cabin code (missing cabins are labelled 'Unknown'). I also create a new column, 'Died'. 

In [None]:
#Data Manipulation and Feature Engineering
def substrings_in_string(big_string, substrings):
    """Return the first entry of ``substrings`` that occurs inside ``big_string``.

    Parameters
    ----------
    big_string : str
        Text to search. Any non-string value (e.g. a float NaN from a
        missing cell) is treated as "no match".
    substrings : iterable of str
        Candidates, tested in the given order; the first hit wins.

    Returns
    -------
    str or float
        The matching candidate, or ``np.nan`` when nothing matches.
    """
    # Missing cells arrive as float NaN; searching them would raise TypeError.
    if not isinstance(big_string, str):
        return np.nan
    for substring in substrings:
        if substring in big_string:
            return substring
    return np.nan
# Deck letters searched for in the cabin code; 'Unknown' labels passengers
# whose cabin is missing (filled in on the next line).
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
df_train['Cabin'] = df_train['Cabin'].fillna('Unknown')
df_train['Deck'] = df_train['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))

# Fill missing values by assigning the result back to the column. A chained
# `df[col].replace(..., inplace=True)` acts on a temporary Series and does not
# update the frame under pandas Copy-on-Write.
df_train['Age'] = df_train['Age'].fillna(df_train['Age'].mean())
# mode() returns a Series (ties are possible); take its first entry as a scalar.
df_train['Embarked'] = df_train['Embarked'].fillna(df_train['Embarked'].mode()[0])

#Extracting the Died passenger from the Survived Column
df_train['Died'] = 1 - df_train['Survived']

#Scaling the Age and Fare data values 
# Min-max scale each column to the [0, 1] range.
for col in ('Age', 'Fare'):
    col_min, col_max = df_train[col].min(), df_train[col].max()
    df_train[col] = (df_train[col] - col_min) / (col_max - col_min)
df_train.head()


Let's start exploring the columns 

In [None]:
#Data Exploration
# Passenger counts per sex, with the count written above each bar.
z = sns.countplot(data=df_train, x='Sex')
z.set_title("Gender")
for bars in z.containers:
    z.bar_label(bars)

In [None]:
# Stacked bar chart of survivors vs. deaths for each sex.
# Select the two numeric columns BEFORE aggregating: summing the whole frame
# would also "sum" (concatenate) every string column, which is wasteful and
# raises TypeError as soon as an object column holds mixed types.
z1 = (
    df_train.groupby('Sex')[['Survived', 'Died']]
    .sum()
    .plot(kind='bar', figsize=(5, 5), stacked=True, color=['Green', 'red'])
)
for i1 in z1.containers:
    z1.bar_label(i1)

Of the 314 female passengers, 233 survived, while only 109 of the 577 male passengers survived. In other words, 81 female and 468 male passengers died. 

In [None]:
# Passenger counts per ticket class; each class gets its own colour and the
# legend names the classes.
z2 = sns.countplot(data=df_train, x='Pclass', hue='Pclass', dodge=False)
z2.set_title("Socio-economic status")
plt.legend(labels=['Upper', 'Middle','Lower'], title='Status', loc='upper left')
for bars in z2.containers:
    z2.bar_label(bars)

In [None]:
# Bar chart of the number of deaths per deck.
# Restrict to the 'Died' column before summing so the string columns of the
# frame are not aggregated (concatenated) needlessly.
z3 = (
    df_train.groupby('Deck')[['Died']]
    .sum()
    .plot(kind='bar', figsize=(8, 8), stacked=True, color=['red'])
)
plt.title("Number of passenger died based on there deck")
for i in z3.containers:
    z3.bar_label(i)

That's all for the EDA. Now, let's build our model. I'm going to use XGBoost. 

In [None]:
#Setting the load data in variable X and X_test
# Build new frames instead of aliasing and mutating the loaded ones: the
# original `X = df_train` followed by an in-place drop destroyed df_train and
# made this cell fail with KeyError when run a second time.
X_test = df_test.copy()

#Setting predicion target
y = df_train['Survived']
# Remove the target, 'Died' (computed as 1 - Survived, so it leaks the target)
# and the identifier / free-text columns that are not used as features.
X = df_train.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin', 'Died', 'PassengerId'])

# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [None]:
# Apply ordinal encoder to the X_train_full dataset
# Take explicit copies first: the frames returned by train_test_split are
# slices of X, and writing into them triggers SettingWithCopyWarning.
X_train_full = X_train_full.copy()
X_valid_full = X_valid_full.copy()

# Columns with object dtype are the categorical ones to encode.
s = (X_train_full.dtypes == 'object')
object_cols = list(s[s].index)

# Fit on the training split only. Categories that appear only in other data
# are mapped to -1 instead of raising at transform time.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train_full[object_cols] = ordinal_encoder.fit_transform(X_train_full[object_cols])
X_valid_full[object_cols] = ordinal_encoder.transform(X_valid_full[object_cols])


In [None]:
#Hypertuning the XGBoostModel 
# Candidate values sampled by the randomized search.
params = { 'max_depth': [3, 5, 6, 10, 15, 20],
           'learning_rate': [0.01, 0.1, 0.2, 0.3],
           'subsample': np.arange(0.5, 1.0, 0.1),
           'colsample_bytree': np.arange(0.4, 1.0, 0.1),
           'colsample_bylevel': np.arange(0.4, 1.0, 0.1),
           'n_estimators': [100, 500, 1000]}
# `random_state` is the documented seed parameter of the sklearn wrapper.
XGBmodel = XGBRegressor(random_state=20)
# Seed the search as well; without it a different set of 25 candidates is
# sampled on every run and the results are not reproducible.
clf = RandomizedSearchCV(estimator=XGBmodel,
                         param_distributions=params,
                         scoring='neg_mean_squared_error',
                         n_iter=25,
                         random_state=20,
                         verbose=1)
clf.fit(X_train_full, y_train)
prediction = clf.predict(X_valid_full)
# sklearn metrics take (y_true, y_pred) in that order.
mae = mean_absolute_error(y_valid, prediction)
print("Mean Absolute Error:" , mae)
# Also report accuracy on the rounded 0/1 labels. Rounded regressor output is
# clipped because it can fall outside {0, 1}.
valid_labels = np.clip(np.rint(prediction), 0, 1).astype(int)
print("Validation Accuracy:", accuracy_score(y_valid, valid_labels))


In [None]:
# Count the missing values in each column of the test data.
X_test.isna().sum()

In [None]:
#Data Manipulation and Feature Engineering in Test Data
# Fill missing values by assigning back to the column. A chained
# `df[col].replace(..., inplace=True)` acts on a temporary Series and does not
# update the frame under pandas Copy-on-Write.
X_test['Age'] = X_test['Age'].fillna(X_test['Age'].mean())
X_test['Fare'] = X_test['Fare'].fillna(X_test['Fare'].median())
X_test['Cabin'] = X_test['Cabin'].fillna('Unknown')

# Min-max scale Age and Fare to the [0, 1] range.
# NOTE(review): the fill values and the min/max here come from the test set
# itself, so they may differ from the statistics applied to the training
# data - consider reusing the training statistics instead.
for col in ('Age', 'Fare'):
    col_min, col_max = X_test[col].min(), X_test[col].max()
    X_test[col] = (X_test[col] - col_min) / (col_max - col_min)

# Derive the deck from the cabin code with the same helper and candidate list
# used for the training data.
X_test['Deck'] = X_test['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))
X_test1 = X_test.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])

In [None]:
# Show the column dtypes and non-null counts of the prepared test features.
X_test1.info()

For the cell below, if you receive the error "ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''", simply run the cell again. 

In [None]:
# Apply ordinal encoder to the object columns in X_test data
s = (X_test1.dtypes == 'object')
object_cols1 = list(s[s].index)

# Reuse the encoder already fitted on the training split. Fitting a second
# encoder on the test set can assign different integer codes to the same
# category (for example when a deck letter is absent from the test data),
# which would silently feed the model mis-encoded features.
# The guard makes re-running this cell a no-op once the columns are encoded.
# NOTE(review): assumes the object columns here are the same ones, in the same
# order, that `ordinal_encoder` was fitted on - transform raises otherwise.
if object_cols1:
    X_test1[object_cols1] = ordinal_encoder.transform(X_test1[object_cols1])
X_test1

In [None]:
#Submission of the prediction result
test_pred = clf.predict(X_test1)
# The model is a regressor, so its output is not bounded to [0, 1]; rounding
# alone can yield labels such as -1 or 2. Clip to the valid 0/1 labels.
test_pred_new = np.clip(np.rint(test_pred), 0, 1).astype(int)
submission = pd.DataFrame({
        "PassengerId": X_test['PassengerId'],
        "Survived": test_pred_new
    })
submission.to_csv('submission.csv', index=False)
# Read the file back so the displayed frame is exactly what was written.
pred_df = pd.read_csv('submission.csv')
pred_df

In [None]:
#Plotting the predict number of survivor and died passenger
# Count of predicted labels in the submission, one coloured bar per label,
# with the count written above each bar.
z5 = sns.countplot(data=pred_df, x='Survived', hue='Survived', dodge=False)
z5.set_title("Status of the Passenger")
plt.legend(labels=['Died', 'Survive'], title='Status', loc='upper right')
for bars in z5.containers:
    z5.bar_label(bars)

That's all for my Titanic Competition entry. I would appreciate any feedback or suggestions for improvement. I'm a beginner who wants to learn more and get better. Thank you very much! 