In [None]:
#libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [41]:
# Load the Titanic passenger dataset bundled with seaborn.
titanic = sns.load_dataset("titanic")
# Preview the first 10 rows.
titanic.head(n=10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False


In [None]:
# Size of the dataset as a (number of rows, number of columns) tuple.
titanic.shape

In [None]:
# Summary statistics from describe(), transposed (.T) so each described column becomes a row.
titanic.describe().T

In [None]:
# How many passengers fall into each value of the `survived` column.
titanic["survived"].value_counts()

In [None]:
# Visualize the number of passengers for each value of `survived`.
# The column is passed by keyword: recent seaborn releases treat the first positional
# argument as `data`, so `sns.countplot(titanic.survived)` no longer means
# "count the values of this column" (older releases only emitted a FutureWarning).
sns.countplot(x="survived", data=titanic)

In [None]:
# Visualize survivor counts broken down by "who", "sex", "pclass", "sibsp", "parch" and "embarked".
cols=["who", "sex", "pclass", "sibsp", "parch", "embarked"]
n_rows=2
n_cols=3
# One subplot per column on an n_rows x n_cols grid; each panel is sized 3.2 x 3.2.
fig,axs=plt.subplots(n_rows,n_cols,figsize=(n_cols*3.2,n_rows*3.2))
# axs.flat walks the grid row by row, which matches the order of `cols`;
# zip() also stops cleanly if the grid and the column list differ in length.
for ax,col in zip(axs.flat,cols):
    # Variables are passed by keyword (x=, hue=, data=) as current seaborn requires.
    sns.countplot(x=col,hue="survived",data=titanic,ax=ax)
    ax.set_title(col)
    ax.legend(title="survived",loc="upper center")
# Adjust the spacing once, after every panel has been drawn.
plt.tight_layout()
        

In [None]:
# Mean of `survived` (i.e. the survival rate) for each sex, returned as a one-column DataFrame.
titanic[["sex", "survived"]].groupby("sex").mean()

In [None]:
# Survival rate as a table: one row per sex, one column per passenger class.
titanic.pivot_table(values="survived", index="sex", columns="class")

In [None]:
# Same sex-by-class survival table as above, drawn with the DataFrame's default plot.
(
    titanic
    .pivot_table(values="survived", index="sex", columns="class")
    .plot()
)

In [None]:
# Bar chart of `survived` against passenger class.
sns.barplot(data=titanic, x="class", y="survived")

In [None]:
# Survival rate by sex, age band and passenger class.
# pd.cut assigns each age to one of two right-closed bands: (0, 18] and (18, 80].
age = pd.cut(titanic["age"], bins=[0, 18, 80])
titanic.pivot_table(values="survived", index=["sex", age], columns="class")

In [None]:
# Scatter every passenger's fare (x axis) against their ticket class (y axis).
plt.scatter(
    titanic["fare"],
    titanic["class"],
    color="purple",
    label="Passenger Paid",
)
plt.title("Price of each class")
plt.xlabel("Price / Fare")
plt.ylabel("class")
plt.legend()
plt.show()

In [None]:
# Number of missing entries per column.
titanic.isnull().sum()

In [None]:
# Print the frequency of every distinct value, one column at a time.
for column_name in titanic.columns:
    counts = titanic[column_name].value_counts()
    print(counts)

In [None]:
# Remove the columns that are not used as model inputs, then discard every row
# that has no value for "embarked" or "age".
titanic = (
    titanic
    .drop(columns=["deck", "embark_town", "alive", "class", "who", "alone", "adult_male"])
    .dropna(subset=["embarked", "age"])
)

In [None]:
# Size of the dataset after the column drop and missing-row removal, as (rows, columns).
titanic.shape

In [None]:
# Data type of each remaining column.
titanic.dtypes

In [None]:
# Show the distinct values of the two text columns before they are encoded.
for text_column in ("sex", "embarked"):
    print(titanic[text_column].unique())

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the text values in "sex" and "embarked" with integer codes.
# The columns are addressed by name instead of by position (previously 2 and 7), so the
# encoding still hits the right columns if the column layout changes. Assigning the whole
# column by name replaces it, so the column takes the encoder's integer dtype.
# NOTE(review): positional `.iloc[:, i] = ...` assignment may keep the original `object`
# dtype depending on the pandas version - confirm if the dtype matters downstream.
labalencoder=LabelEncoder()
#encode the "sex" column
titanic["sex"]=labalencoder.fit_transform(titanic["sex"])
#encode the "embarked" column (the encoder is re-fitted, so it now holds the "embarked" classes)
titanic["embarked"]=labalencoder.fit_transform(titanic["embarked"])


In [None]:
# Re-check the distinct values of "sex" and "embarked", now that they have been encoded.
print(
    titanic["sex"].unique(),
    titanic["embarked"].unique(),
    sep="\n",
)

In [42]:
# Column data types at this point in the notebook.
# NOTE(review): the saved output of this cell lists all 15 original columns with "sex" and
# "embarked" still typed `object`, and its execution count directly follows the data-loading
# cell - so it looks like it was last run before the drop/encode cells. Re-run the notebook
# top to bottom to refresh it.
titanic.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [None]:
# Features X: columns at positions 1 through 7; target Y: the first column (position 0).
# Both are pulled out as plain NumPy arrays.
X = titanic.iloc[:, 1:8].to_numpy()
Y = titanic.iloc[:, 0].to_numpy()

In [None]:
# Hold out 20% of the rows for testing; random_state=0 keeps the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [43]:
#scale the data
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
# Learn the per-feature mean and variance from the training split only ...
X_train=sc.fit_transform(X_train)
# ... and apply those same statistics to the test split. Re-fitting the scaler on X_test
# would scale the two splits differently and leak test-set statistics into the evaluation.
X_test=sc.transform(X_test)

In [None]:
#build, fit and report on a fixed line-up of scikit-learn classifiers
def models(X_train,Y_train):
    """Fit seven classifiers on the training data and print each training accuracy.

    Parameters
    ----------
    X_train : feature matrix handed unchanged to every estimator's ``fit`` and ``score``.
    Y_train : target values handed unchanged to every estimator's ``fit`` and ``score``.

    Returns
    -------
    tuple
        The fitted estimators, in this order: logistic regression, k-nearest
        neighbours, linear-kernel SVC, RBF-kernel SVC, Gaussian naive Bayes,
        decision tree, random forest.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier

    # (report label, estimator) pairs; the position in this list is the index
    # callers use on the returned tuple.
    lineup=[
        ('[0]Logistic Regression Training Accuracy:',
         LogisticRegression(random_state=0)),
        ('\n[1]K Neighbors Training Accuracy:',
         KNeighborsClassifier(n_neighbors=5,metric="minkowski",p=2)),
        ('\n[2]SVC Linear Training Accuracy:',
         SVC(kernel="linear",random_state=0)),
        ('\n[3]SVC RBF Training Accuracy:',
         SVC(kernel="rbf",random_state=0)),
        ('\n[4]Gaussian Training Accuracy:',
         GaussianNB()),
        ('\n[5]Decision Tree Training Accuracy:',
         DecisionTreeClassifier(criterion="entropy",random_state=0)),
        ('\n[6]Random Forest Training Accuracy:',
         RandomForestClassifier(n_estimators=10,criterion="entropy",random_state=0)),
    ]

    # Train everything first, then report, so the accuracy lines print as one group.
    for _,estimator in lineup:
        estimator.fit(X_train,Y_train)

    # Accuracy is measured on the same data the estimators were fitted on.
    for label,estimator in lineup:
        print(label,estimator.score(X_train,Y_train))

    return tuple(estimator for _,estimator in lineup)

In [None]:
# Fit every classifier on the training data. `model` holds whatever models() returns:
# per its return statement, the tuple (log, knn, svc_lin, svc_rbf, gauss, tree, forest),
# so model[6] is the random forest.
model=models(X_train,Y_train)

In [None]:
#show the confusion matrix and accuracy for all of the models on the test data
from sklearn.metrics import confusion_matrix
for i,clf in enumerate(model):
    # Predict once per model and reuse the matrix for both the printout and the accuracy.
    # labels=[0,1] pins the matrix to 2x2 even when a class is missing from Y_test or
    # from the predictions, so the four-way unpacking below cannot fail.
    cm=confusion_matrix(Y_test,clf.predict(X_test),labels=[0,1])

    #Extract TN, FP, FN, TP (the 2x2 matrix flattened row by row)
    TN, FP, FN, TP=cm.ravel()

    # Accuracy = correctly classified samples / all samples.
    test_score=(TP+TN)/(TP+TN+FN+FP)

    print(cm)
    print("Model[{0}] Testing Accuracy='{1}'".format(i,test_score))
    print()

In [None]:
# Rank the input features by the random forest's importance scores (rounded to 3 decimals).
forest = model[6]
importances = (
    pd.DataFrame(
        {
            "feature": titanic.columns[1:8],
            "importance": np.round(forest.feature_importances_, 3),
        }
    )
    .sort_values("importance", ascending=False)
    .set_index("feature")
)
importances

In [None]:
# Bar chart of the ranked feature importances.
importances.plot.bar()

In [None]:
# Random forest predictions for the test split, followed by a blank separator line.
pred = model[6].predict(X_test)
print(pred, "\n")

# True labels of the same test rows, for a side-by-side comparison.
print(Y_test)

In [48]:
#based on my own data
# Feature order must match the training matrix: pclass, sex, age, sibsp, parch, fare, embarked.
my_survival=[[3,1,22,0,0,0,1]]

#scaling my survival
# Reuse the scaler `sc` that was fitted earlier in the notebook. Fitting a fresh
# StandardScaler on this single row would subtract the row from itself and turn every
# feature into 0, so the prediction would not depend on the values entered above.
my_survival_scaled=sc.transform(my_survival)

#prediction of my survival using Random Forest Classifier
pred=model[6].predict(my_survival_scaled)
print(pred)

# `pred` is a one-element array; test its single entry explicitly.
if pred[0]==0:
    print("You didn't make it")
else:
    print("Nice!! You survived!")

[0]
You didn't make it
