In [1]:
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
# Titanic training split, read straight from the agconti/kaggle-titanic GitHub repo.
TRAIN_CSV_URL = 'https://raw.githubusercontent.com/agconti/kaggle-titanic/master/data/train.csv'
df = pd.read_csv(TRAIN_CSV_URL)

In [3]:
# Preview the first five rows to sanity-check the load.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# (rows, columns) of the raw frame.
df.shape

(891, 12)

In [5]:
# List the column names available in the raw data.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
# Count missing values per column to decide what needs imputing.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
# Impute missing values by assigning each filled Series back to the frame.
# Calling fillna(..., inplace=True) on df['col'] acts on an intermediate Series:
# newer pandas warns about it and, with copy-on-write, leaves df unchanged.
# fillna(method=...) is likewise deprecated in favour of .bfill() / .ffill().
df['Age'] = df['Age'].fillna(df['Age'].mean())  # numeric column: use the mean age
# Cabin: back-fill from the next known value, then forward-fill whatever is
# still empty at the end of the column.
# NOTE(review): 687 of 891 Cabin values are missing, so most values after this
# step are borrowed from neighbouring rows - confirm this is acceptable.
df['Cabin'] = df['Cabin'].bfill().ffill()
df['Embarked'] = df['Embarked'].ffill()  # forward-fill from the previous row

In [8]:
# Re-count missing values to confirm the imputation left none behind.
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [9]:
# Drop exact duplicate rows in place. This runs before PassengerId/Name/Ticket
# are removed, so rows are compared with those identifier columns included.
df.drop_duplicates(inplace=True)

In [10]:
# Drop unnecessary columns: these are removed from the frame before encoding and modelling.
unneeded_columns = ['PassengerId','Name','Ticket']
df.drop(columns=unneeded_columns,inplace=True)

In [11]:
# Summarise dtypes and non-null counts of the remaining columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Cabin     891 non-null    object 
 8   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 69.6+ KB


In [12]:
# Names of every column whose dtype is 'object'; these need encoding
# before they can be fed to the model.
all_categorical_col = [
    column for column, dtype in df.dtypes.items() if dtype == 'object'
]
all_categorical_col

['Sex', 'Cabin', 'Embarked']

In [13]:
# One shared encoder; the loop in the next cell re-fits it for every categorical column.
labelEncoder = LabelEncoder()

In [14]:
# Swap each text column for integer codes. The shared encoder is re-fitted on
# every pass, so afterwards labelEncoder only holds the last column's fit.
for column_name in all_categorical_col:
    encoded_values = labelEncoder.fit_transform(df[column_name])
    df[column_name] = encoded_values

In [15]:
# Check that the former object columns are now numeric after encoding.
df.dtypes

Survived      int64
Pclass        int64
Sex           int64
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Cabin         int64
Embarked      int64
dtype: object

In [16]:
# Columns that are each used, one at a time, as the prediction target.
dependent_variables = [
    'Survived',
    'Pclass',
    'Sex',
    'SibSp',
    'Parch',
    'Embarked',
]

In [17]:
def getScoreAndConfussionMatrix(df,dependent_variable,test_size=0.3,random_state=42):
    """Fit a BernoulliNB classifier for one target column and score it on a hold-out split.

    Every column of ``df`` except ``dependent_variable`` is used as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Fully numeric frame that contains ``dependent_variable``.
    dependent_variable : str
        Name of the column to predict.
    test_size : float, default 0.3
        Share of rows held out for evaluation (forwarded to ``train_test_split``).
    random_state : int, default 42
        Seed for the train/test shuffle, so repeated runs give the same split.

    Returns
    -------
    tuple
        ``(dependent_variable, accuracy, confusion_matrix)`` where the last two
        are computed on the held-out rows.
    """
    X = df.drop(columns=[dependent_variable])
    y = df[dependent_variable]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # NOTE(review): BernoulliNB is left at its defaults; per the scikit-learn docs
    # it binarizes features at 0.0, so columns such as Age and Fare are reduced to
    # zero / non-zero - confirm this is intended.
    bernoulliNB = BernoulliNB()
    bernoulliNB.fit(X_train, y_train)
    y_predict = bernoulliNB.predict(X_test)

    accuracy_score_res = accuracy_score(y_test, y_predict)
    confusion_matrix_res = confusion_matrix(y_test, y_predict)

    return dependent_variable, accuracy_score_res, confusion_matrix_res
    

In [18]:
# Evaluate the classifier once per target column, keeping the
# (name, accuracy, confusion matrix) tuple returned for each.
result_score_arr = [
    getScoreAndConfussionMatrix(df, dependent_variable)
    for dependent_variable in dependent_variables
]

In [19]:
# Print a three-line banner naming each target column, followed by its
# accuracy and confusion matrix.
banner_rule = '--------------------------------------------------------'
for column_name, accuracy_score_one, confusion_matrix_one in result_score_arr:
    print(banner_rule)
    print('--------------------** '+column_name+' **----------------------')
    print(banner_rule)
    print(accuracy_score_one)
    print(confusion_matrix_one)
    print('\n')
        

--------------------------------------------------------
--------------------** Survived **----------------------
--------------------------------------------------------
0.7947761194029851
[[133  24]
 [ 31  80]]


--------------------------------------------------------
--------------------** Pclass **----------------------
--------------------------------------------------------
0.582089552238806
[[ 31   0  46]
 [  9   0  43]
 [ 14   0 125]]


--------------------------------------------------------
--------------------** Sex **----------------------
--------------------------------------------------------
0.7910447761194029
[[ 78  23]
 [ 33 134]]


--------------------------------------------------------
--------------------** SibSp **----------------------
--------------------------------------------------------
0.6902985074626866
[[166  24   0   0   0]
 [ 43  19   0   0   0]
 [  4   3   0   0   0]
 [  3   2   0   0   0]
 [  1   3   0   0   0]]


-----------------------------------