In [47]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error
import matplotlib.pyplot as plt
from xgboost import XGBClassifier, XGBRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from scipy import stats
pd.set_option('display.max_rows',891)

# function to compute loss function
loss = -y(i)log(f_wb(x) + (1-y(i))log(1-f_wb(x))

In [2]:
def loss_funct(w, b, x, y):
    loss = 0
    for i in range(x.shape[0]):
        z = np.dot(w, x[i]) + b
        f_wb = 1/(1+np.exp(-1*z))
        loss = loss + -1*y[i]*np.log(f_wb) + (1-y[i])*np.log(1-f_wb)
    return loss

# function to compute cost function J(w,b)



In [3]:
def cost_funct(w, b, x, y):
    m = x.shape[0] # number of train example
    cost = 0  # initalize cost by zero
    for i in range(m):
        loss = loss_funct(w, b, x, y)
        cost = cost + loss
    return cost

# function to compute gradient descent
repeat{
    wj = wj - alpha x dJ(wi,b)∕dwi
    
    b = b - alpha x dJ(w,b)/db 
    }

In [4]:
def compute_gradient(w, b, x, y):
    d_dw = np.array([i*0 for i in range(x.shape[1])])
    d_dwj = []
    d_db = 0
    for i in range(x.shape[0]):
        d_dwj = []
        z = np.dot(w, x[i]) + b
        f_wb = 1/(1+np.exp(-1*z))
        for j in range(x.shape[1]):
            a = (f_wb - y[i])*x[i][j]
            d_dwj.append(a)
        d_dw = d_dw + np.array(d_dwj)
        d_db = d_db + (f_wb - y[i])
    
    d_dw = np.array(d_dw)
    print('dJ_dw : ',d_dw)
    print('\n')
        
    return d_dw/x.shape[0], d_db/x.shape[0]

In [5]:
def find_best_param(w, b, x, y, alpha):
    for k in range(700):
        d_dw, d_db = compute_gradient(w, b, x, y)
        temp_w = w - alpha*d_dw
        temp_b = b - alpha*d_db
        
        w = temp_w
        b = temp_b
        print(loss_funct(w, b, x, y))
    return w, b
    
        

In [6]:
def plot_decision_boundary(w, b, x, y):
    z = []
    g_z = []
    for i in range(x.shape[0]):
        z.append(np.dot(w, x[i]) + b)
        g_z.append(1/(1+np.exp(-1*z[i])))
    plt.plot(z, g_z, '.')
    plt.xlabel('z')

In [7]:
def dummies(df, obj):
    for i in obj:
        dum = pd.get_dummies(df[i])
        dum.columns = i+'_'+dum.columns  #rename the column by merge the name of original column and value
        for j in dum.columns:
            df[j] = dum[j]  # update the dataframe origin
        df = df.drop(i, axis=1)  # drop the column origin (column that are used to dummies)
    return df

# read data 

In [8]:
df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

features = [i for i in test.columns]
target = 'Survived'

# output informations about data

In [9]:
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                      

In [10]:
# drop columns
df.drop('Name', axis=1, inplace=True)
df.drop('Ticket', axis=1, inplace=True)
#df.drop('Embarked', axis=1, inplace=True)
df.drop('PassengerId', axis=1, inplace=True)
df.drop('Cabin', axis=1, inplace=True)

In [11]:
print(df.info())
print(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB
None
     Survived  Pclass     Sex    Age  SibSp  Parch      Fare Embarked
0           0       3    male  22.00      1      0    7.2500        S
1           1       1  female  38.00      1      0   71.2833        C
2           1       3  female  26.00      0      0    7.9250        S
3           1       1  female  35.00      1      0   53.1000        S
4           0       3    male  35.00      0      0    8.0500        S
5           0     

In [12]:
df['Age'] = df['Age'].fillna(round(np.mean(df['Age'])))
df = dummies(df, ['Embarked'])
df['Sex'] = df['Sex'] == 'male'

In [13]:
print(df)

     Survived  Pclass    Sex    Age  SibSp  Parch      Fare  Embarked_C  \
0           0       3   True  22.00      1      0    7.2500           0   
1           1       1  False  38.00      1      0   71.2833           1   
2           1       3  False  26.00      0      0    7.9250           0   
3           1       1  False  35.00      1      0   53.1000           0   
4           0       3   True  35.00      0      0    8.0500           0   
5           0       3   True  30.00      0      0    8.4583           0   
6           0       1   True  54.00      0      0   51.8625           0   
7           0       3   True   2.00      3      1   21.0750           0   
8           1       3  False  27.00      0      2   11.1333           0   
9           1       2  False  14.00      1      0   30.0708           1   
10          1       3  False   4.00      1      1   16.7000           0   
11          1       1  False  58.00      0      0   26.5500           0   
12          0       3   T

In [14]:
# we will add feature with linspace using 'Fare', less exp, mean exp, more exp
bins = df['Fare'].quantile([0, 0.25, 0.75, 1])
bin_feat = []
print(bins.iloc[3])
for i in df['Fare']:
    if i <=bins.iloc[1]:
        bin_feat.append(1)
    elif i >bins.iloc[1] and i <=bins.iloc[2]:
        bin_feat.append(2)
    elif i > bins.iloc[2]:
        bin_feat.append(3)
df['Fare_bin'] = bin_feat



512.3292


In [15]:
bin_feat = []
for i in df['Age']:
    if i<=1:
        bin_feat.append(1)
    elif i >1 and i <=5:
        bin_feat.append(2)
    elif i > 5 and i<=12:
        bin_feat.append(3)
    elif i > 12 and i <=18:
        bin_feat.append(4)
    elif i > 18 and i<=30:
        bin_feat.append(5)
    elif i > 30 and i <= 45:
        bin_feat.append(6)
    elif i > 45 and i <=60:
        bin_feat.append(7)
    elif i > 60:
        bin_feat.append(8)

df['Age_bin'] = bin_feat

In [16]:
print(df['Age'].max())
print(df['Age'])

80.0
0      22.00
1      38.00
2      26.00
3      35.00
4      35.00
5      30.00
6      54.00
7       2.00
8      27.00
9      14.00
10      4.00
11     58.00
12     20.00
13     39.00
14     14.00
15     55.00
16      2.00
17     30.00
18     31.00
19     30.00
20     35.00
21     34.00
22     15.00
23     28.00
24      8.00
25     38.00
26     30.00
27     19.00
28     30.00
29     30.00
30     40.00
31     30.00
32     30.00
33     66.00
34     28.00
35     42.00
36     30.00
37     21.00
38     18.00
39     14.00
40     40.00
41     27.00
42     30.00
43      3.00
44     19.00
45     30.00
46     30.00
47     30.00
48     30.00
49     18.00
50      7.00
51     21.00
52     49.00
53     29.00
54     65.00
55     30.00
56     21.00
57     28.50
58      5.00
59     11.00
60     22.00
61     38.00
62     45.00
63      4.00
64     30.00
65     30.00
66     29.00
67     19.00
68     17.00
69     26.00
70     32.00
71     16.00
72     21.00
73     26.00
74     32.00
75     25.00
76     

In [17]:
df.corr()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Fare_bin,Age_bin
Survived,1.0,-0.338481,-0.543351,-0.070657,-0.035322,0.081629,0.257307,0.16824,0.00365,-0.15566,0.278828,-0.09151
Pclass,-0.338481,1.0,0.1319,-0.329727,0.083081,0.018443,-0.5495,-0.243292,0.221009,0.08172,-0.613295,-0.304233
Sex,-0.543351,0.1319,1.0,0.08466,-0.114631,-0.245489,-0.182333,-0.082853,-0.074115,0.125722,-0.227227,0.089058
Age,-0.070657,-0.329727,0.08466,1.0,-0.23244,-0.18033,0.090632,0.03233,-0.010738,-0.029322,0.076585,0.934669
SibSp,-0.035322,0.083081,-0.114631,-0.23244,1.0,0.414838,0.159651,-0.059528,-0.026354,0.070941,0.349465,-0.283585
Parch,0.081629,0.018443,-0.245489,-0.18033,0.414838,1.0,0.216225,-0.011069,-0.081228,0.063036,0.339821,-0.246679
Fare,0.257307,-0.5495,-0.182333,0.090632,0.159651,0.216225,1.0,0.269335,-0.117216,-0.166603,0.58037,0.083779
Embarked_C,0.16824,-0.243292,-0.082853,0.03233,-0.059528,-0.011069,0.269335,1.0,-0.148258,-0.778359,0.163169,0.004829
Embarked_Q,0.00365,0.221009,-0.074115,-0.010738,-0.026354,-0.081228,-0.117216,-0.148258,1.0,-0.496624,-0.276459,-0.036151
Embarked_S,-0.15566,0.08172,0.125722,-0.029322,0.070941,0.063036,-0.166603,-0.778359,-0.496624,1.0,0.023852,0.011097


In [18]:
p_values = []
p_coefs = []
for i in df.columns:
    print(i)
    p_value, p_coef = stats.pearsonr(df[i], df[target])
    
    p_values.append(p_value)
    p_coefs.append(p_coef)

pearson = pd.DataFrame({'columns':df.columns, 'p_values':p_values, 'p_coefs':p_coefs})
pearson

Survived
Pclass
Sex
Age
SibSp
Parch
Fare
Embarked_C
Embarked_Q
Embarked_S
Fare_bin
Age_bin


Unnamed: 0,columns,p_values,p_coefs
0,Survived,1.0,0.0
1,Pclass,-0.338481,2.5370470000000002e-25
2,Sex,-0.543351,1.406066e-69
3,Age,-0.070657,0.0349647
4,SibSp,-0.035322,0.2922439
5,Parch,0.081629,0.01479925
6,Fare,0.257307,6.120189e-15
7,Embarked_C,0.16824,4.397151e-07
8,Embarked_Q,0.00365,0.9133532
9,Embarked_S,-0.15566,3.036111e-06


In [19]:
features = [i for i in df.columns if i!='Survived']
#features.remove('Embarked_Q')
#features.remove('SibSp')
#features.remove('Age')
#features.remove('Age_bin')
#features.remove('Parch')

In [20]:
train_set = df[:700]
valid_set = df[700:800]
test_set = df[800:]

In [21]:
x_train, y_train = df[features], df[target]
x_valid, y_valid = df[features], df[target]
x_test, y_test = df[features], df[target]

In [22]:
#best_param = grid.best_params_
#print(best_param)
model = DecisionTreeClassifier(max_leaf_nodes=8, max_depth=5)
model.fit(x_train, y_train)
pred_valid = model.predict(x_valid)
pred_test = model.predict(x_test)

In [23]:
accuracy_score(pred_valid, y_valid)

0.8327721661054994

In [24]:
accuracy_score(pred_test, y_test)

0.8327721661054994

In [25]:
print(test.info())
print(test['Age'].max())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
None
76.0


In [26]:
# fill the na in Fare column and Age
test['Fare'] = test['Fare'].fillna(np.mean(test['Fare']))
test['Age'] = test['Age'].fillna(round(np.mean(test['Age'])))

In [27]:
test.corr()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.026751,-0.030935,0.003818,0.04308,0.008209
Pclass,-0.026751,1.0,-0.443234,0.001087,0.018721,-0.576619
Age,-0.030935,-0.443234,1.0,-0.07887,-0.04498,0.328429
SibSp,0.003818,0.001087,-0.07887,1.0,0.306895,0.171488
Parch,0.04308,0.018721,-0.04498,0.306895,1.0,0.230001
Fare,0.008209,-0.576619,0.328429,0.171488,0.230001,1.0


In [28]:
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [29]:
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [30]:
test = dummies(test, ['Embarked'])
test['Sex'] = df['Sex'] == 'male'

In [31]:
# predict the test set in test.csv
bins = df['Fare'].quantile([0, 0.25, 0.75, 1])
bin_feat = []
print(bins.iloc[3])
for i in test['Fare']:
    if i <=bins.iloc[1]:
        bin_feat.append(1)
    elif i >bins.iloc[1] and i <=bins.iloc[2]:
        bin_feat.append(2)
    elif i > bins.iloc[2]:
        bin_feat.append(3)
test['Fare_bin'] = bin_feat

512.3292


In [32]:
bin_feat = []
for i in test['Age']:
    if i<=1:
        bin_feat.append(1)
    elif i >1 and i <=5:
        bin_feat.append(2)
    elif i > 5 and i<=12:
        bin_feat.append(3)
    elif i > 12 and i <=18:
        bin_feat.append(4)
    elif i > 18 and i<=30:
        bin_feat.append(5)
    elif i > 30 and i <= 45:
        bin_feat.append(6)
    elif i > 45 and i <=60:
        bin_feat.append(7)
    elif i > 60:
        bin_feat.append(8)

test['Age_bin'] = bin_feat

In [33]:
# drop columns
test.drop('Name', axis=1, inplace=True)
test.drop('Ticket', axis=1, inplace=True)
#df.drop('Embarked', axis=1, inplace=True)
test.drop('PassengerId', axis=1, inplace=True)
test.drop('Cabin', axis=1, inplace=True)

In [34]:
test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Fare_bin,Age_bin
0,3,False,34.5,0,0,7.8292,0,1,0,1,6
1,3,False,47.0,1,0,7.0,0,0,1,1,7
2,2,False,62.0,0,0,9.6875,0,1,0,2,8
3,3,False,27.0,0,0,8.6625,0,0,1,2,5
4,3,False,22.0,1,1,12.2875,0,0,1,2,5
5,3,False,14.0,0,0,9.225,0,0,1,2,4
6,3,False,30.0,0,0,7.6292,0,1,0,1,5
7,2,False,26.0,1,1,29.0,0,0,1,2,5
8,3,False,18.0,0,0,7.2292,1,0,0,1,4
9,3,False,21.0,2,0,24.15,0,0,1,2,5


In [35]:
x_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Fare_bin,Age_bin
0,3,True,22.0,1,0,7.25,0,0,1,1,5
1,1,False,38.0,1,0,71.2833,1,0,0,3,6
2,3,False,26.0,0,0,7.925,0,0,1,2,5
3,1,False,35.0,1,0,53.1,0,0,1,3,6
4,3,True,35.0,0,0,8.05,0,0,1,2,6
5,3,True,30.0,0,0,8.4583,0,1,0,2,5
6,1,True,54.0,0,0,51.8625,0,0,1,3,7
7,3,True,2.0,3,1,21.075,0,0,1,2,2
8,3,False,27.0,0,2,11.1333,0,0,1,2,5
9,2,False,14.0,1,0,30.0708,1,0,0,2,4


In [36]:
pred = model.predict(test)


In [37]:
t = pd.read_csv('test.csv')
print(t['PassengerId'])

0       892
1       893
2       894
3       895
4       896
5       897
6       898
7       899
8       900
9       901
10      902
11      903
12      904
13      905
14      906
15      907
16      908
17      909
18      910
19      911
20      912
21      913
22      914
23      915
24      916
25      917
26      918
27      919
28      920
29      921
30      922
31      923
32      924
33      925
34      926
35      927
36      928
37      929
38      930
39      931
40      932
41      933
42      934
43      935
44      936
45      937
46      938
47      939
48      940
49      941
50      942
51      943
52      944
53      945
54      946
55      947
56      948
57      949
58      950
59      951
60      952
61      953
62      954
63      955
64      956
65      957
66      958
67      959
68      960
69      961
70      962
71      963
72      964
73      965
74      966
75      967
76      968
77      969
78      970
79      971
80      972
81      973
82      974
83  

In [38]:
print(pred.shape)

(418,)


In [39]:
subm = pd.DataFrame({'PassengerId':t['PassengerId'], 
                     'Survived':pred})
subm.to_csv('submission.csv', index=False)

In [40]:
xgb = XGBClassifier(n_estimators=500, learning_rate=0.05)
xgb.fit(x_train, y_train)

In [54]:
result = cross_validate(xgb, df[features], df[target], cv=5)

In [55]:
result

{'fit_time': array([0.49237299, 0.36454368, 0.4651432 , 0.37235999, 0.39699435]),
 'score_time': array([0.00531411, 0.00519753, 0.00519228, 0.00509548, 0.00533652]),
 'test_score': array([0.79329609, 0.82022472, 0.84831461, 0.76966292, 0.86516854])}

In [59]:
model = result['test_score']
model

array([0.79329609, 0.82022472, 0.84831461, 0.76966292, 0.86516854])

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Sex         891 non-null    bool   
 3   Age         891 non-null    float64
 4   SibSp       891 non-null    int64  
 5   Parch       891 non-null    int64  
 6   Fare        891 non-null    float64
 7   Embarked_C  891 non-null    uint8  
 8   Embarked_Q  891 non-null    uint8  
 9   Embarked_S  891 non-null    uint8  
 10  Fare_bin    891 non-null    int64  
 11  Age_bin     891 non-null    int64  
dtypes: bool(1), float64(2), int64(6), uint8(3)
memory usage: 59.3 KB


In [45]:
df.corr()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Fare_bin,Age_bin
Survived,1.0,-0.338481,-0.543351,-0.070657,-0.035322,0.081629,0.257307,0.16824,0.00365,-0.15566,0.278828,-0.09151
Pclass,-0.338481,1.0,0.1319,-0.329727,0.083081,0.018443,-0.5495,-0.243292,0.221009,0.08172,-0.613295,-0.304233
Sex,-0.543351,0.1319,1.0,0.08466,-0.114631,-0.245489,-0.182333,-0.082853,-0.074115,0.125722,-0.227227,0.089058
Age,-0.070657,-0.329727,0.08466,1.0,-0.23244,-0.18033,0.090632,0.03233,-0.010738,-0.029322,0.076585,0.934669
SibSp,-0.035322,0.083081,-0.114631,-0.23244,1.0,0.414838,0.159651,-0.059528,-0.026354,0.070941,0.349465,-0.283585
Parch,0.081629,0.018443,-0.245489,-0.18033,0.414838,1.0,0.216225,-0.011069,-0.081228,0.063036,0.339821,-0.246679
Fare,0.257307,-0.5495,-0.182333,0.090632,0.159651,0.216225,1.0,0.269335,-0.117216,-0.166603,0.58037,0.083779
Embarked_C,0.16824,-0.243292,-0.082853,0.03233,-0.059528,-0.011069,0.269335,1.0,-0.148258,-0.778359,0.163169,0.004829
Embarked_Q,0.00365,0.221009,-0.074115,-0.010738,-0.026354,-0.081228,-0.117216,-0.148258,1.0,-0.496624,-0.276459,-0.036151
Embarked_S,-0.15566,0.08172,0.125722,-0.029322,0.070941,0.063036,-0.166603,-0.778359,-0.496624,1.0,0.023852,0.011097
