In [1]:
import pandas as pd
import numpy as np

In [2]:
traindata=pd.read_csv('train.csv')
data=pd.read_csv('Titanic_preprocessed.csv')
data.head()

Unnamed: 0,Target,Pclass,Sex,Age,SibSp,Parch,Fare,Q,S
0,0,1,0,22.0,1,0,7.25,0,1
1,1,0,1,38.0,1,0,71.2833,0,0
2,1,1,1,26.0,0,0,7.925,0,1
3,1,0,1,35.0,1,0,53.1,0,1
4,0,1,0,35.0,0,0,8.05,0,1


In [3]:
targets=data['Target']

In [4]:
target_input=data.iloc[:,:1]
target_input

Unnamed: 0,Target
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


In [5]:
unscaled_inputs=data.iloc[:,1:]
unscaled_inputs.shape

(891, 8)

# Standardization

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [7]:
class CustomScaler(BaseEstimator,TransformerMixin):
    
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        self.scaler=StandardScaler(copy,with_mean,with_std)
        self.columns=columns
        self.mean_=None
        self.var_=None
        
    def fit(self,X,y=None):
        self.scaler.fit(X[self.columns],y)
        self.mean_ =np.mean(X[self.columns])
        self.var_ =np.var(X[self.columns])
        return self
    
    def transform(self,X,y=None,copy=None):
        init_col_order=X.columns
        X_scaled=pd.DataFrame(self.scaler.transform(X[self.columns]),columns=self.columns)
        X_not_scaled=X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled,X_scaled],axis=1)[init_col_order]

In [8]:
unscaled_inputs.columns.values

array(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Q', 'S'],
      dtype=object)

In [9]:
columns_to_scale=['Pclass','Sex', 'Age', 'SibSp', 'Parch', 'Fare']

In [10]:
scaler=  CustomScaler(columns_to_scale)



In [11]:
scaler.fit(unscaled_inputs)

AttributeError: 'CustomScaler' object has no attribute 'copy'

AttributeError: 'CustomScaler' object has no attribute 'copy'

In [12]:
scaled_input=scaler.transform(unscaled_inputs)

In [13]:
scaled_input

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Q,S
0,0.902587,-0.737695,-0.597055,0.432793,-0.473674,-0.502445,0,1
1,-1.107926,1.355574,0.634162,0.432793,-0.473674,0.786845,0,0
2,0.902587,1.355574,-0.289251,-0.474545,-0.473674,-0.488854,0,1
3,-1.107926,1.355574,0.403309,0.432793,-0.473674,0.420730,0,1
4,0.902587,-0.737695,0.403309,-0.474545,-0.473674,-0.486337,0,1
...,...,...,...,...,...,...,...,...
886,-1.107926,-0.737695,-0.212299,-0.474545,-0.473674,-0.386671,0,1
887,-1.107926,1.355574,-0.827908,-0.474545,-0.473674,-0.044381,0,1
888,0.902587,1.355574,0.018554,0.432793,2.008933,-0.176263,0,1
889,-1.107926,-0.737695,-0.289251,-0.474545,-0.473674,-0.044381,0,0


# Train test split

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
x_train,x_test,y_train,y_test=train_test_split(scaled_input,targets,train_size=0.8,random_state=20)

In [16]:
print(x_train.shape,y_train.shape)

(712, 8) (712,)


In [17]:
print(x_test.shape,y_test.shape)

(179, 8) (179,)


# Logistic Regression

In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
reg=LogisticRegression()
reg.fit(x_train,y_train)

LogisticRegression()

In [20]:
reg.score(x_train,y_train)

0.8061797752808989

In [21]:
reg.intercept_

array([-0.35925779])

In [22]:
reg.coef_

array([[-0.80141898,  1.19205342, -0.39824705, -0.27231243, -0.11262552,
         0.31339715,  0.12434358, -0.41507393]])

In [23]:
feature_name=unscaled_inputs.columns.values

# SUMMARY TABLE

In [24]:
summary_table=pd.DataFrame(columns=['Feature Name'],data=feature_name)
summary_table['Coefficient']=np.transpose(reg.coef_)
summary_table.index=summary_table.index + 1
summary_table.loc[0]=['Intercept',reg.intercept_[0]]
summary_table=summary_table.sort_index()
summary_table

Unnamed: 0,Feature Name,Coefficient
0,Intercept,-0.359258
1,Pclass,-0.801419
2,Sex,1.192053
3,Age,-0.398247
4,SibSp,-0.272312
5,Parch,-0.112626
6,Fare,0.313397
7,Q,0.124344
8,S,-0.415074


In [25]:
summary_table['Odds']=np.exp(summary_table.Coefficient)
summary_table.sort_values('Odds',ascending=False)

Unnamed: 0,Feature Name,Coefficient,Odds
2,Sex,1.192053,3.293838
6,Fare,0.313397,1.368065
7,Q,0.124344,1.132405
5,Parch,-0.112626,0.893485
4,SibSp,-0.272312,0.761616
0,Intercept,-0.359258,0.698194
3,Age,-0.398247,0.671496
8,S,-0.415074,0.660291
1,Pclass,-0.801419,0.448692


# Manually checking the accuracy

In [26]:
model_outputs=reg.predict(x_train)
x_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Q,S
811,0.902587,-0.737695,0.711113,-0.474545,-0.473674,-0.162169,0,1
29,0.902587,-0.737695,0.018554,-0.474545,-0.473674,-0.489442,0,1
49,0.902587,1.355574,-0.904859,0.432793,-0.473674,-0.290024,0,1
105,0.902587,-0.737695,-0.135348,-0.474545,-0.473674,-0.489442,0,1
616,0.902587,-0.737695,0.326358,0.432793,0.767630,-0.358482,0,1
...,...,...,...,...,...,...,...,...
218,-1.107926,1.355574,0.172456,-0.474545,-0.473674,0.887688,0,0
223,0.902587,-0.737695,0.018554,-0.474545,-0.473674,-0.489442,0,1
271,0.902587,-0.737695,-0.366202,-0.474545,-0.473674,-0.648422,0,1
474,0.902587,1.355574,-0.597055,-0.474545,-0.473674,-0.450347,0,1


In [27]:
np.sum(model_outputs==y_train)

574

In [28]:
model_outputs.shape

(712,)

In [29]:
acc=(np.sum(model_outputs==y_train)/model_outputs.shape)*100
print(*acc,"%")

80.6179775280899 %


# TEST

In [30]:
reg.score(x_test,y_test)

0.8324022346368715

In [39]:
predicted_proba=reg.predict_proba(x_test)
predicted_proba.shape

(179, 2)

In [40]:
#TEST DATASET

In [32]:
data=pd.read_csv('Titanic_test_preprocessed.csv')
data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Q,S
0,892,1,0,34.5,0,0,7.8292,1,0
1,893,1,1,47.0,1,0,7.0,0,1
2,894,0,0,62.0,0,0,9.6875,1,0
3,895,1,0,27.0,0,0,8.6625,0,1
4,896,1,1,22.0,1,1,12.2875,0,1


In [33]:
unscaled_inputs1=data.iloc[:,1:]
unscaled_inputs1.shape

(418, 8)

In [41]:
unscaled_inputs1.columns.values

array(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Q', 'S'],
      dtype=object)

In [42]:
columns_to_scale1=['Pclass','Sex', 'Age', 'SibSp', 'Parch', 'Fare']

In [43]:
scaler=  CustomScaler(columns_to_scale1)



In [44]:
scaler.fit(unscaled_inputs1)

AttributeError: 'CustomScaler' object has no attribute 'copy'

AttributeError: 'CustomScaler' object has no attribute 'copy'

In [45]:
scaled_input1=scaler.transform(unscaled_inputs1)

In [46]:
scaled_input1.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Q,S
0,0.957826,-0.755929,0.339424,-0.49947,-0.400248,-0.497811,1,0
1,0.957826,1.322876,1.329924,0.616992,-0.400248,-0.51266,0,1
2,-1.044031,-0.755929,2.518523,-0.49947,-0.400248,-0.464532,1,0
3,0.957826,-0.755929,-0.254876,-0.49947,-0.400248,-0.482888,0,1
4,0.957826,1.322876,-0.651076,0.616992,0.619896,-0.417971,0,1


In [65]:
scaled_input1['Fare'] = scaled_input1['Fare'].fillna(value = scaled_input1['Fare'].mode()[0])

In [67]:
y_predict=reg.predict(scaled_input1)

In [73]:
y_predict

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [71]:
scaled_input1.isnull().sum()

Pclass    0
Sex       0
Age       0
SibSp     0
Parch     0
Fare      0
Q         0
S         0
dtype: int64

# Submission

In [74]:
submission = pd.DataFrame()
submission['PassengerId'] = data['PassengerId']
submission['Survived'] = y_predict
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [75]:
submission.to_csv('submit.csv', index = False)