# TITANIC

In [92]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [93]:
train = pd.read_csv("/home/shaunak/Riddhi/Data Science/Data Science/inputs/train.csv")
test = pd.read_csv("/home/shaunak/Riddhi/Data Science/Data Science/inputs/test.csv")

In [94]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [95]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [96]:
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [97]:
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [98]:
train.shape

(891, 12)

## Data Cleaning

In [99]:
train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis = 1, inplace = True)
passangerId = test['PassengerId']
test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis = 1, inplace = True)

In [100]:
train['Fare'] = train['Fare'].replace(0, train['Fare'].mean())
test['Fare'] = test['Fare'].replace(0, test['Fare'].mean())
test['Fare'].fillna(test['Fare'].mean(), inplace = True)

In [101]:
train['Age'].fillna(train['Age'].mean(), inplace = True)
test['Age'].fillna(test['Age'].mean(), inplace = True)

In [102]:
train['Embarked'].fillna(train['Embarked'].mode()[0], inplace = True)
test['Embarked'].fillna(test['Embarked'].mode()[0], inplace = True)

In [103]:
train.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [104]:
train.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

## Label Encoding

In [105]:
train['Sex'] = train['Sex'].apply(lambda val: 1 if val == 'male' else 0)
test['Sex'] = test['Sex'].apply(lambda val: 1 if val == 'male' else 0)

In [106]:
train['Embarked'] = train['Embarked'].map({'S' : 0, 'C': 1, 'Q': 2})
test['Embarked'] = test['Embarked'].map({'S' : 0, 'C': 1, 'Q': 2})

In [107]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,0
1,1,1,0,38.0,1,0,71.2833,1
2,1,3,0,26.0,0,0,7.925,0
3,1,1,0,35.0,1,0,53.1,0
4,0,3,1,35.0,0,0,8.05,0


In [108]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,34.5,0,0,7.8292,2
1,3,0,47.0,1,0,7.0,0
2,2,1,62.0,0,0,9.6875,2
3,3,1,27.0,0,0,8.6625,0
4,3,0,22.0,1,1,12.2875,0


In [109]:
train.var()

Survived       0.236772
Pclass         0.699015
Sex            0.228475
Age          169.052400
SibSp          1.216043
Parch          0.649728
Fare        2451.663181
Embarked       0.404081
dtype: float64

In [110]:
test.var()

Pclass         0.708690
Sex            0.231960
Age          159.631454
SibSp          0.804178
Parch          0.963203
Fare        3112.044543
Embarked       0.469932
dtype: float64

## Normalizing

In [111]:
train['Age'] = np.log(train['Age'])
train['Fare'] = np.log(train['Fare'])
test['Age'] = np.log(test['Age'])
test['Fare'] = np.log(test['Fare'])

In [112]:
train.var()

Survived    0.236772
Pclass      0.699015
Sex         0.228475
Age         0.499292
SibSp       1.216043
Parch       0.649728
Fare        0.866881
Embarked    0.404081
dtype: float64

In [113]:
test.var()

Pclass      0.708690
Sex         0.231960
Age         0.482596
SibSp       0.804178
Parch       0.963203
Fare        0.973274
Embarked    0.469932
dtype: float64

In [114]:
X = train.drop('Survived', axis = 1)
y = train['Survived']

In [115]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [116]:
sgb = GradientBoostingClassifier(subsample = 0.90, max_features = 0.70)
sgb.fit(X_train, y_train)

GradientBoostingClassifier(max_features=0.7, subsample=0.9)

In [117]:
sgb_acc = accuracy_score(y_test, sgb.predict(X_test))

print(f"Training Accuracy of Decision Tree Classifier is {accuracy_score(y_train, sgb.predict(X_train))}")
print(f"Test Accuracy of Decision Tree Classifier is {sgb_acc} \n")

print(f"{confusion_matrix(y_test, sgb.predict(X_test))}\n")
print(classification_report(y_test, sgb.predict(X_test)))

Training Accuracy of Decision Tree Classifier is 0.8932584269662921
Test Accuracy of Decision Tree Classifier is 0.8547486033519553 

[[105   5]
 [ 21  48]]

              precision    recall  f1-score   support

           0       0.83      0.95      0.89       110
           1       0.91      0.70      0.79        69

    accuracy                           0.85       179
   macro avg       0.87      0.83      0.84       179
weighted avg       0.86      0.85      0.85       179



In [118]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,3.540959,0,0,2.05786,2
1,3,0,3.850148,1,0,1.94591,0
2,2,1,4.127134,0,0,2.270836,2
3,3,1,3.295837,0,0,2.159003,0
4,3,0,3.091042,1,1,2.508582,0


In [127]:
predictions = pd.DataFrame(sgb.predict(test))
passangerId = pd.DataFrame(passangerId)

In [128]:
print(predictions)
print(passangerId)

     0
0    0
1    0
2    0
3    0
4    0
..  ..
413  0
414  1
415  0
416  0
417  0

[418 rows x 1 columns]
     PassengerId
0            892
1            893
2            894
3            895
4            896
..           ...
413         1305
414         1306
415         1307
416         1308
417         1309

[418 rows x 1 columns]


In [130]:
result = pd.concat([passangerId, predictions], axis=1)

In [131]:
result.to_csv("Submission.csv", header=["PassengerId", "Survived"], index=False)

## Training Whole Data

In [132]:
sgb_w = GradientBoostingClassifier(subsample = 0.90, max_features = 0.70)
sgb_w.fit(X, y)

GradientBoostingClassifier(max_features=0.7, subsample=0.9)

In [133]:
predictions_w = pd.DataFrame(sgb_w.predict(test))

In [134]:
result_w = pd.concat([passangerId, predictions_w], axis=1)

In [135]:
result_w.to_csv("Submission_w.csv", header=["PassengerId", "Survived"], index=False)

In [138]:
submission_w = pd.read_csv("Submission_w.csv")

In [139]:
submission_w

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [140]:
submission = pd.read_csv("Submission.csv")

In [141]:
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
