In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the Titanic dataset from the working directory and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="titanic.csv")
data.head(n=5)

In [None]:
# (number of rows, number of columns) of the loaded dataset.
data.shape

##### Check for any missing values

In [None]:
# Count the missing entries in every column.
data.isna().sum()

##### Separating the independent and dependent (target) variables

In [None]:
# Target variable: the "Survived" column.
y = data["Survived"]
# Independent variables: every column except the target.
X = data.drop(columns=["Survived"])

##### To create the test set, import the train_test_split function

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing. Passing stratify=y keeps the distribution
# of the target variable the same in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=101,
    stratify=y,
)

##### Distribution in Training and Testing Set

In [None]:
# Print the relative class frequencies of the target for each split, framed by rules.
rule = "-" * 75
for heading, target in (
    ("Distribution in Training Set:", y_train),
    ("Distribution in Testing Set:", y_test),
):
    print(rule)
    print(heading)
    print(target.value_counts(normalize=True))
print(rule)

##### Shape of Training Set and Testing Set

In [None]:
# Shapes of the four split pieces, in the order X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

##### Import the DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import tree

# Baseline: a decision tree with no depth or leaf limit, fitted on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = DecisionTreeClassifier(random_state=10).fit(X_train, y_train)
model

##### checking the training and testing score

In [None]:
# Report the baseline model's score on the training and the testing split.
rule = "-" * 50
print(rule)
for caption, features, labels in (
    ("Training Score:", X_train, y_train),
    ("Testing Score:", X_test, y_test),
):
    print(caption, model.score(features, labels))
print(rule)

- As seen above, the training accuracy is high compared to the testing accuracy, which is a sign that the unconstrained tree is overfitting.

### Optimize the performance of Decision Trees:
1. Optimising max_depth
2. Optimising max_leaf_nodes

#### 1.Optimising max_depth

In [None]:
# Sweep max_depth over 1-9 and record the train and test accuracy for each depth.
# NOTE(review): the depth is later chosen from the test-set accuracy recorded here,
# so the test split is also acting as a validation set.
depths = list(range(1, 10))
train_accuracy, test_accuracy = [], []
for depth in depths:
    dt_model = DecisionTreeClassifier(random_state=10, max_depth=depth).fit(X_train, y_train)
    train_accuracy.append(dt_model.score(X_train, y_train))
    test_accuracy.append(dt_model.score(X_test, y_test))

# One row per depth, in sweep order.
frame = pd.DataFrame(
    {
        "max_depth": depths,
        "train_accuracy": train_accuracy,
        "test_accuracy": test_accuracy,
    }
)
frame

In [None]:
# Plot the train_accuracy and test_accuracy w.r.t. max_depth.
# Each curve carries a label and a legend is drawn, so the two lines can be told
# apart; the title lets the figure stand on its own when the notebook is skimmed.
plt.figure(figsize=(10,6))
plt.plot(frame['max_depth'], frame['test_accuracy'], marker='*', label='test_accuracy')
plt.plot(frame['max_depth'], frame['train_accuracy'], marker='*', label='train_accuracy')
plt.xlabel('Depth of tree')
plt.ylabel('performance')
plt.title('Accuracy vs. max_depth')
plt.legend()
# Render the figure explicitly so the cell does not end on a Text(...) repr.
plt.show()

- A low value of max_depth does not allow the model to learn the pattern, hence the low performance (underfitting).
- As max_depth grows, the training accuracy increases by a larger magnitude than the testing accuracy.
- **max_depth = 8** produces the **highest testing accuracy**, hence we can set **max_depth = 8**.

#### 2. Optimising max_leaf_nodes

###### Keeping max_depth = 8, let's check the max_leaf_nodes parameter to get the highest testing accuracy

In [None]:
# With max_depth fixed at 8, sweep max_leaf_nodes over 5, 10, ..., 30 and record the
# train and test accuracy for each setting.
# After the loop, dt_model is the tree fitted with the last value (max_leaf_nodes=30).
leaf_node_options = list(range(5, 35, 5))
train_accuracy, test_accuracy = [], []
for leaf_nodes in leaf_node_options:
    dt_model = DecisionTreeClassifier(
        random_state=10, max_depth=8, max_leaf_nodes=leaf_nodes
    ).fit(X_train, y_train)
    train_accuracy.append(dt_model.score(X_train, y_train))
    test_accuracy.append(dt_model.score(X_test, y_test))

# One row per max_leaf_nodes value; the scalar max_depth is repeated on every row.
frame = pd.DataFrame(
    {
        "max_depth": 8,
        "max_leaf_nodes": leaf_node_options,
        "train_accuracy": train_accuracy,
        "test_accuracy": test_accuracy,
    }
)
frame

#### Cross Checking the Training Score and Testing Score

In [None]:
# Score the current dt_model on both splits. When the notebook is run top to bottom,
# dt_model is the last tree fitted in the max_leaf_nodes loop.
rule = "-" * 50
print(rule)
for caption, features, labels in (
    ("Training Score:", X_train, y_train),
    ("Testing Score:", X_test, y_test),
):
    print(caption, dt_model.score(features, labels))
print(rule)

In [None]:
# Plot the train_accuracy and test_accuracy w.r.t. max_leaf_nodes.
# Each curve carries a label and a legend is drawn, so the two lines can be told
# apart; the title lets the figure stand on its own when the notebook is skimmed.
plt.figure(figsize=(10,6))
plt.plot(frame['max_leaf_nodes'], frame['test_accuracy'], marker='*', label='test_accuracy')
plt.plot(frame['max_leaf_nodes'], frame['train_accuracy'], marker='*', label='train_accuracy')
plt.xlabel('Leaf_nodes of tree')
plt.ylabel('performance')
plt.title('Accuracy vs. max_leaf_nodes')
plt.legend()
# Render the figure explicitly so the cell does not end on a Text(...) repr.
plt.show()

- It can thus be observed that the test_accuracy reaches its maximum of 0.811659 at max_leaf_nodes = 20 and remains constant thereafter.
- Hence we can take max_leaf_nodes as 20, 25 or 30.

#### Using GridSearchCV, optimise the hyper-parameters to get the highest testing accuracy

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the grid search: seven evenly spaced max_leaf_nodes candidates
# (5, 10, ..., 35) with max_depth held at 8.
# min_samples_split is intentionally left out of the search for now.
leaf_node_candidates = np.linspace(5, 35, num=7).astype(int).tolist()
para = {
    "max_leaf_nodes": leaf_node_candidates,
    "max_depth": [8],
}

In [None]:
# Grid search over `para`, scored by accuracy with 3-fold cross-validation, using
# all available cores.
# A fresh estimator is passed in rather than the `dt_model` variable left over from
# the max_leaf_nodes loop, so this cell no longer depends on which model that loop
# happened to fit last. The search is unchanged: the grid sets both max_leaf_nodes
# and max_depth for every candidate, and random_state stays at 10.
dt_GridSearch = GridSearchCV(
    DecisionTreeClassifier(random_state=10),
    param_grid=para,
    scoring="accuracy",
    n_jobs=-1,
    cv=3,
    verbose=2,
)

In [None]:
# Run the grid search on the training split only; the test split is not used here.
dt_GridSearch.fit(X_train, y_train)

In [None]:
# Report the fitted grid search's score on the training and the testing split.
rule = "-" * 50
print(rule)
for caption, features, labels in (
    ("Training Score:", X_train, y_train),
    ("Testing Score:", X_test, y_test),
):
    print(caption, dt_GridSearch.score(features, labels))
print(rule)

- The accuracy scores are more in sync as compared to what we got previously

In [None]:
# Parameter combination from `para` that the grid search selected as best.
dt_GridSearch.best_params_

In [None]:
# The estimator the grid search selected as best; this is the tree plotted and exported below.
dt_GridSearch.best_estimator_

- Using GridSearchCV, we have got the max_leaf_nodes=20

#### Plot a Decision Tree using the best_estimators

In [None]:
# Draw the tree chosen by the grid search.
# feature_names is passed as a plain list, matching the export_text call elsewhere in
# this notebook: a list is accepted everywhere, whereas a pandas Index has been
# rejected by some scikit-learn releases.
plt.figure(figsize=(20,20))
plot_tree(
    dt_GridSearch.best_estimator_,
    feature_names=list(X.columns),
    class_names=['0','1'],
    filled=True,
    fontsize=11,
    rounded=True,
)
plt.show()

#### Exporting Decision Trees in Textual Format

In [None]:
# Render the selected tree's decision rules as indented text and print them.
feature_list = X.columns.tolist()
tree_rules = tree.export_text(dt_GridSearch.best_estimator_, feature_names=feature_list)
print(tree_rules)