In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score


In [2]:
titanic_data = pd.read_csv('../ml/dataset/titanic-2.csv')
titanic_data.head()
titanic_data.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [5]:
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [13]:
survived_counts = titanic_data['Survived'].value_counts(normalize=True) * 100
Pclass = titanic_data['Pclass'].value_counts()
sex = titanic_data['Sex'].value_counts()



# Display the counts
print(survived_counts)
print(Pclass)
print(sex)


Survived
0    61.616162
1    38.383838
Name: proportion, dtype: float64
Pclass
3    491
1    216
2    184
Name: count, dtype: int64
Sex
male      577
female    314
Name: count, dtype: int64


In [4]:
titanic_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [72]:
titanic_data = titanic_data.drop(['Name','Ticket','Cabin'],axis=1)


In [73]:
titanic_data['Age'].fillna(titanic_data['Age'].mean(),inplace=True)
titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0],inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_data['Age'].fillna(titanic_data['Age'].mean(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0],inplace=True)


In [74]:
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [75]:
Label_encoder = LabelEncoder()
titanic_data['Sex'] = Label_encoder.fit_transform(titanic_data['Sex'])
titanic_data['Embarked'] = Label_encoder.fit_transform(titanic_data['Embarked'])


In [76]:
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.925,2
3,4,1,1,0,35.0,1,0,53.1,2
4,5,0,3,1,35.0,0,0,8.05,2


In [77]:
X = titanic_data.drop('Survived', axis=1)
y = titanic_data['Survived']

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(len(X_train))
print(len(X_test))

712
179


In [79]:
clf = DecisionTreeClassifier(random_state=42)
clf

In [80]:
param_grid = {
    'criterion' : ['gini','entropy'],
    'max_depth' : [None,10,20,30,40,50,60,70],
    'min_samples_split' : [1,2,3,4,5,6,7,8,9,10,12],
    'min_samples_leaf' : [2,3,4,5,6,7,8,9,50]
}
param_grid

{'criterion': ['global'],
 'max_depth': [None, 10, 20, 30, 40, 50, 60, 70],
 'min_samples_split': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12],
 'min_samples_leaf': [2, 3, 4, 5, 6, 7, 8, 9, 50]}

In [81]:
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=6, scoring='accuracy' , n_jobs=-1)
grid_search.fit(X_train,y_train)

ValueError: 
All the 4752 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
593 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Kuldip\anaconda3\envs\bia\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Kuldip\anaconda3\envs\bia\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\Kuldip\anaconda3\envs\bia\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Kuldip\anaconda3\envs\bia\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'criterion' parameter of DecisionTreeClassifier must be a str among {'entropy', 'log_loss', 'gini'}. Got 'global' instead.

--------------------------------------------------------------------------------
1202 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Kuldip\anaconda3\envs\bia\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Kuldip\anaconda3\envs\bia\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\Kuldip\anaconda3\envs\bia\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Kuldip\anaconda3\envs\bia\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'criterion' parameter of DecisionTreeClassifier must be a str among {'gini', 'log_loss', 'entropy'}. Got 'global' instead.

--------------------------------------------------------------------------------
1185 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Kuldip\anaconda3\envs\bia\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Kuldip\anaconda3\envs\bia\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\Kuldip\anaconda3\envs\bia\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Kuldip\anaconda3\envs\bia\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'criterion' parameter of DecisionTreeClassifier must be a str among {'gini', 'entropy', 'log_loss'}. Got 'global' instead.

--------------------------------------------------------------------------------
1181 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Kuldip\anaconda3\envs\bia\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Kuldip\anaconda3\envs\bia\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\Kuldip\anaconda3\envs\bia\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Kuldip\anaconda3\envs\bia\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'criterion' parameter of DecisionTreeClassifier must be a str among {'log_loss', 'entropy', 'gini'}. Got 'global' instead.

--------------------------------------------------------------------------------
591 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Kuldip\anaconda3\envs\bia\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Kuldip\anaconda3\envs\bia\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\Kuldip\anaconda3\envs\bia\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Kuldip\anaconda3\envs\bia\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'criterion' parameter of DecisionTreeClassifier must be a str among {'log_loss', 'gini', 'entropy'}. Got 'global' instead.


In [66]:
best_params = grid_search.best_params_
best_params

{'criterion': 'gini',
 'max_depth': 10,
 'min_samples_leaf': 7,
 'min_samples_split': 2}

In [67]:
best_clf = DecisionTreeClassifier(random_state=42, **best_params)
best_clf.fit(X_train,y_train)

In [68]:
y_pred =best_clf.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
print("accuracy===>",accuracy)

accuracy===> 0.8324022346368715
