# Here is my code for a Decision Tree Classifier that uses GridSearchCV to try hyperparameter combinations automatically.

First we import our libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

Then we store each file path in a variable and read the train and test files with pandas `pd.read_csv()`.

In [2]:
# File locations of the Titanic CSV files inside the Kaggle input directory.
train_url = '/kaggle/input/titanic/train.csv'
test_url = '/kaggle/input/titanic/test.csv'
gender_submission_url = '/kaggle/input/titanic/gender_submission.csv'

# Load the train and test CSV files into DataFrames.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_url, test_url))



In [3]:
# Read the gender_submission CSV into y_test.
# NOTE(review): in the visible cells y_test is only inspected (shape/head) and never
# passed to the model — confirm it is still needed.
y_test = pd.read_csv(filepath_or_buffer=gender_submission_url)

In [4]:
# Display the (rows, columns) shape of y_test.
y_test.shape


In [5]:
# Preview the first rows of y_test.
y_test.head()

Here we take the Survived column from df_train and assign it to y_train. We also take the PassengerId numbers from the test data and save them to a variable `passengerid` for use later.

In [6]:
# Keep the target column as a one-column DataFrame.
y_train = df_train[['Survived']]
# x_test starts out as a reference to the raw test frame.
x_test = df_test
# Save the test passenger ids; they are needed again when building the submission.
passengerid = df_test['PassengerId']

In [7]:
# Preview the first PassengerId values of the test data.
x_test["PassengerId"].head()

In [8]:
# Preview the first rows of the raw training data.
df_train.head()

We drop the text columns (Name, Ticket, Cabin) that our classifier cannot use directly, and we also remove the Survived target from the training features.

In [9]:
# Training features: drop the text columns and the Survived target.
x_train = df_train.drop(['Name', 'Ticket', 'Cabin', 'Survived'], axis='columns')

In [10]:
# Test features: drop the same text columns as for the training features.
x_test = df_test.drop(['Name', 'Ticket', 'Cabin'], axis='columns')

Now we must fill our empty cells with sensible values. Age and Fare are filled with their average value, and missing Embarked entries are filled with 'S'.

In [11]:
# Mean age of each split, used to fill in the missing Age values.
age_mean_train = x_train.loc[:, 'Age'].mean()
age_mean_test = x_test.loc[:, 'Age'].mean()

In [12]:
# Mean fare of the test split, used to fill in the missing Fare values.
fare_mean_test = x_test.loc[:, 'Fare'].mean()

In [13]:
# Fill missing Age / Fare values with the means computed above.
# The result is assigned back to the column instead of calling
# `frame[col].fillna(..., inplace=True)`: that chained in-place form acts on a
# temporary Series, emits a FutureWarning on recent pandas 2.x releases and
# leaves the frame unchanged once Copy-on-Write is enabled.
x_train['Age'] = x_train['Age'].fillna(age_mean_train)
x_test['Age'] = x_test['Age'].fillna(age_mean_test)
x_test['Fare'] = x_test['Fare'].fillna(fare_mean_test)

In [14]:
# Fill missing Embarked values with 'S'.
# NOTE(review): 'S' is presumably the most frequent port — confirm against the
# value_counts() output below.
# Assigned back to the column rather than using a chained `inplace=True` call,
# which does not update the frame under pandas Copy-on-Write.
x_train['Embarked'] = x_train['Embarked'].fillna('S')
x_test['Embarked'] = x_test['Embarked'].fillna('S')

This cell looks to see if there are any more empty cells.

In [15]:
# Report the number of remaining empty cells per feature column, first for
# x_train and then for x_test. Each entry is (label printed, column name); the
# two differ only for Pclass, which is printed as "Passenger class".
null_report_columns = [
    ('Age', 'Age'),
    ('Embarked', 'Embarked'),
    ('Fare', 'Fare'),
    ('Parch', 'Parch'),
    ('Passenger class', 'Pclass'),
    ('Sex', 'Sex'),
    ('SibSp', 'SibSp'),
]
for frame_name, frame in (('x_train', x_train), ('x_test', x_test)):
    for label, column in null_report_columns:
        print(f'{frame_name} {label} Empty Cells: ', frame[column].isnull().sum())

In [16]:
# Show how often each Embarked value occurs in the training features.
x_train['Embarked'].value_counts()

Let's encode the Sex and Embarked columns as numeric codes so that our classifier can work with them. (Note that this is a simple label encoding, not a one-hot encoding.)

In [17]:
# Encode the categorical training columns as integer codes.
# The codes are real integers (not the digit strings "1", "2", ...), so the
# columns get a numeric dtype instead of staying object dtype.
# `.get(value, value)` leaves anything that is not a known category untouched
# (already-encoded values, NaN), which keeps this cell safe to re-run.
embarked_codes = {'S': 1, 'C': 2, 'Q': 3}
sex_codes = {'male': 1, 'female': 2}
x_train['Embarked'] = x_train['Embarked'].map(lambda port: embarked_codes.get(port, port))
x_train['Sex'] = x_train['Sex'].map(lambda sex: sex_codes.get(sex, sex))

In [18]:
# Encode the categorical test columns with the same integer codes used for
# the training features.
# The codes are real integers (not the digit strings "1", "2", ...), so the
# columns get a numeric dtype instead of staying object dtype.
# `.get(value, value)` leaves anything that is not a known category untouched
# (already-encoded values, NaN), which keeps this cell safe to re-run.
embarked_codes = {'S': 1, 'C': 2, 'Q': 3}
sex_codes = {'male': 1, 'female': 2}
x_test['Embarked'] = x_test['Embarked'].map(lambda port: embarked_codes.get(port, port))
x_test['Sex'] = x_test['Sex'].map(lambda sex: sex_codes.get(sex, sex))

In [19]:
x_train.set_index('PassengerId')

In [20]:
x_test.set_index('PassengerId')

We must set our parameters for our decision tree classifier and assign the classifier to a variable for use in the GridSearchCV() function.

In [21]:
# Hyperparameter grid that GridSearchCV will search exhaustively.
parameters = {'criterion': ['gini', 'entropy'],
     'splitter': ['best', 'random'],
     'max_depth': [2*n for n in range(1,10)],
     # 'auto' is no longer accepted by recent scikit-learn releases and was
     # only an alias of 'sqrt' for classifiers; None means "use all features".
     'max_features': ['sqrt', None],
     'min_samples_leaf': [1, 2, 4],
     'min_samples_split': [2, 5, 10]}

# Fixed seed so that random splits and feature subsampling give the same
# result on every run.
tree = DecisionTreeClassifier(random_state=42)

Finally we use GridSearchCV() to find the best parameters for our DecisionTreeClassifier() function.

In [22]:
# Search over every combination in `parameters` using 10 cross-validation folds.
tree_cv = GridSearchCV(estimator=tree, param_grid=parameters, cv=10)
tree_cv.fit(X=x_train, y=y_train)

In [23]:
# Report the winning parameter combination and its score from the search.
best_params = tree_cv.best_params_
best_score = tree_cv.best_score_
print("tuned hpyerparameters :(best parameters) ", best_params)
print("accuracy :", best_score)

We make our predictions with the GridSearchCV() function and assign it to the 'submission' variable.

In [24]:
# Predict labels for the test features with the fitted grid search.
submission = tree_cv.predict(X=x_test)

In [25]:
# Wrap the predictions in a DataFrame (single column, labelled 0).
submission = pd.DataFrame(data=submission)

In [26]:
# Attach the passenger ids saved earlier as a new PassengerId column.
submission = submission.assign(PassengerId=passengerid)


In [27]:
# Give the prediction column (labelled 0) its final name.
submission = submission.rename(columns={0: "Survived"})

In [28]:
# Preview the first rows of the submission frame.
submission.head()

In [29]:
#submission = submission[['PassengerId', 'Survived']]

In [30]:
# Use PassengerId as the index so it is written as the first CSV column.
submission = submission.set_index('PassengerId')

After that, we are ready to export our dataframe to a .csv file for submission.

In [31]:
# Write the submission (index included) to submission.csv in the working directory.
submission.to_csv(path_or_buf='submission.csv')