In [1]:
# import necessary libraries
import pandas as pd  
from sklearn.model_selection import train_test_split  
from sklearn.tree import DecisionTreeClassifier  
from sklearn import tree 
from sklearn.metrics import accuracy_score, confusion_matrix  
import graphviz

In [8]:
# read the CSV into a pandas DataFrame and discard every row that has an NA value
df = pd.read_csv('overdrawn.csv').dropna()

# sanity check: show the first rows of the cleaned frame
print(df.head())

# convert the numeric DaysDrink variable to a categorical representation:
#   0 -> fewer than 7 days, 1 -> 7 to 13 days, 2 -> 14 or more days
# The codes are computed with vectorized comparisons on the whole column instead
# of a per-row Python loop. Each comparison that holds contributes 1 to the code
# (a value >= 14 also satisfies >= 7, hence 2).
days_drink = df['DaysDrink']
drink_level = (days_drink >= 7).astype(int) + (days_drink >= 14).astype(int)
# cast back to the column's original dtype so the column keeps the dtype it had
# before the conversion
df['DaysDrink'] = drink_level.astype(days_drink.dtype)

# test changed 'DaysDrink' column
print(df.head())

    Age  Sex  DaysDrink  Overdrawn
0  19.0  1.0        3.0        0.0
1  19.0  0.0       20.0        0.0
2  19.0  0.0        6.0        0.0
3  19.0  1.0       10.0        0.0
4  19.0  1.0        0.0        0.0
    Age  Sex  DaysDrink  Overdrawn
0  19.0  1.0        0.0        0.0
1  19.0  0.0        2.0        0.0
2  19.0  0.0        0.0        0.0
3  19.0  1.0        1.0        0.0
4  19.0  1.0        0.0        0.0


In [3]:
X = df.drop(columns='Overdrawn') # define the feature variables
y = df['Overdrawn'] # define the target variable

# split the dataset into training and testing subsets using 80-20 split
# - random_state pins the shuffle, so re-running the notebook reproduces the same
#   split (and therefore the same metrics) instead of a new one each time
# - stratify=y keeps the proportion of Overdrawn classes the same in both subsets,
#   so the test set always contains examples of each class
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [4]:
# initialize a Decision Tree Classifier
# random_state fixes the estimator's internal randomness so repeated runs on the
# same training data grow the same tree
dtree = DecisionTreeClassifier(random_state=42)

# train the Decision Tree model on the training data
dtree.fit(X_train, y_train)

# make predictions on the test set
predictions = dtree.predict(X_test)

# evaluate the model's performance
print("Accuracy Score:", accuracy_score(y_test, predictions))

# print the confusion matrix (rows are the true classes, columns the predicted ones)
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))

Accuracy Score: 0.9318181818181818
Confusion Matrix:
 [[81  0]
 [ 6  1]]


In [5]:
# export decision tree to a format compatible with Graphviz
# feature names are taken from the frame the tree was fitted on, so the node labels
# follow the actual column order instead of a hand-maintained tuple
dot_data = tree.export_graphviz(dtree, out_file=None,
                                feature_names=list(X_train.columns),  # feature names
                                class_names=('0', '1'),  # class labels for target variables
                                filled=True)

# create graph from exported decision tree data
graph = graphviz.Source(dot_data, format="png") # generate image of tree
graph.render('overdrawn_dt', view=True) # save and open image

'overdrawn_dt.png'

In [7]:
# Results:
# After testing this Decision Tree Classifier model with various changes (e.g., different
# train/test split ratios and slightly changing the classifications), I was able to create a
# decision-tree-based model that predicts student overdrawing with accuracies of over 85%.
# Additionally, I took the suggestion from the problem statement of converting the numeric
# 'DaysDrink' variable to a categorical one. Overall, this conversion resulted in an accuracy
# increase ranging from 5% to 15%.
# Note: accuracy alone can be optimistic on this data. In the recorded run above, the
# confusion matrix shows that only 1 of the 7 overdrawn students in the test set was
# predicted correctly.