In [8]:
import pickle

import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from src.config import CLEANED_TARP_CSV_FILENAME, TEST_DATA_PERCENTAGE, MODEL_PICKLE_FILENAME

In [9]:
class_column = "status"

In [10]:
tarp_df = pd.read_csv(CLEANED_TARP_CSV_FILENAME)
tarp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23995 entries, 0 to 23994
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   soil_moisture    23995 non-null  int64  
 1   temperature      23995 non-null  int64  
 2   time             23995 non-null  int64  
 3   air_temperature  23995 non-null  float64
 4   air_humidity     23995 non-null  float64
 5   pressure         23995 non-null  float64
 6   status           23995 non-null  int64  
dtypes: float64(3), int64(4)
memory usage: 1.3 MB


In [11]:
x_train, x_test, y_train, y_test = train_test_split(
    tarp_df.drop(columns=[class_column]),
    tarp_df[class_column],
    test_size=TEST_DATA_PERCENTAGE,
)

In [12]:
classifier = DecisionTreeClassifier(max_depth=10, min_samples_split=5)
classifier.fit(x_train, y_train)

In [13]:
y_pred_train = classifier.predict(x_train)
y_pred_test = classifier.predict(x_test)

In [14]:
confusion_matrix(y_train, y_pred_train)

array([[8094,  742],
       [ 708, 9652]])

In [15]:
confusion_matrix(y_test, y_pred_test)

array([[1989,  197],
       [ 207, 2406]])

In [16]:
f1_score(y_train, y_pred_train)

0.9301339500819119

In [17]:
f1_score(y_test, y_pred_test)

0.9225460122699386

In [18]:
with open(MODEL_PICKLE_FILENAME, "wb") as f:
    pickle.dump(classifier, f)

In [19]:
classifier.feature_importances_

array([0.22317366, 0.25787714, 0.50116836, 0.00730925, 0.0050804 ,
       0.00539119])