In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

### Training data

In [2]:
# Load the training survey data and preview its first rows.
df = pd.read_csv(filepath_or_buffer="./Train.csv")
df.head()

Unnamed: 0,Person_id,Survey_date,Round,Status,Tenure,Geography,Province,Matric,Degree,Diploma,...,Math,Mathlit,Additional_lang,Home_lang,Science,Female,Sa_citizen,Birthyear,Birthmonth,Target
0,Id_eqz61wz7yn,2022-02-23,2,studying,,Rural,Mpumalanga,1.0,0.0,0.0,...,0 - 29 %,,50 - 59 %,,0 - 29 %,1,1,2000,5,0
1,Id_kj5k3g5wud,2023-02-06,4,unemployed,427.0,Suburb,North West,1.0,0.0,0.0,...,30 - 39 %,,40 - 49 %,,30 - 39 %,1,1,1989,4,1
2,Id_9h0isj38y4,2022-08-08,3,other,,Urban,Free State,1.0,0.0,0.0,...,30 - 39 %,,40 - 49 %,,30 - 39 %,0,1,1996,7,1
3,Id_5ch3zwpdef,2022-03-16,2,unemployed,810.0,Urban,Eastern Cape,,,,...,,,,,,0,1,2000,1,0
4,Id_g4elxibjej,2023-03-22,4,studying,,Urban,Limpopo,,,,...,,,,,,1,1,1998,12,0


In [3]:
# Column labels of the training frame.
df.columns

Index(['Person_id', 'Survey_date', 'Round', 'Status', 'Tenure', 'Geography',
       'Province', 'Matric', 'Degree', 'Diploma', 'Schoolquintile', 'Math',
       'Mathlit', 'Additional_lang', 'Home_lang', 'Science', 'Female',
       'Sa_citizen', 'Birthyear', 'Birthmonth', 'Target'],
      dtype='object')

In [4]:
# Count the missing values in each column.
df.isnull().sum()

Person_id             0
Survey_date           0
Round                 0
Status                0
Tenure             1394
Geography             0
Province              0
Matric             1008
Degree             1831
Diploma            1809
Schoolquintile     1661
Math               3023
Mathlit            2667
Additional_lang    2002
Home_lang          3639
Science            3288
Female                0
Sa_citizen            0
Birthyear             0
Birthmonth            0
Target                0
dtype: int64

In [5]:
# Columns to drop: the ones the missing-value count flagged as containing nulls.
null_columns = [
    "Tenure", "Matric", "Degree", "Diploma", "Schoolquintile",
    "Math", "Mathlit", "Additional_lang", "Home_lang", "Science",
]

# Every remaining column, kept in the frame's original column order.
not_null_columns = df.columns[~df.columns.isin(null_columns)].tolist()

In [6]:
# Restrict the frame to the columns that have no missing values.
df = df.loc[:, not_null_columns]
df

Unnamed: 0,Person_id,Survey_date,Round,Status,Geography,Province,Female,Sa_citizen,Birthyear,Birthmonth,Target
0,Id_eqz61wz7yn,2022-02-23,2,studying,Rural,Mpumalanga,1,1,2000,5,0
1,Id_kj5k3g5wud,2023-02-06,4,unemployed,Suburb,North West,1,1,1989,4,1
2,Id_9h0isj38y4,2022-08-08,3,other,Urban,Free State,0,1,1996,7,1
3,Id_5ch3zwpdef,2022-03-16,2,unemployed,Urban,Eastern Cape,0,1,2000,1,0
4,Id_g4elxibjej,2023-03-22,4,studying,Urban,Limpopo,1,1,1998,12,0
...,...,...,...,...,...,...,...,...,...,...,...
4015,Id_3lxfvtmbkr,2023-02-13,4,wage employed,Rural,Limpopo,1,1,2000,1,0
4016,Id_u3uc3v9pts,2023-02-02,4,unemployed,Urban,Gauteng,0,1,1999,3,0
4017,Id_enkksj5q5r,2023-03-20,4,unemployed,Urban,KwaZulu-Natal,1,1,1991,10,0
4018,Id_3rtpd7kc1g,2022-03-15,2,studying,Urban,Eastern Cape,1,1,2000,1,1


In [7]:
# Integer code for each distinct Status label, numbered in order of first appearance.
status_col_coded = {label: code for code, label in enumerate(pd.unique(df['Status']))}
status_col_coded

{'studying': 0,
 'unemployed': 1,
 'other': 2,
 'wage employed': 3,
 'self employed': 4,
 'employment programme': 5,
 'wage and self employed': 6}

In [8]:
# Integer code for each distinct Geography label, numbered in order of first appearance.
geography_col_coded = {area: code for code, area in enumerate(pd.unique(df['Geography']))}
geography_col_coded

{'Rural': 0, 'Suburb': 1, 'Urban': 2}

In [9]:
# Integer code for each distinct Province label, numbered in order of first appearance.
province_col_coded = {province: code for code, province in enumerate(pd.unique(df['Province']))}
province_col_coded

{'Mpumalanga': 0,
 'North West': 1,
 'Free State': 2,
 'Eastern Cape': 3,
 'Limpopo': 4,
 'KwaZulu-Natal': 5,
 'Gauteng': 6,
 'Western Cape': 7,
 'Northern Cape': 8}

In [10]:
# Work on an independent copy so the encoded columns do not overwrite df.
data = df.copy(deep=True)
data.head()

Unnamed: 0,Person_id,Survey_date,Round,Status,Geography,Province,Female,Sa_citizen,Birthyear,Birthmonth,Target
0,Id_eqz61wz7yn,2022-02-23,2,studying,Rural,Mpumalanga,1,1,2000,5,0
1,Id_kj5k3g5wud,2023-02-06,4,unemployed,Suburb,North West,1,1,1989,4,1
2,Id_9h0isj38y4,2022-08-08,3,other,Urban,Free State,0,1,1996,7,1
3,Id_5ch3zwpdef,2022-03-16,2,unemployed,Urban,Eastern Cape,0,1,2000,1,0
4,Id_g4elxibjej,2023-03-22,4,studying,Urban,Limpopo,1,1,1998,12,0


In [11]:
# Replace the three categorical text columns with their integer codes.
# The values are read from df, which this cell never modifies, so re-running
# the cell yields the same encoded frame every time.
# Series.map returns a Series for each column, so no DataFrame annotation
# belongs on these assignments.
data['Status'] = df['Status'].map(status_col_coded)
data['Geography'] = df['Geography'].map(geography_col_coded)
data['Province'] = df['Province'].map(province_col_coded)
data

Unnamed: 0,Person_id,Survey_date,Round,Status,Geography,Province,Female,Sa_citizen,Birthyear,Birthmonth,Target
0,Id_eqz61wz7yn,2022-02-23,2,0,0,0,1,1,2000,5,0
1,Id_kj5k3g5wud,2023-02-06,4,1,1,1,1,1,1989,4,1
2,Id_9h0isj38y4,2022-08-08,3,2,2,2,0,1,1996,7,1
3,Id_5ch3zwpdef,2022-03-16,2,1,2,3,0,1,2000,1,0
4,Id_g4elxibjej,2023-03-22,4,0,2,4,1,1,1998,12,0
...,...,...,...,...,...,...,...,...,...,...,...
4015,Id_3lxfvtmbkr,2023-02-13,4,3,0,4,1,1,2000,1,0
4016,Id_u3uc3v9pts,2023-02-02,4,1,2,6,0,1,1999,3,0
4017,Id_enkksj5q5r,2023-03-20,4,1,2,5,1,1,1991,10,0
4018,Id_3rtpd7kc1g,2022-03-15,2,0,2,3,1,1,2000,1,1


In [12]:
# Feature columns used for modelling; Birthyear and Birthmonth are currently left out.
train_cols = ["Round", "Status", "Geography", "Province", "Female", "Sa_citizen"]
train_cols

['Round', 'Status', 'Geography', 'Province', 'Female', 'Sa_citizen']

In [13]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data[train_cols],
    data['Target'],
    test_size=0.3,
    random_state=424,
)
tuple(split.shape for split in (x_train, x_test, y_train, y_test))

((2814, 6), (1206, 6), (2814,), (1206,))

### Testing Data

In [20]:
# Load the test survey data and preview its first rows.
test_df = pd.read_csv(filepath_or_buffer="./Test.csv")
test_df.head()

Unnamed: 0,Person_id,Survey_date,Round,Status,Tenure,Geography,Province,Matric,Degree,Diploma,Schoolquintile,Math,Mathlit,Additional_lang,Home_lang,Science,Female,Sa_citizen,Birthyear,Birthmonth
0,Id_r90136smvl,2022-08-03,3,other,,Urban,KwaZulu-Natal,1.0,0.0,0.0,2.0,0 - 29 %,,50 - 59 %,,40 - 49 %,0,1,2002,12
1,Id_wawdqhmu6s,2023-03-16,4,unemployed,979.0,Urban,Western Cape,1.0,0.0,0.0,,,,40 - 49 %,,,1,1,1989,12
2,Id_ap2czff2bu,2023-03-14,4,unemployed,339.0,Urban,KwaZulu-Natal,0.0,0.0,0.0,1.0,,,,,,1,1,1989,12
3,Id_uhgink7iha,2023-02-16,4,studying,,Urban,Gauteng,1.0,0.0,0.0,1.0,,80 - 100 %,60 - 69 %,,,0,1,2002,11
4,Id_5j6bzk3k81,2023-03-23,4,unemployed,613.0,Urban,Gauteng,0.0,0.0,0.0,5.0,,,,,,1,1,1993,10


In [23]:
# Keep only the model's feature columns in the test frame.
# NOTE(review): Status, Geography and Province are still raw strings here, while the
# model is trained on integer codes; presumably the same mappings must be applied
# before predicting on this frame — confirm.
test_df = test_df.loc[:, train_cols]
test_df.head()

Unnamed: 0,Round,Status,Geography,Province,Female,Sa_citizen
0,3,other,Urban,KwaZulu-Natal,0,1
1,4,unemployed,Urban,Western Cape,1,1
2,4,unemployed,Urban,KwaZulu-Natal,1,1
3,4,studying,Urban,Gauteng,0,1
4,4,unemployed,Urban,Gauteng,1,1


### Models

#### Decision Trees

In [14]:
# Depth- and leaf-size-limited decision tree.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split, so an unseeded tree can differ from one run to the next even
# though the train/test split and the CV folds are already seeded.
dt = DecisionTreeClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=5, random_state=42)

# 10-fold stratified, shuffled cross-validation on the training split only.
cv = StratifiedKFold(random_state=42, n_splits=10, shuffle=True)

scores = cross_val_score(dt, x_train, y_train, cv=cv, scoring="accuracy")

print(f"mean score: {scores.mean()}")

mean score: 0.801714999621413


In [15]:
# Fit the tree on the training split, then label the held-out rows.
predictions = dt.fit(x_train, y_train).predict(x_test)
predictions

array([0, 0, 0, ..., 1, 0, 1])

### Metrics

In [16]:
# Per-class precision, recall and F1 on the held-out split.
# zero_division=1 sets the value reported when a metric's denominator is zero.
print(classification_report(y_true=y_test, y_pred=predictions, zero_division=1))

              precision    recall  f1-score   support

           0       0.81      0.94      0.87       869
           1       0.73      0.45      0.55       337

    accuracy                           0.80      1206
   macro avg       0.77      0.69      0.71      1206
weighted avg       0.79      0.80      0.78      1206



In [17]:
# Fraction of held-out rows whose label was predicted correctly.
accuracy_score(y_true=y_test, y_pred=predictions)

0.7993366500829188

In [18]:
# F1 score on the held-out rows (matches the class-1 row of the report above).
f1_score(y_true=y_test, y_pred=predictions)

0.5535055350553505