In [1]:
import numpy as np
import pandas as pd
from keras import Model
from sklearn.svm import SVC
from keras.optimizers import Adam
from keras.regularizers import l2, l1
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from keras.layers import Bidirectional, LSTM, Dense, Dropout, BatchNormalization, Input, Activation

## Data

In [2]:
# Load the pre-cleaned training table and preview its first five rows.
df_train = pd.read_csv(filepath_or_buffer="./clean_train.csv")
df_train.head(n=5)

Unnamed: 0,Tenure,Matric,Degree,Diploma,Female,Sa_citizen,Birthyear,Birthmonth,Target,Year_survey,...,Home_lang_70_79,Home_lang_80_100,Home_lang_nan,Science_30_39,Science_40_49,Science_50_59,Science_60_69,Science_70_79,Science_80_100,Science_nan
0,0.0,1.0,0.0,0.0,1,1,2000,5,0,2022,...,False,False,True,False,False,False,False,False,False,False
1,427.0,1.0,0.0,0.0,1,1,1989,4,1,2023,...,False,False,True,True,False,False,False,False,False,False
2,0.0,1.0,0.0,0.0,0,1,1996,7,1,2022,...,False,False,True,True,False,False,False,False,False,False
3,810.0,0.0,0.0,0.0,0,1,2000,1,0,2022,...,False,False,True,False,False,False,False,False,False,True
4,0.0,0.0,0.0,0.0,1,1,1998,12,0,2023,...,False,False,True,False,False,False,False,False,False,True


In [3]:
# Show the column labels of the cleaned training frame.
df_train.columns

Index(['Tenure', 'Matric', 'Degree', 'Diploma', 'Female', 'Sa_citizen',
       'Birthyear', 'Birthmonth', 'Target', 'Year_survey', 'Age_survey',
       'Subjects_over_70', 'Round_20', 'Round_30', 'Round_40', 'Round_nan',
       'Status_other', 'Status_self_employed', 'Status_studying',
       'Status_unemployed', 'Status_wage_and_self_employed',
       'Status_wage_employed', 'Status_nan', 'Geography_Suburb',
       'Geography_Urban', 'Geography_nan', 'Province_Free_State',
       'Province_Gauteng', 'Province_KwaZuluNatal', 'Province_Limpopo',
       'Province_Mpumalanga', 'Province_North_West', 'Province_Northern_Cape',
       'Province_Western_Cape', 'Province_nan', 'Schoolquintile_10',
       'Schoolquintile_20', 'Schoolquintile_30', 'Schoolquintile_40',
       'Schoolquintile_50', 'Schoolquintile_nan', 'Math_30_39', 'Math_40_49',
       'Math_50_59', 'Math_60_69', 'Math_70_79', 'Math_80_100', 'Math_nan',
       'Mathlit_30_39', 'Mathlit_40_49', 'Mathlit_50_59', 'Mathlit_60_69',
  

In [4]:
# Check which distinct values the 'Province_nan' column takes in the training set.
df_train.loc[:, 'Province_nan'].unique()

array([False])

In [5]:
# (rows, columns) of the training frame; the 'Target' column is still included here.
df_train.shape

(4020, 74)

In [6]:
# Separate the label vector from the feature matrix; df_train itself is left untouched.
targets = df_train['Target']
train = df_train.drop(columns=['Target'])

# Display both shapes so the row counts can be compared.
train.shape, targets.shape

((4020, 73), (4020,))

## Test Data

In [7]:
# Raw (uncleaned) test set. The commented-out submission cell at the end of the
# notebook reads its "Person_id" column to label the predictions.
test_unclean_df = pd.read_csv(filepath_or_buffer="./Test.csv")
test_unclean_df.head(n=5)

Unnamed: 0,Person_id,Survey_date,Round,Status,Tenure,Geography,Province,Matric,Degree,Diploma,Schoolquintile,Math,Mathlit,Additional_lang,Home_lang,Science,Female,Sa_citizen,Birthyear,Birthmonth
0,Id_r90136smvl,2022-08-03,3,other,,Urban,KwaZulu-Natal,1.0,0.0,0.0,2.0,0 - 29 %,,50 - 59 %,,40 - 49 %,0,1,2002,12
1,Id_wawdqhmu6s,2023-03-16,4,unemployed,979.0,Urban,Western Cape,1.0,0.0,0.0,,,,40 - 49 %,,,1,1,1989,12
2,Id_ap2czff2bu,2023-03-14,4,unemployed,339.0,Urban,KwaZulu-Natal,0.0,0.0,0.0,1.0,,,,,,1,1,1989,12
3,Id_uhgink7iha,2023-02-16,4,studying,,Urban,Gauteng,1.0,0.0,0.0,1.0,,80 - 100 %,60 - 69 %,,,0,1,2002,11
4,Id_5j6bzk3k81,2023-03-23,4,unemployed,613.0,Urban,Gauteng,0.0,0.0,0.0,5.0,,,,,,1,1,1993,10


In [8]:
# Cleaned test features; this is the frame the model cells pass to `.predict`.
test_data = pd.read_csv(filepath_or_buffer="./clean_test.csv")
test_data.head(n=5)

Unnamed: 0,Tenure,Matric,Degree,Diploma,Female,Sa_citizen,Birthyear,Birthmonth,Year_survey,Age_survey,...,Home_lang_70_79,Home_lang_80_100,Home_lang_nan,Science_30_39,Science_40_49,Science_50_59,Science_60_69,Science_70_79,Science_80_100,Science_nan
0,0.0,1.0,0.0,0.0,0,1,2002,12,2022,20,...,False,False,True,False,True,False,False,False,False,False
1,979.0,1.0,0.0,0.0,1,1,1989,12,2023,34,...,False,False,True,False,False,False,False,False,False,True
2,339.0,0.0,0.0,0.0,1,1,1989,12,2023,34,...,False,False,True,False,False,False,False,False,False,True
3,0.0,1.0,0.0,0.0,0,1,2002,11,2023,21,...,False,False,True,False,False,False,False,False,False,True
4,613.0,0.0,0.0,0.0,1,1,1993,10,2023,30,...,False,False,True,False,False,False,False,False,False,True


## Models

In [9]:
# Empty placeholder for the test-set predictions; the (currently commented-out)
# model cells below rebind this name to the output of their `.predict` call.
predictions = list()

#### Naive Bayes Gaussian

In [10]:
# # NB
# grid_search_params = {"var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]}

# cls_gnb = GaussianNB()

# grid_search = GridSearchCV(
#     estimator=cls_gnb, param_grid=grid_search_params, cv=10, scoring="accuracy"
# )

# grid_search.fit(train, targets)

# print(f"{grid_search.best_params_}\n\n")
# print(f"{grid_search.best_estimator_}")

In [11]:
# # perform cross val on data
# gnb = GaussianNB(var_smoothing=1e-07)

# cv = StratifiedKFold(random_state=42, n_splits=10, shuffle=True)

# scores = cross_val_score(gnb, train, targets, cv=cv, scoring='accuracy')

# print(f"mean score: {scores.mean()}")

In [12]:
# gnb.fit(train, targets)
# predictions = gnb.predict(test_data)
# predictions

#### SVM - SVC

In [13]:
# # SVC takes a long time to fit
# NOTE(review): the grid below is 6 * 4 * 4 * 6 = 576 combinations, each fitted
# 5 times (cv=5). Per the scikit-learn SVC docs, `degree` only affects the
# "poly" kernel and `gamma` is not used by "linear", so many of these fits are
# duplicates; a list of per-kernel grids would cut the search considerably.
# # "C": [0.08, 0.05, 0.03, 0.01, 0.1, 1, 3, 5, 7, 10],
# grid_search_params = {
#     "C": [0.01, 0.05, 0.1, 1, 5, 10],
#     "kernel": ["linear", "rbf", "poly", "sigmoid"],
#     "degree": [2, 3, 4, 5],
#     "gamma": ["scale", "auto"] + [0.001, 0.01, 0.1, 1],
# }

# cls_svc = SVC()

# grid_search = GridSearchCV(
#     estimator=cls_svc, param_grid=grid_search_params, cv=5, scoring="accuracy"
# )

# grid_search.fit(train, targets)

# print(f"{grid_search.best_params_}\n\n")
# print(f"{grid_search.best_estimator_}")

#### Deep Learning

In [14]:
# # remove province nan
# dl_data = np.array( train.drop('Province_nan', axis=1)).astype(np.float32)
# dl_data = dl_data.reshape(dl_data.shape[0], 9, -1 )
# dl_data.shape

In [15]:
# dl_labels = np.array(targets)
# np.unique( dl_labels)

In [16]:
# Three stacked bidirectional LSTM blocks (LSTM -> BatchNorm -> ReLU -> Dropout)
# followed by a single sigmoid unit, compiled for binary cross-entropy.
# NOTE(review): every LSTM layer, including the last one, is built with
# return_sequences=True, so the final Dense(1) is applied at each of the 9
# timesteps rather than once per sample. The prediction cell later indexes
# `[:,0]`, which would then keep only the first timestep. Presumably the last
# LSTM should use return_sequences=False — confirm before re-enabling.
# input_shape = (9, 8)

# input_layer = Input(shape=input_shape)

# # units = [256, 128, 64, 32]
# x = Bidirectional(LSTM(64, return_sequences=True, kernel_regularizer=(l2(0.01)) ))(input_layer)
# x = BatchNormalization()(x)
# x = Activation('relu')(x)
# x = Dropout(0.5)(x)
# #
# x = Bidirectional(LSTM(32, return_sequences=True, kernel_regularizer=(l2(0.01))))(x)
# x = BatchNormalization()(x)
# x = Activation('relu')(x)
# x = Dropout(0.5)(x)
# #
# x = Bidirectional(LSTM(16, return_sequences=True, kernel_regularizer=(l2(0.01))))(x)
# x = BatchNormalization()(x)
# x = Activation('relu')(x)
# x = Dropout(0.5)(x)

# output_layer = Dense(1, activation="sigmoid")(x)

# model = Model(inputs=input_layer, outputs=output_layer)

# # 
# learning_rate = 0.001
# optim_ = Adam(learning_rate=learning_rate)

# # 
# model.compile(optimizer=optim_, loss="binary_crossentropy", metrics=["accuracy"])

# model.summary()

In [17]:
# dl_labels = np.asarray(dl_labels).astype('int').reshape((-1,1))

In [18]:
# model.fit(dl_data, dl_labels, batch_size=16, epochs=20, validation_split=0.2)

In [19]:
# # remove province nan
# dl_test_data = np.array( test_data.drop('Province_nan', axis=1)).astype(np.float32)
# dl_test_data = dl_test_data.reshape(dl_test_data.shape[0], 9, -1 )
# dl_test_data.shape

# y_pred = model.predict(dl_test_data)

In [20]:
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 labels.
# NOTE(review): `[:,0]` implies the squeezed output is still 2-D and keeps only
# its first column; if that axis is the sequence/timestep axis, the other
# positions are silently discarded — confirm this is intended.
# y_pred = np.squeeze(y_pred)
# predictions= ( y_pred >= .5).astype('int')[:,0]
# predictions

#### Multinomial NB

In [21]:
# # NB
# grid_search_params = {
#     "alpha": [0.1, 0.5, 1.0, 2.0],
#     "fit_prior": [True, False],
# }

# cls_mnb = MultinomialNB()

# grid_search = GridSearchCV(
#     estimator=cls_mnb, param_grid=grid_search_params, cv=30, scoring="accuracy"
# )

# grid_search.fit(train, targets)

# print(f"Best Params: {grid_search.best_params_}")
# print(f"Best Estimator: {grid_search.best_estimator_}")
# print(f"Best Score: {grid_search.best_score_}")

In [22]:
# NOTE(review): `cls_mnb` is the template estimator that was handed to
# GridSearchCV, which fits clones of it. This call therefore trains a
# MultinomialNB with its default hyperparameters, not the tuned ones; with the
# default refit=True, `grid_search.best_estimator_` already holds the tuned model.
# cls_mnb.fit(train, targets)

In [23]:
# predictions = cls_mnb.predict(test_data)
# predictions

#### Random Forest Classifier

In [24]:
# Exhaustive grid search over RandomForestClassifier hyperparameters (10-fold CV).
# NOTE(review): max_features='auto' was deprecated in scikit-learn 1.1 and
# removed in 1.3; on current versions every candidate using it fails to fit.
# For classifiers it was equivalent to 'sqrt', so it can simply be dropped
# from the list.
# grid_search_params = {
#     'n_estimators': [50, 100, 200],  # Number of trees in the forest
#     'max_depth': [None, 10, 20, 30],  # Maximum depth of each tree
#     'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
#     'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
#     'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider for the best split
# }

# cls_rf = RandomForestClassifier(random_state=42)

# grid_search = GridSearchCV(
#     estimator=cls_rf, param_grid=grid_search_params, cv=10, scoring="accuracy"
# )

# grid_search.fit(train, targets)

# # print(f"{grid_search.best_params_}\n\n")
# print(f"{grid_search.best_estimator_}")

In [25]:
# rf = RandomForestClassifier(
#     random_state=42, max_depth=30, min_samples_leaf=2, min_samples_split=5
# )

# cv = StratifiedKFold(random_state=42, n_splits=10, shuffle=True)

# scores = cross_val_score(rf, train, targets, cv=cv, scoring='accuracy')

# print(f"mean score: {scores.mean()}")

In [26]:
# rf.fit(train, targets)
# predictions = rf.predict(test_data)
# predictions

#### KNN

In [None]:
# grid_search_params = {
#     "n_neighbors": [3, 5, 7, 9],
#     "weights": ["uniform", "distance"],
#     "p": [1, 2],
#     # "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
# }

# cls_knn = KNeighborsClassifier()

# grid_search = GridSearchCV(
#     estimator=cls_knn, param_grid=grid_search_params, cv=10, scoring="accuracy"
# )

# grid_search.fit(train, targets)

# # print(f"{grid_search.best_params_}\n\n")
# print(f"{grid_search.best_estimator_}")

In [71]:
# knn = KNeighborsClassifier( n_neighbors=5, weights="distance", p=2 )

# cv = StratifiedKFold(random_state=42, n_splits=10, shuffle=True)

# scores = cross_val_score(knn, train, targets, cv=cv, scoring='accuracy')

# print(f"mean score: {scores.mean()}")

mean score: 0.7417910447761193


In [72]:
# knn.fit(train, targets)
# predictions = knn.predict(test_data)
# predictions

array([0, 1, 0, ..., 0, 0, 0])

## Save to file

In [73]:

# Pair each raw test "Person_id" with its predicted label for the submission file.
# NOTE(review): `predictions` is initialised as a plain list in the "Models"
# section, and lists have no `.astype`; one of the model cells must be
# re-enabled (rebinding `predictions` to a NumPy array) before this cell can run.
# df_submission = pd.DataFrame({"ID": test_unclean_df["Person_id"], "Target": predictions.astype(int)})
# print(df_submission.head())

              ID  Target
0  Id_r90136smvl       0
1  Id_wawdqhmu6s       1
2  Id_ap2czff2bu       0
3  Id_uhgink7iha       0
4  Id_5j6bzk3k81       0


In [74]:
# df_submission.to_csv("submission.csv", index=False)