In [None]:
import time
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, plot_confusion_matrix
import plotly.graph_objects as go
# Load the apply-rate data set; the class_id column is discarded up front.
df = pd.read_csv('./dataset/Apply_Rate_2019.csv')
df = df.drop(columns=['class_id'])

# Number of missing values in each remaining column.
df.isna().sum()

In [None]:
# Drop exact duplicate rows so repeated records are only counted once.
df.drop_duplicates(inplace=True)

# Class balance of the target column.
# value_counts() orders labels by frequency while unique() orders them by first
# appearance, so mixing the two can put a bar under the wrong label. Take both
# the labels (index) and the bar heights (values) from the same Series.
apply_counts = df['apply'].value_counts()
fig = go.Figure([go.Bar(x=apply_counts.index, y=apply_counts.values)])
fig.update_layout(xaxis_type='category', title_text='Apply frequency', title_x=0.5)
fig.show()  # imbalanced

In [None]:
# Preprocessing: impute the missing values
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Fill missing values with scikit-learn's IterativeImputer (MICE-style), then
# round city_match back to 0/1, because assigning 0 directly to the missing
# city_match entries made the results worse.
# Earlier attempt that imputed the first six columns jointly (kept for reference):
# df.iloc[:,:6] = IterativeImputer().fit_transform(df.iloc[:,:6])
# Columns are addressed by position, so this depends on the CSV column order.
# NOTE(review): presumably columns 0-1 are the two *_proximity_tfidf features
# and column 5 is city_match -- confirm against the file's header.
# NOTE(review): both imputers are fitted on the whole frame, i.e. before the
# train/test split that happens in a later cell -- confirm this is intended.
df.iloc[:,:2] = IterativeImputer().fit_transform(df.iloc[:,:2])
# NOTE(review): this imputer only sees a single column, so it has no other
# feature to regress on -- check that it does more than a plain constant fill.
df.iloc[:,5:6] = IterativeImputer().fit_transform(df.iloc[:,5:6])
# Imputed values may be fractional; snap them to the nearest integer.
df['city_match'] = df['city_match'].round()

# Summary statistics of the frame after imputation.
df.describe()

In [None]:
# Verify the imputation: count the remaining NaNs once for every column, then
# report the three columns that were imputed.
nan_counts = df.isnull().sum()
print('number of NaN in title_proximity_tfidf:', nan_counts['title_proximity_tfidf'])
print('number of NaN in description_proximity_tfidf:', nan_counts['description_proximity_tfidf'])
print('number of NaN in city_match:', nan_counts['city_match'])

In [None]:
# Preview the first rows of the frame after imputation.
df.head()

In [None]:
# Hold out every row whose search date is 2018-01-27 as the test set; all the
# remaining rows form the training set. The date column itself is dropped from both.
is_test_day = df['search_date_pacific'] == '2018-01-27'
test_x = df[is_test_day].drop(columns=['search_date_pacific'])
train_x = df[~is_test_day].drop(columns=['search_date_pacific'])

# Sanity check: the two partitions together should account for every row.
x, y = len(test_x), len(train_x)
print('before separated:', len(df))
print('after separated:', x + y)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
#from imblearn.over_sampling import SMOTE

# The train/test partition comes from the date-based split in an earlier cell;
# the random split below is kept only as a reference to the alternative.
# train_x, test_x = train_test_split(df, test_size=0.2, random_state=4211)

# Separate the target from the features (pop removes 'apply' from each frame).
train_y = train_x.pop('apply')
test_y = test_x.pop('apply')

# Normalisation using the sklearn StandardScaler (mean 0, sd 1) for the
# continuous features; every other column is passed through unchanged.
transform_columns = ['title_proximity_tfidf', 'description_proximity_tfidf', 'main_query_tfidf', 'query_jl_score', 'query_title_score', 'job_age_days']
ct = ColumnTransformer(
        remainder='passthrough',
        transformers=[('std', StandardScaler(), transform_columns)])
# Fit the scaler on the training rows only and reuse those statistics for the
# test rows. Re-fitting on the test set would standardise it with its own
# mean/variance, so the model would be evaluated on features scaled differently
# from the ones it was trained on.
train_x = ct.fit_transform(train_x)
test_x = ct.transform(test_x)

# ColumnTransformer outputs the transformed columns first and appends the
# passthrough columns on the right, so city_match ends up as the LAST column.
x_columns = transform_columns + ['city_match']
train_x = pd.DataFrame(train_x, columns=x_columns)
test_x = pd.DataFrame(test_x, columns=x_columns)

# Undersample the training set to reduce the class imbalance (the test set is
# left untouched). SMOTE oversampling was also tried and made little difference:
#train_x, train_y = SMOTE(sampling_strategy=0.2, random_state=0).fit_resample(train_x, train_y)
train_x, train_y = RandomUnderSampler(random_state=0).fit_resample(train_x, train_y)

# Summary statistics of the resampled training features.
train_x.describe()

In [None]:
# Preview the first rows of the scaled, undersampled training features.
train_x.head()

In [None]:
def report(clf):
    """Evaluate a fitted classifier on the notebook-level hold-out set.

    Prints the classification report for ``clf``'s predictions on ``test_x``
    against ``test_y`` and draws the matching confusion matrix.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    """
    print(classification_report(test_y, clf.predict(test_x)))

    # NOTE(review): plot_confusion_matrix is deprecated in recent scikit-learn
    # releases -- confirm it exists in the version this notebook is pinned to.
    matrix_display = plot_confusion_matrix(clf, test_x, test_y)
    matrix_display.figure_.suptitle("Confusion Matrix")

In [None]:
# Baseline neural network: two hidden layers (80 and 60 units) with early stopping.
# NOTE(review): per the scikit-learn docs learning_rate only applies to
# solver='sgd'; no solver is set here -- confirm the option has any effect.
mlp = MLPClassifier(
    hidden_layer_sizes=(80, 60),
    learning_rate='adaptive',
    early_stopping=True,
    random_state=4211,
)

# Time the fit in wall-clock seconds.
start = time.time()
mlp.fit(train_x, train_y)
elapsed = time.time() - start
print(elapsed)

report(mlp)  # city_match not normalised, undersampling only

In [None]:
from sklearn.model_selection import GridSearchCV  # this thing cpu intensive.
# Hyper-parameter grid for the MLP. Every key must be a `name: [candidates]`
# pair and the entries must be comma-separated, otherwise the cell does not parse.
# NOTE(review): per the scikit-learn docs several of these options are only
# used by specific solvers (e.g. momentum / learning_rate with 'sgd'), so part
# of the grid re-fits equivalent models -- consider one sub-grid per solver.
tuned_parameters = {
    'hidden_layer_sizes': [(120,), (80, 60,), (60, 40, 20,)],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'learning_rate': ['invscaling', 'adaptive'],
    'learning_rate_init': [0.001, 0.002, 0.005],
    'momentum': [0.9, 0.8],
    'early_stopping': [True],
    'random_state': [4211],
}

# Exhaustive cross-validated search over the grid using all CPU cores, then
# evaluate the search object on the hold-out set.
clf = GridSearchCV(MLPClassifier(), tuned_parameters, n_jobs=-1, verbose=2)
clf.fit(train_x, train_y)
report(clf)

In [None]:
from sklearn.svm import LinearSVC
# Baseline linear SVM with default hyper-parameters.
clf = LinearSVC()

# Time the fit in wall-clock seconds.
start = time.time()
clf.fit(train_x, train_y)
elapsed = time.time() - start
print(elapsed)

report(clf)  # compared with the NN above: more true negatives, fewer true positives

In [None]:
from sklearn.model_selection import GridSearchCV  # this thing cpu intensive.
# Candidate values for the regularisation strength C.
# np.arange(0.01, 100, 50) produces only two candidates: 0.01 and 50.01.
# NOTE(review): a step of 50 looks accidental -- confirm whether a denser
# (e.g. log-spaced) grid was intended.
tuned_parameters = dict(C=np.arange(0.01, 100, 50))

# Cross-validated search over C using all CPU cores, then evaluate the search
# object on the hold-out set.
clf = GridSearchCV(
    estimator=LinearSVC(),
    param_grid=tuned_parameters,
    n_jobs=-1,
    verbose=2,
)
clf.fit(train_x, train_y)
report(clf)  # not much difference