### Try List

* ~~Try `faiss`~~

* Try different weights
    ```python
    weights = {i : 1 for i in df_x.columns}
    weights
    ```

* Try splitting jobs
    ```python
    df_esun = df_all[df_all['source'] == '玉證']
    df_fugle = df_all[df_all['source'] == 'FUGLE']

    df_esun = df_esun.drop('source', axis=1)
    df_fugle = df_fugle.drop('source', axis=1)
    ```
* ~~Try the `KDTree` and `BallTree` classes~~

### FUGLE
```
                 Specs         Score
10          occupation  25743.328624
2           incomeYear   1523.175415
3          totalWealth    592.795473
12         lead_job_id    318.785655
9               salary    291.957395
11  hasOtherComAccount    285.595706
4        expInvestment    265.252679
8          quotaCredit    118.790131
0                  age     71.436991
7           srcCapital     52.197881
6        frqInvestment     20.931034
1             eduLevel     15.665915
5        yrsInvestment      1.380907
```

### ESUN
```
                 Specs        Score
4        expInvestment  3679.967411
2           incomeYear  3358.210131
3          totalWealth  3096.975501
11  hasOtherComAccount  2696.629454
8          quotaCredit   678.532252
10          occupation   354.281921
0                  age   304.563784
12         lead_job_id   175.972638
6        frqInvestment    86.063416
1             eduLevel    66.027508
9               salary    24.979539
7           srcCapital    12.214207
5        yrsInvestment     3.872606
```

In [72]:
import numpy as np
import pandas as pd
import math
from IPython.display import display
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.neighbors import NearestNeighbors
import numpy as np
from scipy import stats
from sklearn.metrics import precision_recall_fscore_support as score

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [73]:
def make_quota(a, b):
    """Return the smaller of ``a`` and ``b``, or ``a`` when ``b`` is NaN.

    Only ``b`` is checked for NaN; ``a`` is passed to ``min`` unchanged.
    """
    return a if math.isnan(b) else min(a, b)

def to_class(x):
    """Map an amount ``x`` to a class index from 0 to 3.

    Classes as implemented:
        0: x < 100,000
        1: 100,000 <= x < 300,000
        2: 300,000 <= x < 500,000
        3: everything else (x >= 500,000)

    NOTE(review): the original note described the buckets as 0-100k,
    100k-300k (excluding 100k), 300k-500k (excluding 300k) and 500k-1M
    (excluding 500k), i.e. with an exclusive lower bound. The code treats
    the lower bound as inclusive -- confirm which one is intended.
    """
    # Upper bounds are exclusive; the first bound that x falls under decides
    # the class.
    for label, upper in enumerate((1E5, 3E5, 5E5)):
        if x < upper:
            return label
    return 3

# feature exploration
def plot_corr(df):
    """Show the correlation matrix of the numeric columns of ``df``.

    The numeric columns are selected once and used both for the matrix and
    for the axis tick labels, so the labels always line up with the plotted
    cells. Non-numeric columns are ignored.

    Args:
        df: pandas DataFrame to explore.

    Returns:
        None. The figure is shown and then closed.
    """
    numeric = df.select_dtypes(['number'])
    labels = numeric.columns
    positions = range(len(labels))

    f = plt.figure(figsize=(10, 8))
    # Correlate only the numeric subset: calling corr() on a frame that still
    # holds non-numeric columns can fail, and would not match the tick labels.
    plt.matshow(numeric.corr(), fignum=f.number)
    plt.xticks(positions, labels, fontsize=14, rotation=90)
    plt.yticks(positions, labels, fontsize=14)
    cb = plt.colorbar()
    cb.ax.tick_params(labelsize=10)
    plt.title('Correlation Matrix', fontsize=10)
    plt.show()
    plt.close()

def get_weights(df_x, df_y):
    """Return one weight per column of ``df_x``: the natural log of its chi2 score.

    A ``SelectKBest`` with ``chi2`` as the score function is fitted on all
    columns (``k`` equals the number of columns), so it is used purely to
    obtain the per-feature scores, not to drop any feature.

    Args:
        df_x: feature DataFrame.
        df_y: label Series (or DataFrame) aligned with ``df_x``.

    Returns:
        1-D numpy array of ``log(score)`` values, in ``df_x`` column order.

    NOTE(review): a score below 1 yields a negative weight and a score of 0
    yields ``-inf`` -- confirm the scores are always comfortably above 1.
    """
    n_features = len(df_x.columns)
    selector = SelectKBest(score_func=chi2, k=n_features)
    selector.fit(df_x.values, df_y.values)

    # To inspect the ranking, uncomment:
    # feature_scores = pd.DataFrame({'Specs': df_x.columns, 'Score': selector.scores_})
    # print(feature_scores.nlargest(n_features, 'Score'))

    return np.log(np.asarray(selector.scores_))

def get_distance(x: np.ndarray, y: np.ndarray, weights=None) -> float:
    """Compute a mixed numeric/categorical dissimilarity between ``x`` and ``y``.

    The first ``len(num_features)`` positions are treated as numeric and
    contribute their squared difference. The next ``len(cat_features)``
    positions are treated as categorical and contribute a fixed penalty when
    the two values differ. ``num_features`` and ``cat_features`` are read
    from module scope.

    Args:
        x: first instance, indexable by position.
        y: second instance, same layout as ``x``.
        weights: optional per-position multipliers indexed like ``x``. When
            omitted, every position has weight 1.

    Returns:
        The accumulated (weighted) sum. No square root is taken.

    NOTE(review): because the sum is not rooted, this is not guaranteed to
    satisfy the triangle inequality -- confirm that is acceptable for the
    tree-based neighbor search that uses it.
    """
    n_num = len(num_features)
    n_cat = len(cat_features)

    res = 0

    # Numeric part: squared difference, scaled by the weight if one is given.
    for i in range(n_num):
        term = (float(x[i]) - float(y[i]))**2
        res += term if weights is None else term * weights[i]

    # Categorical part: a mismatch costs the position's weight (1 by default).
    for i in range(n_num, n_num + n_cat):
        if x[i] != y[i]:
            res += 1 if weights is None else weights[i]

    return res

def predict(test_x, num, nbrs):
    """Predict a label for the first ``num`` rows of ``test_x`` by neighbor vote.

    For each query row, the labels of its nearest training rows are looked up
    in the module-level ``train_y`` and the most frequent label is returned.

    Args:
        test_x: DataFrame of query rows; only ``test_x.iloc[:num]`` is used.
        num: number of leading rows to predict.
        nbrs: fitted neighbor model exposing ``kneighbors``; the second item
            of its result is used as positional indices into ``train_y``.

    Returns:
        1-D numpy array with one predicted label per query row. The result
        stays 1-D even for a single query row.
    """
    _, neighbor_idx = nbrs.kneighbors(test_x.iloc[:num])
    neighbor_labels = [train_y.iloc[row].values for row in neighbor_idx]
    # ravel (rather than squeeze) keeps one entry per query row: squeeze would
    # collapse a single-row result into a 0-d array.
    return np.ravel(stats.mode(neighbor_labels, axis=1).mode)

In [75]:
# Load the raw feature table from disk.
df_all = pd.read_csv('./data/ooa_features_v1.csv')

# Full candidate column list.
# NOTE: this assignment is overwritten by the "ESUN" list further down
# before it is ever read, so it has no effect as written.
selected_features =[
    'source',
    'age',
    'occupation',
    'hasOtherComAccount',
    'eduLevel',
    'isReject',
    'incomeYear',
    'totalWealth',
    'expInvestment',
    'yrsInvestment',
    'frqInvestment',
    'srcCapital',
    'quotaCredit',
    'quota_now',
    'quota_now_elec',
    'salary',
    'lead_job_id'
]

# FUGLE
# Alternative column subset, currently disabled (kept commented out).
# selected_features =[
#     'source',
#     # 'age',
#     'occupation',
#     'hasOtherComAccount',
#     # 'eduLevel',
#     'isReject',
#     'incomeYear',
#     'totalWealth',
#     'expInvestment',
#     # 'yrsInvestment',
#     # 'frqInvestment',
#     # 'srcCapital',
#     # 'quotaCredit',
#     'quota_now',
#     'quota_now_elec',
#     'salary',
#     'lead_job_id'
# ]

# ESUN
# Active column list: this is the assignment that takes effect. Entries
# commented out inside the list are excluded from the selection.
# NOTE(review): the cleaning step later in this cell keeps only rows whose
# source is 'FUGLE' (encoded as 0), although this list is labelled ESUN --
# confirm the intended pairing of feature list and source.
selected_features =[
    'source',
    'age',
    'occupation',
    'hasOtherComAccount',
    # 'eduLevel',
    'isReject',
    'incomeYear',
    'totalWealth',
    'expInvestment',
    # 'yrsInvestment',
    # 'frqInvestment',
    # 'srcCapital',
    'quotaCredit',
    'quota_now',
    'quota_now_elec',
    # 'salary',
    # 'lead_job_id'
]

# select features
# Keep only the chosen columns, then drop rows whose occupation code is above 33.
df_all = df_all[selected_features]
df_all = df_all[df_all['occupation'] <= 33]

# define the label to predict
# New columns are added with DataFrame.assign (which returns a new frame)
# rather than item assignment on a filtered frame, so pandas does not raise
# SettingWithCopyWarning.
df_all = df_all.assign(
    y_num=df_all[['quota_now', 'quota_now_elec']].apply(lambda x: make_quota(*x), axis=1)
)
df_all = df_all[df_all['quota_now'] <= 1e6]
# NOTE(review): y_cat is derived from quota_now, not from y_num, and y_num is
# not used as the target in the visible code -- confirm this is intended.
df_all = df_all.assign(y_cat=df_all['quota_now'].apply(to_class))
df_all = df_all.drop(['quota_now', 'quota_now_elec'], axis=1)

# drop: isReject
# Keep only rows with isReject == 0; the column is then constant, so remove it.
df_all = df_all[df_all['isReject'] == 0]
df_all = df_all.drop('isReject', axis=1)

# drop source Anue
df_all = df_all[df_all['source'] != 'Anue']
df_all = df_all.replace({"source": {'FUGLE': 0, '玉證': 1}})

# Keep a single source (0 == 'FUGLE' per the mapping above) and drop the
# now-constant column.
df_all = df_all[df_all['source'] == 0]
df_all = df_all.drop('source', axis=1)

# take the absolute value of salary to avoid negative values
# df_all['salary'] = df_all['salary'].apply(lambda x: abs(x))

# Remove every row that still has a missing value.
df_all = df_all.dropna()
# display(df_all.head())

# normalization
# Split the frame by position: the last two columns are the labels that were
# appended earlier (y_num, y_cat); every column before them is a feature.
# The target is the last column.
df_y = df_all.iloc[:, -1]
df_x_raw = df_all.iloc[:, :-2]

# Categorical columns. Earlier variants are kept for reference:
# cat_features = ['source', 'occupation', 'hasOtherComAccount', 'lead_job_id']
# cat_features = ['occupation', 'hasOtherComAccount', 'lead_job_id']
cat_features = ['occupation', 'hasOtherComAccount',]
num_features = [name for name in df_x_raw.columns if name not in cat_features]

# Numeric columns are divided by their own column maximum; categorical
# columns are passed through unchanged.
df_x_cat = df_x_raw[cat_features]
df_x_num = df_x_raw[num_features].apply(lambda col: col / col.max(), axis=0)

# Numeric columns first, categorical columns last: get_distance relies on
# this ordering when it indexes features by position.
df_x = pd.concat([df_x_num, df_x_cat], axis=1)
display(df_x.head())
display(df_y.head())
# df_x.info()

Unnamed: 0,age,incomeYear,totalWealth,expInvestment,quotaCredit,occupation,hasOtherComAccount
0,0.533333,0.5,0.5,0.0,0.0,21.0,0
1,0.533333,0.5,0.5,1.0,0.0,18.0,1
3,0.466667,1.0,1.0,1.0,0.0,2.0,1
4,0.333333,0.5,0.5,0.75,0.5,2.0,1
7,0.466667,1.0,0.5,1.0,0.5,11.0,1


0    3
1    3
3    3
4    3
7    3
Name: y_cat, dtype: int64

In [69]:
# Hold out 1.5% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
split = train_test_split(df_x, df_y, test_size=0.015, random_state=42)
train_x, test_x, train_y, test_y = split

# Report the size of each partition.
print(f'{len(train_x) = }')
print(f'{len(test_x) = }')

len(train_x) = 66230
len(test_x) = 1009


In [70]:
# build model
# Per-feature weights for the custom distance.
# NOTE(review): the weights are computed from the full df_x / df_y, which
# includes the held-out rows -- confirm this is intended.
weights = get_weights(df_x, df_y)

# Ball-tree neighbor search driven by the custom weighted distance.
nbrs = NearestNeighbors(
    n_neighbors=2,
    algorithm='ball_tree',
    metric=get_distance,
    metric_params={'weights': weights},
)

# Index the training rows.
nbrs.fit(train_x)

NearestNeighbors(algorithm='ball_tree',
                 metric=<function get_distance at 0x7fcb5f008310>,
                 metric_params={'weights': array([5.71888054, 8.11916341, 8.03818127, 8.21065918, 6.51993201,
       5.87009298, 7.89975792])},
                 n_neighbors=2)

In [71]:
# Evaluate the neighbor model for several neighborhood sizes.
# The whole test set is scored each time, so the row count and the ground
# truth labels are computed once, before the loop.
n_test = len(test_x)
gt = list(test_y[:n_test].values)

# for n in [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22]:
for n in [1, 2, 3, 4, 5, 6, 7, 8]:
    print(f'{n = }')

    # Build and fit a fresh model for this neighborhood size.
    nbrs = NearestNeighbors(
        n_neighbors=n,
        algorithm='ball_tree',
        metric=get_distance,
        metric_params={'weights': weights},
    )
    nbrs.fit(train_x)

    print(n_test)
    pred = predict(test_x, n_test, nbrs)

    # Per-class precision / recall / F-score / support, one table row per class.
    precision, recall, fscore, support = score(gt, pred)
    res_df = pd.DataFrame(
        dict(
            zip(
                ('precision', 'recall', 'fscore', 'support'),
                (precision, recall, fscore, support),
            )
        )
    )

    display(res_df)

n = 1
1009


Unnamed: 0,precision,recall,fscore,support
0,0.122449,0.09375,0.106195,128
1,0.375676,0.42378,0.398281,328
2,0.15,0.121951,0.134529,123
3,0.519274,0.532558,0.525832,430


n = 2
1009


Unnamed: 0,precision,recall,fscore,support
0,0.2,0.367188,0.258953,128
1,0.356195,0.490854,0.412821,328
2,0.12963,0.113821,0.121212,123
3,0.598131,0.297674,0.397516,430


n = 3
1009


Unnamed: 0,precision,recall,fscore,support
0,0.194313,0.320312,0.241888,128
1,0.374684,0.45122,0.409405,328
2,0.217391,0.081301,0.118343,123
3,0.577031,0.47907,0.523507,430


n = 4
1009


Unnamed: 0,precision,recall,fscore,support
0,0.167939,0.171875,0.169884,128
1,0.36211,0.460366,0.405369,328
2,0.161765,0.089431,0.115183,123
3,0.552163,0.504651,0.527339,430


n = 5
1009


Unnamed: 0,precision,recall,fscore,support
0,0.216667,0.304688,0.253247,128
1,0.358396,0.435976,0.393398,328
2,0.163934,0.081301,0.108696,123
3,0.574526,0.493023,0.530663,430


n = 6
1009


Unnamed: 0,precision,recall,fscore,support
0,0.237805,0.304688,0.267123,128
1,0.374408,0.481707,0.421333,328
2,0.102564,0.03252,0.049383,123
3,0.583333,0.52093,0.550369,430


n = 7
1009


Unnamed: 0,precision,recall,fscore,support
0,0.25,0.226562,0.237705,128
1,0.382353,0.515244,0.438961,328
2,0.176471,0.04878,0.076433,123
3,0.580336,0.562791,0.571429,430


n = 8
1009


Unnamed: 0,precision,recall,fscore,support
0,0.285714,0.21875,0.247788,128
1,0.388646,0.542683,0.452926,328
2,0.121212,0.03252,0.051282,123
3,0.604762,0.590698,0.597647,430
