# Deep Learning

### FUGLE: per-feature scores, highest first
```
                 Specs         Score
10          occupation  25743.328624
2           incomeYear   1523.175415
3          totalWealth    592.795473
12         lead_job_id    318.785655
9               salary    291.957395
11  hasOtherComAccount    285.595706
4        expInvestment    265.252679
8          quotaCredit    118.790131
0                  age     71.436991
7           srcCapital     52.197881
6        frqInvestment     20.931034
1             eduLevel     15.665915
5        yrsInvestment      1.380907
```

### ESUN: per-feature scores, highest first
```
                 Specs        Score
4        expInvestment  3679.967411
2           incomeYear  3358.210131
3          totalWealth  3096.975501
11  hasOtherComAccount  2696.629454
8          quotaCredit   678.532252
10          occupation   354.281921
0                  age   304.563784
12         lead_job_id   175.972638
6        frqInvestment    86.063416
1             eduLevel    66.027508
9               salary    24.979539
7           srcCapital    12.214207
5        yrsInvestment     3.872606
```

In [1]:
import numpy as np
import pandas as pd
import math
from IPython.display import display
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.neighbors import NearestNeighbors
import numpy as np
from scipy import stats
from sklearn.metrics import precision_recall_fscore_support as score

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import torch
import torch.nn.functional as F
from torch import nn

from sklearn.preprocessing import OneHotEncoder
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, WeightedRandomSampler
from torch import optim


In [7]:
def make_quota(a, b):
    """Return the smaller of ``a`` and ``b``; if ``b`` is NaN, return ``a`` unchanged."""
    return a if math.isnan(b) else min(a, b)

def to_class(x):
    '''
    Map an amount ``x`` to one of four class indices:

    0: x < 100,000
    1: 100,000 <= x < 300,000
    2: 300,000 <= x < 500,000
    3: everything else (x >= 500,000, and also NaN, for which every
       comparison below is false)

    NOTE(review): the original comment described the lower bounds as
    exclusive (e.g. "100k-300k, not including 100k"), but the comparisons
    treat them as inclusive: exactly 100,000 maps to class 1. Confirm
    which convention is intended.
    '''
    # Upper bounds (exclusive) of classes 0, 1 and 2, in ascending order.
    upper_bounds = (1E5, 3E5, 5E5)
    for label, bound in enumerate(upper_bounds):
        if x < bound:
            return label
    return 3

# feature exploration
def plot_corr(df):
    """Show the correlation matrix of the numeric columns of ``df`` as a heat map.

    Args:
        df: pandas DataFrame; non-numeric columns are ignored.
    """
    # Select the numeric columns once and use them for BOTH the matrix and
    # the tick labels, so the two can never disagree (and so that corr() is
    # never asked to correlate non-numeric columns).
    numeric_df = df.select_dtypes(['number'])
    labels = numeric_df.columns
    positions = range(len(labels))

    f = plt.figure(figsize=(10, 8))
    plt.matshow(numeric_df.corr(), fignum=f.number)
    plt.xticks(positions, labels, fontsize=14, rotation=90)
    plt.yticks(positions, labels, fontsize=14)
    cb = plt.colorbar()
    cb.ax.tick_params(labelsize=10)
    plt.title('Correlation Matrix', fontsize=10)
    plt.show()
    plt.close()

def get_weights(df_x, df_y):
    """Return the natural log of the chi-squared score of every column of ``df_x``.

    Args:
        df_x: feature DataFrame; one score is produced per column, in
            column order.
        df_y: target labels aligned with the rows of ``df_x``.

    Returns:
        1-D numpy array of ``log(score)`` values.
    """
    n_features = len(df_x.columns)

    # k equals the number of columns, so no feature is discarded; the
    # selector is only used to obtain the per-feature scores.
    selector = SelectKBest(score_func=chi2, k=n_features)
    selector.fit(df_x.values, df_y.values)
    score_frame = pd.DataFrame(selector.scores_)

    # To inspect the ranking, pair ``df_x.columns`` with ``score_frame``
    # (e.g. columns 'Specs' / 'Score') and print ``nlargest`` on 'Score'.

    return np.log(score_frame[0].values)

def get_distance(x: np.array, y: np.array, l2=True, weights=None) -> float:
    """Return a (squared) distance between the instances ``x`` and ``y``.

    No square root is taken in either mode.

    Args:
        x, y: the two instances to compare.
        l2: when true, return the sum of squared element-wise differences
            over all positions. When false, use a mixed measure: squared
            differences over the first ``len(num_features)`` positions,
            plus a penalty for every mismatch among the following
            ``len(cat_features)`` positions. ``num_features`` and
            ``cat_features`` are read from module scope.
        weights: optional per-position weights. In the mixed measure a
            categorical mismatch costs ``weights[i]`` (or 1 when no
            weights are given).
    """
    if l2:
        squared_diff = (x - y)**2
        if weights is None:
            return np.sum(squared_diff)
        return np.dot(squared_diff, np.array(weights))

    numeric_end = len(num_features)
    categorical_end = numeric_end + len(cat_features)
    weighted = weights is not None

    total = 0

    # Numeric part: (weighted) squared differences.
    for i in range(numeric_end):
        term = (float(x[i]) - float(y[i]))**2
        total += term * weights[i] if weighted else term

    # Categorical part: fixed penalty per mismatching position.
    for i in range(numeric_end, categorical_end):
        if x[i] != y[i]:
            total += weights[i] if weighted else 1

    return total

def predict(test_x, num, nbrs):
    """Predict labels for the first ``num`` rows of ``test_x`` by neighbour vote.

    For each query row the labels of its neighbours are looked up
    positionally in the module-level ``train_y`` and the most frequent
    label is returned.

    Args:
        test_x: DataFrame of query rows.
        num: number of leading rows of ``test_x`` to predict.
        nbrs: object whose ``kneighbors`` returns a pair with the neighbour
            indices as second item (presumably a fitted sklearn
            ``NearestNeighbors``; confirm against the caller).

    Returns:
        The per-row modal label, with singleton dimensions squeezed out.
    """
    global train_y
    queries = test_x.iloc[:num]
    neighbor_idx = nbrs.kneighbors(queries)[1]
    neighbor_labels = [train_y.iloc[row].values for row in neighbor_idx]
    votes = stats.mode(neighbor_labels, axis=1)
    return votes.mode.squeeze()

In [8]:
# Data preparation: load the feature table, build the labels, filter rows,
# scale the numeric columns and one-hot encode the categorical ones.
# Results used later: df_x (features), df_y (class label), n_input.
df_all = pd.read_csv('./data/ooa_features_v1.csv')

# Full candidate feature list, kept for reference.
# selected_features =[
#     'source',
#     'age',
#     'occupation',
#     'hasOtherComAccount',
#     'eduLevel',
#     'isReject',
#     'incomeYear',
#     'totalWealth',
#     'expInvestment',
#     'yrsInvestment',
#     'frqInvestment',
#     'srcCapital',
#     'quotaCredit',
#     'quota_now',
#     'quota_now_elec',
#     'salary',
#     'lead_job_id'
# ]

# FUGLE (the active selection)
selected_features =[
    'source',
    # 'age',
    'occupation',
    'hasOtherComAccount',
    # 'eduLevel',
    'isReject',
    'incomeYear',
    'totalWealth',
    'expInvestment',
    # 'yrsInvestment',
    # 'frqInvestment',
    # 'srcCapital',
    # 'quotaCredit',
    'quota_now',
    'quota_now_elec',
    'salary',
    'lead_job_id'
]

# ESUN (alternative selection, currently disabled)
# selected_features =[
#     'source',
#     'age',
#     'occupation',
#     'hasOtherComAccount',
#     # 'eduLevel',
#     'isReject',
#     'incomeYear',
#     'totalWealth',
#     'expInvestment',
#     # 'yrsInvestment',
#     # 'frqInvestment',
#     # 'srcCapital',
#     'quotaCredit',
#     'quota_now',
#     'quota_now_elec',
#     # 'salary',
#     # 'lead_job_id'
# ]

# select features; keep only rows whose occupation code is at most 33
df_all  = df_all[selected_features]
df_all = df_all[df_all['occupation'] <= 33]

# define the label to predict
# y_num: min(quota_now, quota_now_elec), or quota_now when quota_now_elec is NaN
df_all['y_num'] = df_all[['quota_now', 'quota_now_elec']].apply(lambda x: make_quota(*x), axis=1)
# discard rows whose quota_now exceeds 1,000,000
df_all = df_all[df_all['quota_now']<=1e6]
# NOTE(review): y_cat is derived from 'quota_now', not from the 'y_num'
# column computed just above; confirm this is intentional.
df_all['y_cat'] = df_all['quota_now'].apply(lambda x: to_class(x))
df_all = df_all.drop(['quota_now', 'quota_now_elec'], axis=1)

# drop: isReject (keep only rows with isReject == 0, then remove the column)
df_all = df_all[df_all['isReject']==0]
df_all = df_all.drop('isReject', axis=1)

# drop source Anue; encode the remaining sources as integers
df_all = df_all[df_all['source'] != 'Anue']
df_all = df_all.replace({"source": {'FUGLE': 0, '玉證': 1}})

# keep only source 0 (FUGLE); the column is then constant, so remove it
df_all = df_all[df_all['source'] == 0]
df_all = df_all.drop('source', axis=1)

# take the absolute value of salary to avoid negative values
df_all['salary'] = df_all['salary'].apply(lambda x: abs(x))

# remove every row that still contains a missing value
df_all = df_all.dropna()
# display(df_all.head())

# normalization
# y_num and y_cat were appended last, so the final two columns are the
# labels: everything before them is a feature, and the last one is y_cat.
df_x_raw = df_all.iloc[:, :-2]
df_y = df_all.iloc[:, -1]
# cat_features = ['source', 'occupation', 'hasOtherComAccount', 'lead_job_id']
cat_features = ['occupation', 'hasOtherComAccount', 'lead_job_id']
num_features = [col for col in df_x_raw.columns if col not in cat_features]
# scale every numeric column by its own maximum (computed over all rows,
# i.e. before any train/test split)
df_x_num = df_x_raw[num_features].apply(lambda x: x/x.max(), axis=0)
df_x_cat = df_x_raw[cat_features]
# numeric columns first, categorical columns after them
df_x = pd.concat([df_x_num, df_x_cat], axis=1)
# reset to a 0..n-1 index so the positional join with the encoder output
# below lines up row by row
df_x.reset_index(drop=True, inplace=True)
df_y.reset_index(drop=True, inplace=True)

# one-hot encoding the categorical data
encoder = OneHotEncoder(handle_unknown='ignore')
# the encoded columns get default integer names (0, 1, 2, ...)
encoder_df = pd.DataFrame(encoder.fit_transform(df_x[cat_features]).toarray())
df_x = df_x.join(encoder_df)
# the original categorical columns are replaced by their one-hot expansion
df_x.drop(cat_features, axis=1, inplace=True)

display(df_x.head())
display(df_y.head())
# number of model inputs = scaled numeric columns + one-hot columns
n_input = len(df_x.columns)

Unnamed: 0,incomeYear,totalWealth,expInvestment,salary,0,1,2,3,4,5,...,32,33,34,35,36,37,38,39,40,41
0,0.5,0.5,0.0,0.309462,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.5,0.5,1.0,0.087075,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,0.378655,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.5,0.5,0.75,0.378655,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.5,1.0,0.194422,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


0    3
1    3
2    3
3    3
4    3
Name: y_cat, dtype: int64

In [9]:
class MyNetwork(nn.Module):
    """Fully connected classifier: n_input -> 128 -> 128 -> 128 -> 64 -> n_classes.

    Every hidden layer is followed by a ReLU; the last layer is linear, so
    the network outputs raw class scores (logits).

    Args:
        n_input: number of input features per sample.
        n_classes: number of output scores per sample. Defaults to 4, the
            width that was previously hard-coded.
    """

    def __init__(self, n_input, n_classes=4):
        super().__init__()
        # Layer order is kept so parameter names (fc.0, fc.2, ...) are unchanged.
        self.fc = nn.Sequential(
            nn.Linear(n_input, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, n_classes),
        )

    def forward(self, x):
        """Return the class scores for the batch ``x``."""
        return self.fc(x)

In [10]:
class MyDataset(Dataset):
    """In-memory dataset pairing feature rows with integer labels.

    Args:
        df_x: pandas object whose ``.values`` holds the feature matrix;
            stored as a float32 tensor in ``self.x``.
        df_y: pandas object whose ``.values`` holds the labels; stored as
            an int64 tensor in ``self.y``.
    """

    def __init__(self, df_x, df_y):
        self.x = torch.tensor(df_x.values, dtype=torch.float)
        self.y = torch.tensor(df_y.values, dtype=torch.int64)

    def __len__(self):
        # One sample per label.
        return self.y.size(0)

    def __getitem__(self, idx):
        features = self.x[idx]
        label = self.y[idx]
        return features, label

# Hold out 20% of the rows for validation; the seed is fixed so the split
# is reproducible.
train_x, test_x, train_y, test_y = train_test_split(df_x, df_y, test_size=0.2, random_state=42)
print(f'{len(train_x) = }')
print(f'{len(test_x) = }')

train_dataset = MyDataset(train_x, train_y)
valid_dataset = MyDataset(test_x, test_y)

# Inverse-frequency weight per class (total / class count), so that the
# weighted sampler draws every class about equally often.
# Computed as one vectorised division: writing floats item by item into the
# integer value_counts() Series relies on implicit dtype upcasting, which
# pandas has deprecated.
total_counts = len(train_y)
label_counter = total_counts / train_y.value_counts()
# One weight per training example, looked up from its label.
example_weights = train_y.map(label_counter).tolist()
sampler = WeightedRandomSampler(example_weights, total_counts)
train_loader = DataLoader(train_dataset, sampler=sampler, batch_size=64)
# Validation data is iterated in dataset order.
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)

len(train_x) = 49067
len(test_x) = 12267


In [11]:
def train(model, train_loader, optimizer, epoch):
    """Run one training epoch and print its accuracy and mean batch loss.

    Args:
        model: network to optimise; switched to training mode.
        train_loader: iterable yielding ``(inputs, targets)`` batches.
        optimizer: optimiser stepped once per batch.
        epoch: epoch number, used only in the printed message.

    Batches are moved to the module-level ``device``. The summary is
    printed without a trailing newline (it ends with a space), so that
    following output continues on the same line.
    """
    global device
    criterion = nn.CrossEntropyLoss()
    model.train()

    loss_sum = 0
    hits = 0
    seen = 0

    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        # Index of the highest score along dim 1 is the predicted class.
        predicted = logits.max(1)[1]
        seen += targets.size(0)
        hits += predicted.eq(targets).sum().item()

    # Mean of the per-batch losses (not weighted by batch size).
    mean_loss = loss_sum / len(train_loader)
    accuracy = hits / seen
    print(f'[epoch] {epoch}, [train acc] {accuracy:.2%}, [train loss] {mean_loss:.6f}', end=' ')

def valid(model, valid_loader, epoch):
    """Evaluate ``model`` on ``valid_loader`` and report the metrics.

    Prints the overall accuracy and mean batch loss, then displays a
    per-class table of precision, recall, F-score and support.

    Args:
        model: network to evaluate; switched to evaluation mode.
        valid_loader: iterable yielding ``(inputs, targets)`` batches.
        epoch: currently unused.

    Batches are moved to the module-level ``device``.
    """
    global device
    # Build the loss module once instead of once per batch.
    criterion = nn.CrossEntropyLoss()
    model.eval()
    valid_loss = 0
    n_correct = 0
    n_total = 0

    y_pred_list = []
    y_true_list = []

    with torch.no_grad():
        for x, y in valid_loader:
            x, y = x.to(device), y.to(device)
            output = model(x)
            valid_loss += criterion(output, y).item()
            _, y_pred = output.max(1)
            n_total += y.size(0)
            n_correct += y_pred.eq(y).sum().item()

            # Collect the targets from the same batches as the predictions,
            # so the two stay aligned whatever order the loader uses and the
            # dataset need not expose a ``.y`` attribute.
            y_pred_list.append(y_pred.detach().cpu().numpy())
            y_true_list.append(y.detach().cpu().numpy())

    # Mean of the per-batch losses (not weighted by batch size).
    valid_loss /= len(valid_loader)
    valid_acc = n_correct / n_total

    print(f'[valid acc] {valid_acc:.2%}, [valid loss] {valid_loss:.6f}')

    yp = np.concatenate(y_pred_list)
    yg = np.concatenate(y_true_list)

    # zero_division=0: a class with no predicted samples scores 0 rather
    # than triggering a warning.
    precision, recall, fscore, support = score(yg, yp, zero_division=0)
    res_df = pd.DataFrame({
        'precision' : precision,
        'recall' : recall,
        'fscore' : fscore,
        'support' : support
    })
    display(res_df)


model = MyNetwork(n_input)
# is_available must be CALLED: the bare function object is always truthy,
# which selected "cuda:0" even on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# 100 epochs, each followed by an evaluation on the validation set.
for i in range(100):
    train(model, train_loader, optimizer, i)
    valid(model, valid_loader, i)


RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x128 and 64x4)