In [971]:
import pandas as pd

data1 = pd.read_csv('../../Data/linear-regression-data/data1.csv')
data2 = pd.read_csv('../../Data/linear-regression-data/data2.csv')


In [972]:
data1 = data1.iloc[:10000,:]
data2 = data2.iloc[:10000,:]

In [973]:
data1.columns

Index(['idcard', 'deprice_level', 'tag'], dtype='object')

In [974]:
data2.columns

Index(['idcard', 'gjj_status', 'dep_balance_level', 'dep_base_level',
       'loan_status', 'pay_base_level', 'insured_status',
       'series_pay_year_level', 'last_year_level'],
      dtype='object')

In [975]:
data1.shape[0]

10000

In [976]:
data2.shape[0]

10000

In [977]:
data1.dropna(inplace=True)
data1.drop_duplicates(subset=['idcard'])
data1.shape[0]

9996

In [978]:
data2.drop_duplicates(subset=['idcard'])
data2.shape[0]

10000

In [979]:
intersect = pd.merge(data2, data1, on='idcard', how='inner')

In [980]:
intersect.tail(10)

Unnamed: 0,idcard,gjj_status,dep_balance_level,dep_base_level,loan_status,pay_base_level,insured_status,series_pay_year_level,last_year_level,deprice_level,tag
9986,11010219880628455X,1,1,3,1,3,1,2,2,3,1.96
9987,110109198511114179,1,1,2,1,2,1,2,1,3,1.36
9988,110105197502107209,1,1,2,1,2,1,2,1,2,1.36
9989,110000195611217603,1,1,3,1,3,1,2,2,5,2.16
9990,110115196105074165,1,1,3,1,3,1,2,2,3,1.86
9991,110100197508117991,1,1,1,1,1,1,1,2,5,1.26
9992,110113197001189912,1,1,2,1,2,1,2,1,4,1.56
9993,110101195609142683,1,1,3,1,3,1,1,1,4,1.86
9994,110104199601044349,1,1,3,1,3,1,1,1,3,1.76
9995,110106197812146728,1,1,2,1,2,1,2,1,5,1.66


In [981]:
intersect.shape[0]

9996

In [982]:
import pprint as pp
pp.pprint(intersect.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9996 entries, 0 to 9995
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   idcard                 9996 non-null   object 
 1   gjj_status             9996 non-null   int64  
 2   dep_balance_level      9996 non-null   int64  
 3   dep_base_level         9996 non-null   int64  
 4   loan_status            9996 non-null   int64  
 5   pay_base_level         9996 non-null   int64  
 6   insured_status         9996 non-null   int64  
 7   series_pay_year_level  9996 non-null   int64  
 8   last_year_level        9996 non-null   int64  
 9   deprice_level          9996 non-null   int64  
 10  tag                    9996 non-null   float64
dtypes: float64(1), int64(9), object(1)
memory usage: 937.1+ KB
None


In [983]:
intersect_corr = intersect.iloc[:, 1:].corr()

In [984]:
target_col = intersect_corr['tag']
target_col

gjj_status               0.048944
dep_balance_level             NaN
dep_base_level           0.935970
loan_status             -0.052087
pay_base_level           0.930297
insured_status          -0.052087
series_pay_year_level    0.105726
last_year_level          0.121406
deprice_level            0.268581
tag                      1.000000
Name: tag, dtype: float64

In [985]:
feature = target_col[target_col > 0.1]
feature

dep_base_level           0.935970
pay_base_level           0.930297
series_pay_year_level    0.105726
last_year_level          0.121406
deprice_level            0.268581
tag                      1.000000
Name: tag, dtype: float64

In [986]:
feature_cols = feature.index.tolist()
feature_cols.pop()
feature_cols

['dep_base_level',
 'pay_base_level',
 'series_pay_year_level',
 'last_year_level',
 'deprice_level']

In [987]:
cols = list(intersect.columns) 
feats_selected = [cols.index(col) for col in feature_cols]  #获取该特征对应列索引编号，后续就可以用feats + feats_selected作为特征值
feats_selected

[3, 5, 7, 8, 9]

In [988]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
class MyDataset(Dataset):
    def __init__(self, mode='train'):
        self.mode = mode

        data = np.array(intersect.iloc[:, feats_selected]).astype(float)
        target = np.array(intersect.iloc[:, 10:]).astype(float).squeeze(1)
        print('data shape', data.shape)

            
        # Splitting training data into train & dev sets
        indices_tr, indices_dev = train_test_split([i for i in range(data.shape[0])], test_size = 0.3, random_state = 0)
        if mode == 'train':
            indices = indices_tr
        elif mode == 'dev':
            indices = indices_dev
            
        # Convert data into PyTorch tensors
        self.data = torch.FloatTensor(data[indices])
        self.target = torch.FloatTensor(target[indices])

        # Normalize features (you may remove this part to see what will happen)
        # eps = 1e-6
        self.data[:,:] = \
            (self.data[:,:] - self.data[:,:].mean(dim=0, keepdim=True)) \
            / (self.data[:,:].max(dim=0, keepdim=True) + self.data[:,:].min(dim=0, keepdim=True))

        self.dim = self.data.shape[1]

        print('Finished reading the {} set of  Dataset ({} samples found, each dim = {})'
              .format(mode, len(self.data), self.dim))

    def __getitem__(self, index):
        # Returns one sample at a time
        return self.data[index], self.target[index]

    def __len__(self):
        # Returns the size of the dataset
        return len(self.data)

In [989]:
class NeuralNet(nn.Module):
    def __init__(self, input_dim):
        super(NeuralNet, self).__init__()
        self.net = nn.Linear(input_dim, 1)
        # self.net = nn.Sequential(
        #     nn.Linear(input_dim, 64),
        #     nn.ReLU(),  
        #     nn.Linear(64, 32),
        #     nn.ReLU(), 
        #     nn.Linear(32, 1),
        # )

        # Mean squared error loss
        self.criterion = nn.MSELoss(reduction='mean')

    def forward(self, x):
        ''' Given input of size (batch_size x input_dim), compute output of the network '''
        return self.net(x).squeeze(1)

    def cal_loss(self, pred, target):
        ''' Calculate loss '''
        return self.criterion(pred, target)
        # eps = 1e-6
        # l2_reg = 0
        # alpha = 0.0001
        # for name, w in self.linear.named_parameters():
        #     if 'weight'  in name:
        #         l2_reg += alpha * torch.square(torch.norm(w, p = 2).to(device))
        # return torch.sqrt(self.criterion(pred, target)+eps)+l2_reg

In [990]:
def accu(dv_set, model, device):
    model.eval()                                # set model to evalutation mode
    count = 0
    samples = 0
    for x, y in dv_set:                         # iterate through the dataloader
        samples = samples + dv_set.batch_size
        x, y = x.to(device), y.to(device)       # move data to device (cpu/cuda)
        with torch.no_grad():                   # disable gradient calculation
            pred = model(x)                     # forward pass (compute output)
            res = np.abs(pred.numpy()-y.numpy())
            c = np.sum(res < 0.01)
            count = count + c
    return count/samples

In [991]:
def train(tr_set, dv_set, model, config, device):
    ''' DNN training '''

    n_epochs = config['n_epochs']  # Maximum number of epochs

    # Setup optimizer
    optimizer = getattr(torch.optim, config['optimizer'])(
        model.parameters(), **config['optim_hparas'])

    min_mse = 1000.
    loss_record = {'train': [], 'dev': []}      # for recording training loss
    early_stop_cnt = 0
    epoch = 0
    while epoch < n_epochs:
        model.train()                           # set model to training mode
        for x, y in tr_set:                     # iterate through the dataloader
            optimizer.zero_grad()               # set gradient to zero
            x, y = x.to(device), y.to(device)   # move data to device (cpu/cuda)
            pred = model(x)                     # forward pass (compute output)
            mse_loss = model.cal_loss(pred, y)  # compute loss
            mse_loss.backward()                 # compute gradient (backpropagation)
            optimizer.step()                    # update model with optimizer
            loss_record['train'].append(mse_loss.detach().cpu().item())

        # After each epoch, test your model on the validation (development) set.
        dev_mse = dev(dv_set, model, device)
        if dev_mse < min_mse:
            # Save model if your model improved
            min_mse = dev_mse
            cur_accu = accu(dv_set, model, device)
            print('Saving model (epoch = {:4d}, loss = {:.4f}), accu = {:.4f}'
                .format(epoch + 1, min_mse, cur_accu))
            torch.save(model.state_dict(), config['save_path'])  # Save model to specified path
            early_stop_cnt = 0
        else:
            early_stop_cnt += 1

        epoch += 1
        loss_record['dev'].append(dev_mse)
        if early_stop_cnt > config['early_stop']:
            # Stop training if your model stops improving for "config['early_stop']" epochs.
            break

    print('Finished training after {} epochs'.format(epoch))
    return min_mse, loss_record

In [992]:
def dev(dv_set, model, device):
    model.eval()                                # set model to evalutation mode
    total_loss = 0
    for x, y in dv_set:                         # iterate through the dataloader
        x, y = x.to(device), y.to(device)       # move data to device (cpu/cuda)
        with torch.no_grad():                   # disable gradient calculation
            pred = model(x)                     # forward pass (compute output)
            mse_loss = model.cal_loss(pred, y)  # compute loss
        total_loss += mse_loss.detach().cpu().item() * len(x)  # accumulate loss
    total_loss = total_loss / len(dv_set.dataset)              # compute averaged loss

    return total_loss

In [993]:
def get_device():
    ''' Get device (if GPU is available, use GPU) '''
    return 'cuda' if torch.cuda.is_available() else 'cpu'

In [994]:
def prep_dataloader(mode, batch_size, n_jobs=0):
    ''' Generates a dataset, then is put into a dataloader. '''
    dataset = MyDataset(mode=mode)  # Construct dataset
    dataloader = DataLoader(
        dataset, batch_size,
        shuffle=(mode == 'train'), drop_last=False,
        num_workers=n_jobs, pin_memory=True)                            # Construct dataloader
    return dataloader

In [966]:
myseed = 42069  # set a random seed for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)

device = get_device()                 # get the current available device ('cpu' or 'cuda')
os.makedirs('models', exist_ok=True)  # The trained model will be saved to ./models/

# TODO: How to tune these hyper-parameters to improve your model's performance?
config = {
    'n_epochs': 300,                # maximum number of epochs
    'batch_size': 1000,               # mini-batch size for dataloader
    'optimizer': 'Adam',              # optimization algorithm (optimizer in torch.optim)
    'optim_hparas': {                # hyper-parameters for the optimizer (depends on which optimizer you are using)
        'lr': 0.001,                 # learning rate of SGD
        # 'momentum': 0.9              # momentum for SGD
        'betas': [0.9, 0.99]
    },
    'early_stop': 200,               # early stopping epochs (the number epochs since your model's last improvement)
    'save_path': 'models/model.pth'  # your model will be saved here
}

tr_set = prep_dataloader('train', config['batch_size'])
dv_set = prep_dataloader('dev', config['batch_size'])

data shape (9996, 5)
Finished reading the train set of  Dataset (6997 samples found, each dim = 5)
data shape (9996, 5)
Finished reading the dev set of  Dataset (2999 samples found, each dim = 5)


In [967]:
model = NeuralNet(tr_set.dataset.dim).to(device)  # Construct model and move to device

In [968]:
model_loss, model_loss_record = train(tr_set, dv_set, model, config, device)

Saving model (epoch =    1, loss = 1.3998), accu = 0.0000
Saving model (epoch =    2, loss = 1.3799), accu = 0.0000
Saving model (epoch =    3, loss = 1.3604), accu = 0.0000
Saving model (epoch =    4, loss = 1.3413), accu = 0.0000
Saving model (epoch =    5, loss = 1.3228), accu = 0.0000
Saving model (epoch =    6, loss = 1.3047), accu = 0.0000
Saving model (epoch =    7, loss = 1.2869), accu = 0.0000
Saving model (epoch =    8, loss = 1.2695), accu = 0.0000
Saving model (epoch =    9, loss = 1.2524), accu = 0.0000
Saving model (epoch =   10, loss = 1.2356), accu = 0.0000
Saving model (epoch =   11, loss = 1.2191), accu = 0.0000
Saving model (epoch =   12, loss = 1.2030), accu = 0.0000
Saving model (epoch =   13, loss = 1.1870), accu = 0.0000
Saving model (epoch =   14, loss = 1.1713), accu = 0.0000
Saving model (epoch =   15, loss = 1.1559), accu = 0.0000
Saving model (epoch =   16, loss = 1.1407), accu = 0.0000
Saving model (epoch =   17, loss = 1.1257), accu = 0.0000
Saving model (

In [969]:
#model.net.weight

In [970]:
accu = accu(dv_set, model, device)
accu

0.4713333333333333