In [1]:
import pandas as pd
import scipy.io
import numpy as np
import time
from collections import defaultdict
import scipy.sparse as sp
import sys
import time
from utils import sparse_to_adjlist
from scipy.io import loadmat
import pickle

import torch
# from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM	
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv('Labelled Yelp Dataset.csv')

### 词向量训练---Bert

In [3]:
import os
os.environ['TRANSFORMERS_CACHE'] = ''

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows	
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
# Load pretrained model/tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
text_line = []
for text in data['Review']:
    text_line.append(text)

In [5]:
text_batch = tokenizer(text_line, padding=True, truncation=True, max_length=70, return_tensors='pt')

### 处理时间

In [6]:
def func(datestring):
    month, day, year = datestring.split('/')
    if len(month) == 1:
        month = '0' + month
    if len(day) == 1:
        day = '0' + day
    return year + month + day
data['Date'] = data.apply(lambda x:func(x['Date']), axis=1)

In [7]:
data = data.sort_values(by='Date')

In [8]:
data.head()

Unnamed: 0,User_id,Product_id,Rating,Date,Review,Label
172805,30262,468,4,20041020,Excellent Soup Dumplings. It's a must if you g...,1
200617,107234,510,4,20041102,One of the best hidden no-name neighborhood pl...,1
58364,19015,142,5,20041209,"Really lovely Italian food, very simple and we...",1
221175,116117,708,5,20050302,"Mario Batali at his best, this is my current f...",1
161710,59929,454,5,20050307,Best place for brunch if you can handle the wa...,1


### 比例要求

In [9]:
import numpy as np
dict(data['Label'].value_counts())

{1: 322167, -1: 36885}

### 划分数据集

#### 总量

In [10]:
data.shape[0]

359052

#### 取前30w作为训练集，剩余作为测试集，然后每1000行直接划分，得到300个mat数据集

In [11]:
data_train, data_test = data[:300000], data[300000:]
data_train.shape

(300000, 6)

### 构建数据集（mat格式）

#### 构建映射

In [12]:
def CooGet(name, subdata, number):
    TId = subdata['Date']

    dicts = {}
    def func(a, b):
        dicts[a] = b
    subdata.apply(lambda x:func(x['Date'], x[name]), axis=1)
    
    dictset = defaultdict(set)

    # def funcset(a, b):
    #     if dicts[a] == dicts[b]:
    #         dictset[a].add(b)
    #         dictset[b].add(a)

    # data.apply(lambda x:funcset(x['TransactionId'], x['TransactionId']), axis=1)
    total = subdata.shape[0]

    row = []
    col = []
    value = []
    for i in range(subdata.shape[0]):
        D = dicts[TId[i]]
        E = (D == subdata[name])
        L = list(np.where(E == True)[0])
        # print(len(L))
        R = [i]*len(L)
        V = [1]*len(L)
        C = L
        row.extend(R)
        col.extend(C)
        value.extend(V)
    
    return sp.coo_matrix((value, (row, col)), shape=(subdata.shape[0], subdata.shape[0]))

#### 处理构建标签

In [13]:
def labelGet(labeltrain):
    labelnew = []
    for l in labeltrain:
        if l == -1:
            labelnew.append(0)
        else:
            labelnew.append(1)
    return labelnew

#### 处理特征

In [34]:
def funcOfTextNumpy(text_batch, number):
    # wordembedding_list = np.empty((0, 70*768))
    # for num in range(number*1000, (number+1)*1000, 100):
    #     wordembedding_batch = model(text_batch['input_ids'][num:num+100])  
    #     t = wordembedding_batch[0].reshape(100, -1).detach().numpy().T
    #     tt = (t - np.min(t))/(np.max(t)-np.min(t))
    #     wordembedding_sub = tt.T
    #     print(wordembedding_sub.shape)
    #     wordembedding_list = np.concatenate((wordembedding_list, wordembedding_sub), axis=0)
    #     print('\r' + '{:.2f}'.format(num/10), end='', flush=True)
    # return np.array(wordembedding_list)
    wordembedding_batch = model(text_batch['input_ids'][number*100:(number+1)*100])  
    t = wordembedding_batch[0].detach().numpy()
    features_text = t
    return features_text

In [35]:
def funcOfstrToint(strnum):
    return int(strnum)

In [36]:
def featureGet(f_data):
    f_data['Date'] = f_data.apply(lambda x:funcOfstrToint(x['Date']), axis=1)
    f_data = f_data.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))) 
    features = np.array(f_data)
    features_coo = sp.coo_matrix(features)
    return features_coo

### 整合

In [37]:
def train(subdata, number):
    coo_user = CooGet(name='User_id', subdata=subdata, number=number)
    coo_product = CooGet(name='Product_id', subdata=subdata, number=number)
    
    subdata1 = subdata.drop(['User_id', 'Product_id', 'Review', 'Label'], axis=1)
    subdata1 = subdata1.reset_index(drop=True)
    label = subdata['Label']
    labelnew = labelGet(label)
    features_text = funcOfTextNumpy(text_batch, number)
    features_coo = featureGet(subdata1)
    scipy.io.savemat('matFolder\FraudDataset_{}.mat'.format(number),{'rur':coo_user, 'rpr':coo_product, "features":features_coo,"text":features_text, "label":labelnew})
    print('\r' + '{:.2f}'.format((number+1)/300 * 100), end='', flush=True)
    

In [42]:
for i in range(3):
    subdata = data_train[i*100 : (i+1)*100]
    subdata = subdata.reset_index(drop=True)
    train(subdata, i)
    

1.00

In [3]:


"""
	Read data and save the adjacency matrices to adjacency lists
"""


# if __name__ == "__main__":

	# prefix = ''

test = loadmat('matFolder/FraudDataset_0.mat')
test1 = loadmat('matFolder/FraudDataset_2.mat')

# sparse_to_adjlist(net_rur, prefix + 'yelp_rur_adjlists.pickle')
# sparse_to_adjlist(net_rtr, prefix + 'yelp_rtr_adjlists.pickle')
# sparse_to_adjlist(net_rsr, prefix + 'yelp_rsr_adjlists.pickle')
# sparse_to_adjlist(yelp_homo, prefix + 'yelp_homo_adjlists.pickle')

# amz = loadmat('Amazon.mat')
# net_upu = amz['net_upu']
# net_usu = amz['net_usu']
# net_uvu = amz['net_uvu']
# amz_homo = amz['homo']


# sparse_to_adjlist(net_upu, prefix + 'amz_upu_adjlists.pickle')
# sparse_to_adjlist(net_usu, prefix + 'amz_usu_adjlists.pickle')
# sparse_to_adjlist(net_uvu, prefix + 'amz_uvu_adjlists.pickle')
# sparse_to_adjlist(amz_homo, prefix + 'amz_homo_adjlists.pickle')


In [5]:
test1

{'__header__': b'MATLAB 5.0 MAT-file Platform: nt, Created on: Fri Jun 17 09:16:42 2022',
 '__version__': '1.0',
 '__globals__': [],
 'rur': <100x100 sparse matrix of type '<class 'numpy.int32'>'
 	with 968 stored elements in Compressed Sparse Column format>,
 'rpr': <100x100 sparse matrix of type '<class 'numpy.int32'>'
 	with 222 stored elements in Compressed Sparse Column format>,
 'features': <100x2 sparse matrix of type '<class 'numpy.float64'>'
 	with 194 stored elements in Compressed Sparse Column format>,
 'text': array([[[-0.16176604, -0.10799506, -0.28463998, ..., -0.05414923,
           0.6224663 ,  0.456627  ],
         [ 0.43537915, -0.9114257 , -0.02543993, ..., -0.21906473,
           0.7397299 ,  0.19465987],
         [ 0.09081059, -1.5069907 ,  0.06811708, ..., -0.23250109,
           0.11629332,  0.01919228],
         ...,
         [ 0.05886769, -0.66331977,  0.05971935, ..., -0.09282777,
           0.5895977 ,  0.09413048],
         [-0.3042255 , -1.2610446 , -0.0959

In [22]:
a = sparse_to_adjlist(test['rur'],'fsdfs')

In [40]:
H = torch.zeros(len(a), len(a))

In [64]:
H[0,[0,1,2]]=0

In [27]:
test['text'].todense().shape

(100, 53760)

In [66]:
list(a[0])

[0, 723]

In [67]:
for num in range(1000):
    H[num,list(a[num])]=1

In [68]:
H

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 1., 0.],
        [0., 0., 0.,  ..., 1., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]])

In [48]:
test['text']

array([[[-0.36804953, -0.31136674,  0.0919463 , ...,  0.18546656,
          1.0369701 , -0.23599096],
        [-0.30748078, -0.29594746, -0.93212026, ...,  0.4892611 ,
          0.82009196, -0.46048644],
        [-0.30938405,  0.09215397, -0.04307485, ...,  0.01188965,
          0.11065535, -0.31620052],
        ...,
        [-0.0892612 , -0.34226352,  0.96008897, ..., -0.50839555,
          0.25884992, -1.1857784 ],
        [-0.06850254, -0.3405885 ,  0.91618574, ..., -0.5493145 ,
          0.09982642, -1.2260445 ],
        [ 0.06488391, -0.27242485,  0.6762425 , ..., -0.49933496,
          0.09576812, -1.3604608 ]],

       [[ 0.30862743, -0.00673346,  0.38006878, ..., -0.36603507,
          0.49364388,  0.5412597 ],
        [-0.5156873 , -0.33566415,  0.55577534, ..., -0.939682  ,
          0.3632382 ,  0.8425638 ],
        [-0.43586975, -0.384663  ,  0.24438003, ..., -0.46155035,
          0.22115153,  0.31721544],
        ...,
        [ 0.3067816 ,  0.2894043 ,  1.0027261 , ...,  

In [50]:
test1['text']

array([[[-0.16176604, -0.10799506, -0.28463998, ..., -0.05414923,
          0.6224663 ,  0.456627  ],
        [ 0.43537915, -0.9114257 , -0.02543993, ..., -0.21906473,
          0.7397299 ,  0.19465987],
        [ 0.09081059, -1.5069907 ,  0.06811708, ..., -0.23250109,
          0.11629332,  0.01919228],
        ...,
        [ 0.05886769, -0.66331977,  0.05971935, ..., -0.09282777,
          0.5895977 ,  0.09413048],
        [-0.3042255 , -1.2610446 , -0.09593259, ...,  0.73638296,
          0.8037771 , -0.35535902],
        [ 0.4226743 ,  0.3449973 , -0.14128107, ...,  0.21562795,
         -0.47158208, -0.54015166]],

       [[ 0.07740402, -0.16268715,  0.08382545, ..., -0.05852576,
          0.5629257 ,  0.18334633],
        [ 0.44464454, -0.13380633,  0.10992998, ..., -0.49102828,
          0.95711637, -0.02002793],
        [ 1.4503806 , -0.0778251 ,  0.02747957, ..., -0.8862255 ,
         -0.12581679,  0.11644251],
        ...,
        [-0.2622453 , -0.25279975,  0.4833384 , ..., -