In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.utils import shuffle
from sklearn.model_selection import KFold

In [2]:
def preprocess_df(df_train):
    df_train = df_train[pd.notnull(df_train['document'])]
    df_train.columns = ['label', 'document']
    df_train['label_id'] = df_train['label'].factorize()[0]
    df_train.head()
    return df_train

def get_classifier(df_train, df_test):
    classifier = {}
    tfidf = TfidfVectorizer(sublinear_tf=False, min_df=0, max_df=0.8, norm='l2', encoding='utf-8', ngram_range=(1, 4),
                         analyzer='char')
    print('initiating data...')
    train_doc = [str(ll)[1:-1] for ll in [val for val in df_train.document]]
    X_train = tfidf.fit_transform(train_doc).toarray()
    y_train = df_train.label_id
    
    test_doc = [str(ll)[1:-1] for ll in [val for val in df_test.document]]
    X_test = tfidf.transform(test_doc).toarray()
    y_test = df_test.label_id
    
    print('training...')
    model = LinearSVC()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    classifier['features'] = X_train
    classifier['y_pred'] = y_pred
    classifier['y_true'] = y_test
    
    print('finished.')
    return classifier

In [3]:
shp_len = 10
shp_num = 100
shp_loc_ts = np.load('./data/money_laundrying_dataset/shp_{}_{}/shp_loc_ts.npy'.format(shp_len, shp_num),\
                    allow_pickle=True)[()]
shp_loc_ts.keys()

dict_keys(['shp_loc', 'shp_ts', 'shp_seq', 'seq_label'])

In [4]:
shp_loc = shp_loc_ts['shp_loc']
shp_ts = shp_loc_ts['shp_ts']
shp_seq = shp_loc_ts['shp_seq']
seq_label = shp_loc_ts['seq_label']
print(shp_loc.shape, shp_ts.shape, shp_seq.shape, seq_label.shape)

(1875, 100) (100, 10, 1) (1875, 100) (1875,)


In [5]:
X_train,X_test,y_train,y_test = train_test_split(shp_seq, seq_label, random_state=0)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1406, 100) (469, 100) (1406,) (469,)


In [6]:
#train set
ts_num = X_train.shape[0]
dataset = []
for i in range(ts_num):
    doc = [val for val in X_train[i,:] if val != -1]
    dataset.append((y_train[i], doc))
df_train = pd.DataFrame(data=dataset, columns=['label','document'])
df_train = preprocess_df(df_train)
#打乱seq中shapelet的顺序
# for ll in [val for val in df_train.document]:
#     random.shuffle(ll)
print(df_train.shape)

(1406, 3)


In [7]:
#test set
ts_num = X_test.shape[0]
dataset = []
for i in range(ts_num):
    doc = [val for val in X_test[i,:] if val != -1]
    dataset.append((y_test[i], doc))
df_test = pd.DataFrame(data=dataset, columns=['label','document'])
df_test = preprocess_df(df_test)
print(df_test.shape)

(469, 3)


In [8]:
df_train.to_csv('train.csv')
df_test.to_csv('test.csv')

In [9]:
classifier = get_classifier(df_train, df_test)

initiating data...
training...
finished.


In [10]:
print('acc:', accuracy_score(classifier['y_true'], classifier['y_pred']))
print('prec:', precision_score(classifier['y_true'], classifier['y_pred']))
print('f1:', f1_score(classifier['y_true'], classifier['y_pred']))

acc: 0.3304904051172708
prec: 0.33695652173913043
f1: 0.16489361702127656
