In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import json
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
from load import train,validate,test
from submit import utils
from features import extract

In [3]:
pubs_train = train.load_df_pubs()
truth_train = train.load_assignment()

In [4]:
names = list(pubs_train.keys())

In [5]:
name0 = names[1]
single_name_pubs = pubs_train[name0]

In [6]:
single_name_assignment =truth_train[name0] 

# Features

## Titles & Abstracts

In [7]:
abstracts = single_name_pubs.abstract.values.copy().astype(str)
abstracts

array(['This paper presents the principle, functions, features and implementation of a general graphical user interface management system based on OSF/Motif-JB-UIDS, which is a part of the integrated software engineering environment CASE (Computer Aided Software Engineering), named JB (Jade Bird). The visual and interactive UIMS can help the interface designer to generate user interface automatically and then refine it interactively. It adopts a new method of describing internal application interface based on Object-Oriented ideas to support the separation of user interface component from computational component. JB-UIDS has been implemented on SCO-ODT and has good portability and flexibility.',
       'A compact XeCl laser system made up of an oscillator and an amplifier was described. By applying a stimulated Brillouin scattering mirror (SBSM) to the amplifier, an output laser beam, whose optical and spectral characteristics are very close to those of the oscillator, has been obtaine

In [8]:
vectorizer_abstracts = TfidfVectorizer(strip_accents='unicode',decode_error='ignore',stop_words='english')

In [9]:
abstracts_mat = vectorizer_abstracts.fit_transform(abstracts)
abstracts_mat.shape

(2902, 20764)

In [10]:
vectorizer_titles = TfidfVectorizer(strip_accents='unicode',decode_error='ignore',stop_words='english')

In [11]:
titles = single_name_pubs.title.values.copy().astype(str)

In [12]:
titles_vec = vectorizer_titles.fit_transform(titles)
titles_vec.shape

(2902, 7586)

In [13]:
titles_vec

<2902x7586 sparse matrix of type '<class 'numpy.float64'>'
	with 27085 stored elements in Compressed Sparse Row format>

In [14]:
np.hstack([titles_vec,abstracts_mat])

array([<2902x7586 sparse matrix of type '<class 'numpy.float64'>'
	with 27085 stored elements in Compressed Sparse Row format>,
       <2902x20764 sparse matrix of type '<class 'numpy.float64'>'
	with 151124 stored elements in Compressed Sparse Row format>],
      dtype=object)

## co-Authors

In [15]:
authors_features = extract.get_authors_features(single_name_df=single_name_pubs)

## Keywords

In [16]:
keywords_features = extract.get_keywords_features(single_name_df=single_name_pubs)

## Concatenating features

In [34]:
features = sp.sparse.csr.csr_matrix(sp.concatenate((authors_features.toarray(),keywords_features.toarray()),axis=1))

In [35]:
features

<2902x15564 sparse matrix of type '<class 'numpy.float64'>'
	with 27599 stored elements in Compressed Sparse Row format>

# Model

In [37]:
from sklearn.cluster import DBSCAN

In [38]:
%%time
db = DBSCAN(eps=0.9,metric='cosine',min_samples=2).fit(features)

Wall time: 206 ms


In [39]:
np.unique(db.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
       33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
       50], dtype=int64)

In [40]:
(db.labels_==-1).sum()

229

In [41]:
single_name_sub = utils.single_name_cluster(cluster_labels=db.labels_,name=name0,pub_ids=single_name_pubs.id.values)

In [42]:
single_name_sub

{'min_chen': [['5bc6bfdd486cef66309bced6',
   '5bc6bfdd486cef66309bba61',
   '5bc6bfdd486cef66309aa3c0',
   '5bc6bfdd486cef66309b3910',
   '5bc6bfdd486cef66309e4677',
   '5bc6bfdd486cef66309aa8ef',
   '5bc6bfdd486cef66309c1836',
   '5bc6bfdd486cef66309d3308',
   '5bc6bfdd486cef66309bd863',
   '5bc6bfdd486cef66309ad105',
   '5bc6bfdd486cef66309c7870',
   '5bc6bfdd486cef66309df1dc',
   '5bc6bfdd486cef66309afafe',
   '5bc6bfdd486cef66309deea0',
   '5bc6bfdd486cef66309d446b',
   '5bc6bfdd486cef66309acadc',
   '5bc6bfdd486cef66309ea37e',
   '5bc6bfdd486cef66309b593a',
   '5bc6bfdd486cef66309b246c',
   '5bc6bfdd486cef66309d00bc',
   '5bc6bfdd486cef66309e2599',
   '5bc6bfdd486cef66309d9ee5',
   '5bc6bfdd486cef66309b86e7',
   '5bc6bfdd486cef66309eb824',
   '5bc6bfdd486cef66309d6929',
   '5bc6bfdd486cef66309b77c1',
   '5bc6bfdd486cef66309bd3d4',
   '5bc6bfdd486cef66309afa8a',
   '5bc6bfdd486cef66309c91be',
   '5bc6bfdd486cef66309e5701',
   '5bc6bfdd486cef66309bbe37',
   '5bc6bfdd486cef66309c48b

In [43]:
from evaluate import score

In [None]:
%%time
score.f_score_single_name(sub=single_name_sub,truth=truth_train,author=name0)

Truth Pairs built in 3.7428088188171387
Sub Pairs built in 5.127032279968262


In [None]:
test