In [1]:
import re
import string
import distance
import numpy as np
import pandas as pd
import sklearn.cluster
from gensim.utils import deaccent
from nltk.corpus import stopwords
from sklearn.cluster import MeanShift
from sklearn.feature_extraction.text import TfidfVectorizer

AUTHOR = 'AUTOR'
AUTHOR_ID = 'authid'
PUBLICATION_ID = 'pubid'
PUBLICATION = 'TITULO'

raw_data = pd.read_excel('Kennis.xlsx')

data = pd.DataFrame(columns=['authid', 'pubid', *raw_data.columns]).append(raw_data, sort=False)

authid = 0

for a in raw_data[AUTHOR].unique():
    data.loc[raw_data[AUTHOR] == a, [AUTHOR_ID]] = authid
    authid = authid + 1

def preprocessing(line):
    line = str(line).lower()
    line = re.sub(r"[{}]".format(string.punctuation), '', line)
    return deaccent(line)

stp_words = [preprocessing(line) for line in [*stopwords.words('portuguese'), *stopwords.words('english')]] 

publication_titles = raw_data[PUBLICATION]

pubs = np.asarray([preprocessing(p) for p in publication_titles])

tfidf_vectorizer = TfidfVectorizer(
    preprocessor=preprocessing,
    ngram_range=(1, 3),
    dtype=np.float32,
    stop_words=stp_words
)

tfidf = tfidf_vectorizer.fit_transform(pubs)

In [2]:
tfidf

<16793x163694 sparse matrix of type '<class 'numpy.float32'>'
	with 342081 stored elements in Compressed Sparse Row format>

In [3]:
tfidf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [3]:
ms = MeanShift(bandwidth=0.3, cluster_all=False, n_jobs=2).fit(tfidf.toarray())

print(len(np.unique(ms.labels_)), 'uniqs')

data[PUBLICATION_ID] = ms.labels_

PicklingError: Could not pickle the task to send it to the workers.

In [None]:
data.to_csv('kennis_ids_03_ngram-03', index=False)

data.sort_values(by=PUBLICATION_ID)

In [None]:
data[data['pubid'] > -1].sort_values(by='pubid').head(200)

In [None]:
print(len(np.unique(ms.labels_)), 'uniqs')
