In [1]:
from nltk.corpus import brown
import nltk
import numpy as np
import pandas as pd

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("reuters")
nltk.download("punkt")
nltk.download('omw-1.4')
nltk.download('brown')

[nltk_data] Downloading package stopwords to /Users/tatsu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/tatsu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package reuters to /Users/tatsu/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /Users/tatsu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/tatsu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package brown to /Users/tatsu/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [2]:
import re

def cleaning_word(text):
    # 記号文字の消去
    pattern1 = r'\n'
    text = re.sub(pattern1, ' ', text)    
    pattern2 = r'<.*>|[|]'
    text = re.sub(pattern2, '', text)    
    pattern3 = r'\(|\)'
    text = re.sub(pattern3, '', text)
    pattern4 = r'|:|;|\+|-|\?|/|!|~|='
    text = re.sub(pattern4, '', text)
    pattern5 = r'`|\'|$|&|#'
    text = re.sub(pattern5, '', text)
    return text

a = "a\nb"
print(cleaning_word(a))


a b


In [3]:
from nltk.corpus import wordnet as wn #lemmatize関数のためのimport

def lemmatize_word(word):
    # make words lower  example: Python =>python
    word=word.lower()
    
    # lemmatize  example: cooked=>cook
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
      return lemma

In [4]:
def tokenize_text(text):
  text = re.sub('[.,]', '', text)
  return text.split()

In [5]:
en_stop = nltk.corpus.stopwords.words('english')
def remove_stopwords(word, stopwordset):
  if word in stopwordset:
    return None
  else:
    return word

In [6]:
def preprocessing_text(text):
  text = cleaning_word(text)
  tokens = tokenize_text(text)
  tokens = [lemmatize_word(word) for word in tokens]
  tokens = [remove_stopwords(word, en_stop) for word in tokens]
  tokens = [word for word in tokens if word is not None]
  return tokens


In [7]:
corpus_paper = pd.read_csv('paper_info.csv', header=None)
corpus_paper = corpus_paper.set_axis(['field', 'abst'], axis=1)
corpus_paper = corpus_paper.to_dict(orient='records')
for i, paper_dict in enumerate(corpus_paper):
    corpus_paper[i]["abst"] = preprocessing_text(paper_dict["abst"])

corpus_paper

[{'field': 'cs.AI',
  'abst': ['explainability',
   'key',
   'challenge',
   'major',
   'research',
   'theme',
   'ai',
   'research',
   'developing',
   'intelligent',
   'system',
   'capable',
   'working',
   'humans',
   'effectively',
   'obvious',
   'choice',
   'developing',
   'explainable',
   'intelligent',
   'system',
   'rely',
   'employ',
   'knowledge',
   'representation',
   'formalism',
   'inherently',
   'tailor',
   'towards',
   'express',
   'human',
   'knowledge',
   'eg',
   'interrogative',
   'agenda',
   'scope',
   'work',
   'focus',
   'formal',
   'concept',
   'analysis',
   'fca',
   'standard',
   'knowledge',
   'representation',
   'formalism',
   'express',
   'interrogative',
   'agenda',
   'particular',
   'categorize',
   'object',
   'wrt',
   'given',
   'set',
   'feature',
   'several',
   'fcabased',
   'algorithm',
   'already',
   'use',
   'standard',
   'machine',
   'learning',
   'task',
   'classification',
   'outlier',
   

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

doc_list = [dict["abst"] for dict in corpus_paper]
doc_list=[" ".join(doc) for doc in doc_list]
vectorizer = TfidfVectorizer(token_pattern=u'(?u)\\b\\w+\\b' )
tf_idf = vectorizer.fit_transform(doc_list)
tf_idf

<100x3076 sparse matrix of type '<class 'numpy.float64'>'
	with 7432 stored elements in Compressed Sparse Row format>

In [35]:
num_clusters = 3
km = KMeans(n_clusters=num_clusters, random_state = 0)
clusters = km.fit_predict(tf_idf)

In [36]:

n_category = 5
n_per_corpus = 20
cm = [[0]*num_clusters for _ in range(n_category)]
for i, cls in enumerate(clusters):
    cls = int(cls)
    line = int(i/n_per_corpus)
    cm[line][cls] +=1
print(cm)



[[0, 15, 5], [0, 18, 2], [20, 0, 0], [0, 1, 19], [2, 14, 4]]
