PREPARING DATA

In [9]:
import pandas as pd
import numpy as np
import re
from gensim.utils import simple_preprocess
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
# Fetch the NLTK resources this notebook relies on (stop-word list, WordNet
# data for the lemmatizer, and the model used by nltk.pos_tag).
for _nltk_resource in ('stopwords', 'wordnet', 'omw-1.4',
                       'averaged_perceptron_tagger'):
    nltk.download(_nltk_resource)

# Single seed constant for the whole notebook.
SOME_FIXED_SEED = 42

# Seed NumPy's global random generator before any training/inference runs.
np.random.seed(SOME_FIXED_SEED)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Shared NLP helpers, created once and reused by the functions below.
lemmatizer = WordNetLemmatizer()
p_stemmer = PorterStemmer()

# Extra words filtered out in addition to the English stop words.
# Presumably these are frequent corpus words that carry little topical
# meaning (per the variable name) -- confirm with the author.
# NOTE(review): 'ha' and 'wa' look like what the lemmatizer leaves of
# "has" / "was" -- confirm.
topWordsLessMeaning = [
    'paper', 'research', 'study', 'ha', 'year',
    'group', 'example', 'wa', 'type', 'change', 'value',
    'work', 'source', 'resource', 'issue', 'show',
    'part', 'review', 'need', 'article', 'learning',
    'order', 'way', 'chapter', 'use', 'result', 'method',
    'approach', 'process', 'development', 'property',
    'model', 'parameter', 'simulation', 'science',
    'processing', 'project', 'technology', 'application',
    'analysis', 'problem',
]

# NLTK's English stop words followed by the extra words above.
stop_words = [*stopwords.words('english'), *topWordsLessMeaning]

def proccessing_base(s):
    """
    Normalize one raw document string into lower-case text without
    punctuation or digits.

    Steps, applied in this order:
      1. remove the literal marker made of a non-breaking space followed by
         'Read more' (matched case-sensitively, before lower-casing);
      2. lower-case the text;
      3. replace every parenthesised span '(...)' with '. ';
      4. remove the stray token ' ing ' (noise text);
      5. remove every character that is neither a word character nor
         whitespace (this covers periods, commas and all other punctuation);
      6. remove runs of digits;
      7. strip leading/trailing whitespace.

    Inner whitespace is not collapsed, so removed characters can leave
    several consecutive spaces behind.

    :param s: string to be processed
    :return: processed string
    """
    # Remove the special "Read more" marker.
    s = s.replace('\xa0Read more', '')
    # Lower case.
    s = s.lower()
    # Remove parenthesised spans (non-greedy, so each pair is handled alone).
    s = re.sub(r'\(.*?\)', '. ', s)
    # normalization 10: ' ing ', noise text
    s = re.sub(r' ing ', ' ', s)
    # Remove punctuation. Commas are removed here as well, so no separate
    # comma-removal step is needed afterwards.
    s = re.sub(r'[^\w\s]', '', s)
    # Remove numbers. The pattern is a raw string so that '\d' reaches the
    # regex engine instead of being an invalid string escape.
    s = re.sub(r'\d+', '', s)

    return s.strip()

def remove_word_nonenglish(text):
    """
    Trim whitespace-separated tokens that are not purely alphabetic.

    A token for which ``str.isalpha()`` is true is kept unchanged; any other
    token loses its last three characters (tokens of three characters or
    fewer become empty strings, which still take part in the join).

    NOTE(review): ``isalpha()`` is also true for non-English letters, and the
    reason for cutting exactly three characters is not visible here --
    confirm the intent with the author.

    :param text: string to be processed
    :return: the resulting tokens joined by single spaces
    """
    kept_tokens = []
    for token in text.split():
        if token.isalpha():
            kept_tokens.append(token)
        else:
            # Non-alphabetic token: keep all but its last three characters.
            kept_tokens.append(token[:-3])
    return ' '.join(kept_tokens)

def filter_noun(w_list):
    """
    Keep only the tokens that NLTK's POS tagger labels with a tag starting
    with 'NN'.

    :param w_list: whitespace-separated string of tokens (not a list)
    :return: the retained tokens joined by single spaces
    """
    tagged_tokens = nltk.pos_tag(w_list.split())
    noun_tokens = []
    for token, tag in tagged_tokens:
        if tag[:2] == 'NN':
            noun_tokens.append(token)
    return ' '.join(noun_tokens)

def stem(w_list):
    """
    Apply the module-level Porter stemmer to every token of a string.

    :param w_list: whitespace-separated string of tokens (not a list)
    :return: the stemmed tokens joined by single spaces
    """
    stemmed_tokens = []
    for token in w_list.split():
        stemmed_tokens.append(p_stemmer.stem(token))
    return ' '.join(stemmed_tokens)

def remove_stopwords(texts):
    """
    Tokenize each document and drop the tokens listed in ``stop_words``.

    Every document is converted with ``str()`` and tokenized by gensim's
    ``simple_preprocess`` before filtering.

    :param texts: iterable of documents (strings, or objects whose ``str()``
        is the text)
    :return: list with one list of retained tokens per input document
    """
    # Build the lookup set once: ``stop_words`` is a list, so testing every
    # token against it directly would rescan the whole list each time.
    blocked_words = set(stop_words)
    filtered_docs = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        filtered_docs.append([word for word in tokens if word not in blocked_words])
    return filtered_docs

def lemmatizing(text):
    """
    Lemmatize every whitespace-separated token with the module-level
    WordNet lemmatizer (called without a part-of-speech argument).

    :param text: string to be processed
    :return: the lemmatized tokens joined by single spaces
    """
    lemmas = []
    for token in text.split():
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)

def findTheMostWordApperance(Doc, n_words = 40):
    """
    Print the ``n_words`` most frequent tokens of a tokenized corpus.

    Tokens are ranked by total number of occurrences across all documents,
    highest first; tokens with equal counts keep the order in which they
    were first seen.

    :param Doc: iterable of documents, each an iterable of hashable tokens
    :param n_words: number of top tokens to print; values <= 0 print nothing
    :return: None -- one line per token (rank starting at 0, token, count)
        is written to stdout
    """
    # Term frequency over the whole corpus.
    dicTF = {}
    for d in Doc:
        for word in d:
            dicTF[word] = dicTF.get(word, 0) + 1

    # The sort is stable, so ties stay in first-seen order.
    ranked = sorted(dicTF.items(), key=lambda item: item[1], reverse=True)

    # Slice to exactly n_words rows (the old `count > n_words` check let one
    # extra row through); clamp at 0 so a negative value prints nothing.
    for count, (item, value) in enumerate(ranked[:max(n_words, 0)]):
        print('sst : ',count,' từ : ',item, "- số lượng : ", value)
def preprocessing(text):
    """
    Run the full cleaning chain over a collection of raw documents.

    Each document goes through ``proccessing_base`` (normalization),
    ``lemmatizing`` and ``filter_noun``, in that order; the cleaned strings
    are then tokenized and stop-word filtered by ``remove_stopwords``.

    :param text: iterable of raw document strings (a single string would be
        iterated character by character)
    :return: list with one list of tokens per input document
    """
    cleaned_docs = [
        filter_noun(lemmatizing(proccessing_base(raw_doc)))
        for raw_doc in text
    ]
    return remove_stopwords(cleaned_docs)
def removeTheMostWordLessMeaning(corpus):
    """
    Drop every token that appears in ``topWordsLessMeaning``.

    :param corpus: iterable of documents, each an iterable of tokens
    :return: new list with one filtered token list per document; the input
        is not modified
    """
    return [
        [token for token in doc if token not in topWordsLessMeaning]
        for doc in corpus
    ]

LOAD DATA

In [3]:
import pickle
# Load the pickled dictionary object; pipeline() calls its doc2bow() to turn
# token lists into bag-of-words vectors (presumably the gensim Dictionary
# saved at training time -- confirm).
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only open files that come from a trusted source.
with open('train_id2word4.pkl', 'rb') as f:
    train_id2word4 = pickle.load(f)

LOAD MODEL

In [4]:
## To load the model, create a folder named "models" and put every LDA-related file into it
import gensim
lda_train = gensim.models.ldamulticore.LdaMulticore.load('models/lda_train2.model')

In [39]:
def pipeline(docs,lda_train):
    """
    Return the topic label that best represents an author's documents.

    Every document is preprocessed, converted to a bag of words with the
    module-level ``train_id2word4`` dictionary and scored by ``lda_train``.
    Each document votes for its highest-probability topic and the topic with
    the most votes wins (ties go to the lowest topic id).

    :param docs: iterable of raw document strings (e.g. the abstracts of one
        author)
    :param lda_train: trained gensim LDA model
    :return: integer id of the most common dominant topic; 0 when no
        document produced a vote (e.g. ``docs`` is empty)
    """
    # Preprocessing.
    pre_docs = preprocessing(docs)

    # Bigrams.
    # NOTE(review): the phrase model is fitted on the documents being
    # predicted, with min_count=15, so small inputs are unlikely to yield
    # any bigram. Presumably the phrase model fitted at training time should
    # be reused here -- confirm.
    bigram = gensim.models.Phrases(pre_docs, min_count = 15)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    bigram_docs = [bigram_mod[review] for review in pre_docs]
    corpus = [train_id2word4.doc2bow(text) for text in bigram_docs]

    # Predict the dominant topic of each document.
    labels_topic = []
    for topics in lda_train[corpus]:
        # A model with per_word_topics=True yields a tuple whose first item
        # is the (topic_id, probability) list; otherwise the list itself is
        # yielded. Accept both shapes.
        doc_topics = topics[0] if isinstance(topics, tuple) else topics
        if not doc_topics:
            # No topic passed the model's probability threshold: no vote.
            continue
        # max() returns the first maximum, matching a stable descending sort.
        labels_topic.append(max(doc_topics, key=lambda x: x[1])[0])

    # Find the most common topic. The tally is sized from the model rather
    # than a fixed 9, so models with more topics do not overflow it.
    NumberOfTopic = [0] * lda_train.num_topics
    for i in labels_topic:
        NumberOfTopic[int(i)] += 1
    return NumberOfTopic.index(max(NumberOfTopic))

PREDICTION

In [42]:
### Load the abstracts of one author
# NOTE: the first abstract is one string literal continued with backslashes,
# so the leading spaces of the continuation lines are part of the text.
data = [
    "Keyword extraction is an indispensable step formany natural language processing and information retrievalapplications such as; \
    text summarization and search engineoptimization. Keywords hold the most important informationdescribing the content of a document. \
    With the increasing volumeand variety of unlabeled documents on the Internet, the need forautomatic keyword extraction methods increases.",
    " In this article we would like to present our experimental approach to automatic  keyphrases  extraction  based  on  statistical  methods"
]

### Predict the author's topic label
label = pipeline(data,lda_train)
print("Author belong to : ",label)

Tác giả thuộc nhãn :  3
