In [37]:
import word_wakati as wkt
import os

In [38]:
def neologd():
    """Locate the mecab-ipadic-neologd dictionary directory.

    Runs ``mecab-config --dicdir`` to find MeCab's dictionary root and
    checks whether a ``mecab-ipadic-neologd`` directory exists beneath it.

    Returns:
        The path of the NEologd dictionary directory as a string, or
        ``None`` when ``mecab-config`` cannot be run, fails, prints nothing,
        or the dictionary directory does not exist.
    """
    # Local import so this cell does not depend on the notebook's import cell.
    import subprocess

    try:
        # An argument list (no shell) avoids the unclosed os.popen pipe and
        # raises OSError if mecab-config is not installed.
        proc = subprocess.run(
            ["mecab-config", "--dicdir"],
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            universal_newlines=True,
        )
    except OSError:
        return None

    dicdir = proc.stdout.strip()
    # Without this guard an empty dicdir would turn the join below into the
    # relative path "mecab-ipadic-neologd", which could match an unrelated
    # folder in the current working directory.
    if proc.returncode != 0 or not dicdir:
        return None

    neodic = os.path.join(dicdir, "mecab-ipadic-neologd")
    if os.path.isdir(neodic):
        return neodic
    return None

# Display the NEologd dictionary directory found on this machine (None if absent).
neologd()

'/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd'

In [39]:
# Part-of-speech tags handed to both parsers: noun, verb, adjective.
pos = ['名詞', '動詞', '形容詞']

# One parser per backend, both configured with the same POS list.
mecab = wkt.create_parser(worker='mecab', parts_of_speech=pos)
janome = wkt.create_parser(worker='janome', parts_of_speech=pos)

text = '拡張現実感のための実物体の奥行きを考慮した陰面消去'
rs_j = wkt.word_seq(text, parser=janome)
rs_m = wkt.word_seq(text, parser=mecab)

# Each backend: a label line followed by its word sequence.
print('Janome', rs_j, sep='\n')
print('MeCab with ipadic-neologd', rs_m, sep='\n')

Janome
['拡張', '現実', '感', 'ため', '物体', '奥行き', '考慮', 'する', '陰', '面', '消去']
MeCab with ipadic-neologd
['拡張現実', '感', 'ため', '物体', '奥行き', '考慮', 'した陰', '面', '消去']


In [4]:
# Second sample title, run through the parsers created in the earlier cell.
text = '研究室配属における学生の研究室に対する理解を深める情報共有システム'
rs_j = wkt.word_seq(text, parser=janome)
rs_m = wkt.word_seq(text, parser=mecab)

# Each backend: a label line followed by its word sequence.
print('Janome', rs_j, sep='\n')
print('MeCab with ipadic-neologd', rs_m, sep='\n')

Janome
['研究', '室', '配属', '学生', '研究', '室', '理解', '深める', '情報', '共有', 'システム']
MeCab with ipadic-neologd
['研究室', '配属', '学生', '研究室', '理解', '深める', '情報共有', 'システム']


In [5]:
# Third sample title (mixes Japanese with the Latin-script word "Android").
text = '人感センサーを利用した乗降客数計測Androidアプリケーションの開発'
rs_j = wkt.word_seq(text, parser=janome)
rs_m = wkt.word_seq(text, parser=mecab)

# Each backend: a label line followed by its word sequence.
print('Janome', rs_j, sep='\n')
print('MeCab with ipadic-neologd', rs_m, sep='\n')

Janome
['人', '感', 'センサー', '利用', 'する', '乗降', '客数', '計測', 'Android', 'アプリケーション', '開発']
MeCab with ipadic-neologd
['人感センサー', '利用', 'する', '乗降客数', '計測', 'Android', 'アプリケーション', '開発']


In [6]:
# English-language support

# Stemming and Lemmatization of English Text in Python with NLTK
# cf. https://www.datacamp.com/tutorial/stemming-lemmatization-python

import nltk
from nltk.stem import PorterStemmer
# Fetch the Punkt data; word_tokenize (used in later cells) relies on the
# Punkt sentence tokenizer.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Porter stemmer instance reused by the stemming cells
ps = PorterStemmer()

# Inflected variants to run through the stemmer
example_words = ["program","programming","programer","programs","programmed"]

# Two left-aligned, 20-character-wide columns: the word and its stem
fmt_row = "{0:20}{1:20}".format
print(fmt_row("--Word--","--Stem--"))
for word in example_words:
    stem = ps.stem(word)
    print(fmt_row(word, stem))

--Word--            --Stem--            
program             program             
programming         program             
programer           program             
programs            program             
programmed          program             


In [8]:
import string
from nltk.tokenize import word_tokenize

In [9]:
example_sentence = "Python programmers often tend like programming in python because it's like english. We call people who program in python pythonistas."

# Drop every character listed in string.punctuation before tokenizing
punct_table = str.maketrans("", "", string.punctuation)
example_sentence_no_punct = example_sentence.translate(punct_table)

# Split the cleaned sentence into word tokens
word_tokens = word_tokenize(example_sentence_no_punct)

# Tabulate each token next to its Porter stem
fmt_row = "{0:20}{1:20}".format
print(fmt_row("--Word--","--Stem--"))
for word in word_tokens:
    stem = ps.stem(word)
    print(fmt_row(word, stem))

--Word--            --Stem--            
Python              python              
programmers         programm            
often               often               
tend                tend                
like                like                
programming         program             
in                  in                  
python              python              
because             becaus              
its                 it                  
like                like                
english             english             
We                  we                  
call                call                
people              peopl               
who                 who                 
program             program             
in                  in                  
python              python              
pythonistas         pythonista          


## Note
Some of the output stems are not English dictionary words (e.g., "becaus", "peopl", and "programm"). Another thing to notice is that context is not taken into consideration. For instance, "programmers" is a plural noun, yet it was cut down to "programm", while "programming" was reduced to "program", which can be a noun or a verb – in other words, the root words are ambiguous. This motivated context-sensitive lemmatizers to improve the performance on unseen and ambiguous words.

In [16]:
# WordNet lemmatizer

from nltk.stem import WordNetLemmatizer
# Download the WordNet data that the lemmatizer looks words up in, plus the
# omw-1.4 package (presumably required by this NLTK version's WordNet reader;
# TODO confirm).
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [20]:
# WordNet lemmatizer instance reused by the lemmatization cells
wnl = WordNetLemmatizer()

# Inflected variants to lemmatize
example_words = ["program","programming","programer","programs","programmed"]

# Tabulate each word next to its lemma, treating every word as a verb
fmt_row = "{0:20}{1:20}".format
print(fmt_row("--Word--","--Lemma--"))
for word in example_words:
    lemma = wnl.lemmatize(word, pos="v")
    print(fmt_row(word, lemma))

--Word--            --Lemma--           
program             program             
programming         program             
programer           programer           
programs            program             
programmed          program             


In [21]:
example_sentence = "Python programmers often tend like programming in python because it's like english. We call people who program in python pythonistas."

# Drop every character listed in string.punctuation, then tokenize
punct_table = str.maketrans("", "", string.punctuation)
example_sentence_no_punct = example_sentence.translate(punct_table)
word_tokens = word_tokenize(example_sentence_no_punct)

# Tabulate each token next to its lemma, treating every token as a verb
fmt_row = "{0:20}{1:20}".format
print(fmt_row("--Word--","--Lemma--"))
for word in word_tokens:
    lemma = wnl.lemmatize(word, pos="v")
    print(fmt_row(word, lemma))

--Word--            --Lemma--           
Python              Python              
programmers         programmers         
often               often               
tend                tend                
like                like                
programming         program             
in                  in                  
python              python              
because             because             
its                 its                 
like                like                
english             english             
We                  We                  
call                call                
people              people              
who                 who                 
program             program             
in                  in                  
python              python              
pythonistas         pythonistas         


In [14]:
text1 = "The two may also differ in that stemming most commonly collapses derivationally related words, whereas lemmatization commonly only collapses the different forms offical forms a lemma"
# Heuristic: text made up solely of ASCII characters is reported as English.
print('Text is in English' if text1.isascii() else 'Text is not in English')

Text is in English


In [33]:
text = "For instance, programmers is a plural noun but it was reduced down to program, which can be a noun or a verb? in other words, the root words are ambiguous. This motivated context-sensitive lemmatizers to improve the performance on unseen and ambiguous words."

# Drop every character listed in string.punctuation, then tokenize
punct_table = str.maketrans("", "", string.punctuation)
text_no_punct = text.translate(punct_table)
word_tokens = word_tokenize(text_no_punct)

fmt_row = "{0:20}{1:20}".format
print(fmt_row("--Word--","--Lemma--"))
for token in word_tokens:
    # Chain the lemmatizer: verb first, then noun, then satellite adjective,
    # feeding each result into the next pass.
    word = token
    for wn_tag in ("v", "n", "s"):
        word = wnl.lemmatize(word, pos=wn_tag)
    print(fmt_row(token, word))

--Word--            --Lemma--           
For                 For                 
instance            instance            
programmers         programmer          
is                  be                  
a                   a                   
plural              plural              
noun                noun                
but                 but                 
it                  it                  
was                 be                  
reduced             reduce              
down                down                
to                  to                  
program             program             
which               which               
can                 can                 
be                  be                  
a                   a                   
noun                noun                
or                  or                  
a                   a                   
verb                verb                
in                  in                  
other           

In [23]:
# Reference lookup: print the lemmatizer's docstring (lists the accepted `pos` values).
help(wnl.lemmatize)

Help on method lemmatize in module nltk.stem.wordnet:

lemmatize(word: str, pos: str = 'n') -> str method of nltk.stem.wordnet.WordNetLemmatizer instance
    Lemmatize `word` using WordNet's built-in morphy function.
    Returns the input word unchanged if it cannot be found in WordNet.
    
    :param word: The input word to lemmatize.
    :type word: str
    :param pos: The Part Of Speech tag. Valid options are `"n"` for nouns,
        `"v"` for verbs, `"a"` for adjectives, `"r"` for adverbs and `"s"`
        for satellite adjectives.
    :param pos: str
    :return: The lemma of `word`, for the given `pos`.



In [36]:
# Reference lookup: print word_tokenize's docstring (parameters and the tokenizers it uses).
help(word_tokenize)

Help on function word_tokenize in module nltk.tokenize:

word_tokenize(text, language='english', preserve_line=False)
    Return a tokenized copy of *text*,
    using NLTK's recommended word tokenizer
    (currently an improved :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer`
    for the specified language).
    
    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: A flag to decide whether to sentence tokenize the text or not.
    :type preserve_line: bool

