In [1]:
import collections

In [2]:
stop_words=['\n', 'or', 'are', 'they', 'i', 'some', 'by', '—', 
            'even', 'the', 'to', 'a', 'and', 'of', 'in', 'on', 'for', 
            'that', 'with', 'is', 'as', 'could', 'its', 'this', 'other',
            'an', 'have', 'more', 'at','don’t', 'can', 'only', 'most']

In [3]:
maxlen = 1000

In [4]:
word_freqs = collections.Counter()

In [13]:
word_freqs = collections.Counter()
with open('./NLP_data/news.txt','r+', encoding='UTF-8') as f:
    for line in f:
        # 轉小寫、分詞
        words = line.lower().split(' ')
        # 統計字詞出現次數
        if len(words) > maxlen:
            maxlen = len(words)
        for word in words:
            if not (word in stop_words):
                word_freqs[word] += 1
                
print(word_freqs.most_common(20))     

[('were', 2), ('not', 2), ('strong', 2), ('has', 2), ("yankees'", 1), ('reinforcements', 1), ('offseason', 1), ('past.', 1), ('did', 1), ('sign', 1), ('four', 1), ('major', 1), ('free-market', 1), ('shortstops', 1), ('when', 1), ('needed', 1), ('shortstops.', 1), ('instead,', 1), ('chose', 1), ('welcome', 1)]


In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np

In [16]:
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]

In [17]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
word = vectorizer.get_feature_names()
print ("Vocabulary：", word)

Vocabulary： ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [20]:
print ("BOW=\n", X.toarray())


BOW=
 [[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]


In [22]:
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
print('TF-IDF = \n', np.around(tfidf.toarray(), 4))

TF-IDF = 
 [[0.     0.4388 0.542  0.4388 0.     0.     0.3587 0.     0.4388]
 [0.     0.2723 0.     0.2723 0.     0.8532 0.2226 0.     0.2723]
 [0.5528 0.     0.     0.     0.5528 0.     0.2885 0.5528 0.    ]
 [0.     0.4388 0.542  0.4388 0.     0.     0.3587 0.     0.4388]]


In [28]:
from sklearn.metrics.pairwise import cosine_similarity
print (cosine_similarity(tfidf[-1], tfidf[:-1], dense_output=False))

  (0, 2)	0.1034849000930086
  (0, 1)	0.43830038447620107
  (0, 0)	1.0


In [25]:
tfidf[-1]

<1x9 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [29]:
import nltk

In [30]:
text="Today is a great day. It is even better than yesterday." + \
     " And yesterday was the best day ever."

In [32]:
nltk.sent_tokenize(text)


['Today is a great day.',
 'It is even better than yesterday.',
 'And yesterday was the best day ever.']

In [33]:
nltk.word_tokenize(text)


['Today',
 'is',
 'a',
 'great',
 'day',
 '.',
 'It',
 'is',
 'even',
 'better',
 'than',
 'yesterday',
 '.',
 'And',
 'yesterday',
 'was',
 'the',
 'best',
 'day',
 'ever',
 '.']

In [35]:
text = 'My system keeps crashing his crashed yesterday, ours crashes daily'
ps = nltk.porter.PorterStemmer()
' '.join([ps.stem(word) for word in text.split()])

'my system keep crash hi crash yesterday, our crash daili'

In [36]:
text = 'My system keeps crashing his crashed yesterday, ours crashes daily'
lem = nltk.WordNetLemmatizer()
' '.join([lem.lemmatize(word) for word in text.split()])

'My system keep crashing his crashed yesterday, ours crash daily'

In [34]:
import string
print('標點符號: ', string.punctuation)

標點符號:  !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [37]:
text="Today is a great day. It is even better than yesterday." + \
     " And yesterday was the best day ever."

stopword_list = set(nltk.corpus.stopwords.words('english') + list(string.punctuation))

In [48]:
def remove_stopwords(text, is_lower_case = False):
    if is_lower_case:
        text = text.lower()
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text, filtered_tokens

In [49]:
filtered_text, filtered_tokens = remove_stopwords(text) 
filtered_text

'Today great day It even better yesterday And yesterday best day ever'

In [66]:
with open('./NLP_data/news.txt', 'r+', encoding = 'UTF-8') as f:
    text = f.read()
filtered_text, filtered_tokens = remove_stopwords(text, True)

word_freqs = collections.Counter()
for word in filtered_tokens:
    word_freqs[word] += 1
print(word_freqs.most_common(20))

[('strong', 2), ('shortstops', 2), ('yankees', 1), ('reinforcements', 1), ('offseason', 1), ('past', 1), ('sign', 1), ('four', 1), ('major', 1), ('free-market', 1), ('needed', 1), ('instead', 1), ('chose', 1), ('welcome', 1), ('defense-oriented', 1), ('isiah', 1), ('kiner-falefa', 1), ('trades', 1), ('move', 1), ('aroused', 1)]


In [70]:
lem = nltk.WordNetLemmatizer()
def remove_stopwords_regex(text, is_lower_case = False):
    if is_lower_case:
        text = text.lower()
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    tokens = [lem.lemmatize(token.strip()) for token in tokens]
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text, filtered_tokens

In [71]:
filtered_text, filtered_tokens = remove_stopwords_regex(text, True) 
word_freqs = collections.Counter()
for word in filtered_tokens:
    word_freqs[word] += 1
print(word_freqs.most_common(20))     

[('strong', 2), ('shortstop', 2), ('ha', 2), ('yankee', 1), ('reinforcement', 1), ('offseason', 1), ('past', 1), ('sign', 1), ('four', 1), ('major', 1), ('free', 1), ('market', 1), ('needed', 1), ('instead', 1), ('chose', 1), ('welcome', 1), ('defense', 1), ('oriented', 1), ('isiah', 1), ('kiner', 1)]


In [72]:
lem.lemmatize('korean')


'korean'

In [75]:
synonyms = nltk.corpus.wordnet.synsets('love')
synonyms

[Synset('love.n.01'),
 Synset('love.n.02'),
 Synset('beloved.n.01'),
 Synset('love.n.04'),
 Synset('love.n.05'),
 Synset('sexual_love.n.02'),
 Synset('love.v.01'),
 Synset('love.v.02'),
 Synset('love.v.03'),
 Synset('sleep_together.v.01')]

In [78]:
synonyms[0].definition()

'a strong positive emotion of regard and affection'

In [80]:
synonyms[0].examples()

['his love for his work', 'children need a lot of love']

In [86]:
antonyms = []
for syn in nltk.corpus.wordnet.synsets('ugly'):
    for l in syn.lemmas():
        print(l)
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())
antonyms

Lemma('ugly.a.01.ugly')
Lemma('surly.s.01.surly')
Lemma('surly.s.01.ugly')
Lemma('despicable.s.01.despicable')
Lemma('despicable.s.01.ugly')
Lemma('despicable.s.01.vile')
Lemma('despicable.s.01.slimy')
Lemma('despicable.s.01.unworthy')
Lemma('despicable.s.01.worthless')
Lemma('despicable.s.01.wretched')
Lemma('atrocious.s.03.atrocious')
Lemma('atrocious.s.03.frightful')
Lemma('atrocious.s.03.horrifying')
Lemma('atrocious.s.03.horrible')
Lemma('atrocious.s.03.ugly')


['beautiful']

In [88]:
text='I am a human being, capable of doing terrible things'
sentences = nltk.sent_tokenize(text)
for sent in sentences:
    print(nltk.pos_tag(nltk.word_tokenize(sent)))

[('I', 'PRP'), ('am', 'VBP'), ('a', 'DT'), ('human', 'JJ'), ('being', 'VBG'), (',', ','), ('capable', 'JJ'), ('of', 'IN'), ('doing', 'VBG'), ('terrible', 'JJ'), ('things', 'NNS')]


In [89]:
sentences

['I am a human being, capable of doing terrible things']