In [None]:
!wget https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/preprocess.py
import preprocess
import re
import pandas as pd

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from wordcloud import WordCloud
import matplotlib.pyplot as plt
% matplotlib inline


def plotWordCloud(wordcloud, figsize=(15, 10)):
  """Show a word cloud image in a new matplotlib figure without axes.

  Args:
    wordcloud: the image-like object handed to ``plt.imshow``.
    figsize: (width, height) of the figure in inches; defaults to the
      15 x 10 size this notebook has always used.
  """
  plt.figure(figsize=figsize)
  # Bilinear interpolation smooths the rendered image when it is rescaled.
  plt.imshow(wordcloud, interpolation='bilinear')
  plt.axis("off")
  plt.show()

In [None]:
# Run the sample sentence through preprocess._removeSymbols and print the result.
text = "How are you, Tom? How many works do you have to work?"
symbols_removed = preprocess._removeSymbols(text)
print(symbols_removed)

In [None]:
# Lower-case the sample sentence and print it.
text = "How are you, Tom? How many works do you have to work?"
lowered = text.lower()
print(lowered)

In [None]:
# Run the sample sentence through preprocess._stem and print the result.
text = "How are you, Tom? How many works do you have to work?"
stemmed = preprocess._stem(text)
print(stemmed)

In [None]:
# Stemming by PorterStemmer
# https://www.nltk.org/_modules/nltk/stem/porter.html

s = PorterStemmer()

# Print the stem of every sample word, one per line, grouped by word family.
for word in (
    'Having', 'Have', 'Had',
    'Fishing', 'Fish', 'Fisher', 'Fishes', 'Fished',
    'am', 'is', 'was',
):
    print(s.stem(word))

In [None]:
# Lemmatization by WordNet
# Lemmatization is the process of converting a word to its base form.
# Lemmatization considers the context and converts the word to its meaningful base form,
# whereas stemming just removes the last few characters.
# Sometimes the same word can have multiple different lemmas.
# Based on the context (by POS tag), extract the appropriate lemma.

# WordNetLemmatizer needs the WordNet corpus on disk. Request it explicitly so
# the cell also runs on a fresh kernel; the download is skipped when the corpus
# is already up to date.
nltk.download('wordnet', quiet=True)

s = WordNetLemmatizer()

# (word, part-of-speech) pairs: 'v' = verb, 'n' = noun.
for word, pos in (
    ('having', 'v'), ('have', 'v'), ('had', 'v'),
    ('fishing', 'v'), ('fish', 'v'), ('fisher', 'n'), ('fishes', 'v'), ('fished', 'v'),
    ('am', 'v'), ('is', 'v'), ('was', 'v'),
):
    print(s.lemmatize(word, pos=pos))

In [None]:
# Run the sample sentence through preprocess._stop and print the result.
text = "How are you, Tom? How many works do you have to work?"
stop_filtered = preprocess._stop(text)
print(stop_filtered)

In [None]:
# Display the stop_words attribute of the preprocess module.
# NOTE(review): presumably the word list consulted by preprocess._stop — confirm in preprocess.py.
preprocess.stop_words

In [None]:
# Run the sample sentence through preprocess.process (the same function that is
# later applied to every tweet) and print the result.
text = "How are you, Tom? How many works do you have to work?"
processed = preprocess.process(text)
print(processed)

In [None]:
# Segment Chinese sentence into words
import jieba

# jieba.cut yields the segmented words; join them with "/" to make the
# boundaries visible.
# NOTE(review): this sentence looks chosen because its word boundaries are
# ambiguous — confirm the intent.
seg_list = jieba.cut("兒子生性病母倍感安慰")
segmented = "/".join(seg_list)
print(segmented)

In [None]:
# Segment a longer Traditional Chinese sentence and show the words separated by "/".
seg_list = jieba.cut("白石角新發展區位於沙田與大埔之間，2012年起發展成住宅區，多個樓盤如天賦海灣、逸瓏灣、海日灣及朗濤等相繼落成及入伙")
segmented = "/".join(seg_list)
print(segmented)

In [None]:
# Segment a Simplified Chinese sentence and show the words separated by "/".
seg_list = jieba.cut("雷军称不送充电器创意是他首创的，不是抄苹果")
segmented = "/".join(seg_list)
print(segmented)

In [None]:
# Install the cantonese
!pip install pycantonese

In [None]:
import pycantonese as pc

# Segment the same sentence that was given to jieba earlier, this time with
# pycantonese, so the two results can be compared.
pc.segment("兒子生性病母倍感安慰")

In [None]:
# Combine English words to phrases
from gensim.models.phrases import Phrases, Phraser

# Tiny corpus in which a few word sequences recur across sentences.
documents = [
    "the cheif executive officer of new york was there", 
    "machine learning can be useful sometimes",
    "new york cheif executive officer was present",
    "machine learning is good"
]

# Tokenize on single spaces: one list of words per document.
sentence_stream = []
for sentence in documents:
    sentence_stream.append(sentence.split(" "))

In [None]:
# Fit a Phrases model on the tokenized documents and wrap it in a Phraser.
bigram_model = Phrases(sentence_stream, min_count=1, threshold=2)
bigram = Phraser(bigram_model)

# Print each document after the bigram model has been applied to it.
for tokens in bigram[sentence_stream]:
    print(tokens)

In [None]:
# Fit a second Phrases model on the bigram-transformed documents so that an
# already-joined pair can be combined with a neighbouring word.
trigram_model = Phrases(bigram[sentence_stream], min_count=1, threshold=2)
trigram = Phraser(trigram_model)

# Print each document after both models have been applied in sequence.
for tokens in trigram[bigram[sentence_stream]]:
    print(tokens)

In [None]:
# Generate word cloud from Apple Tweets
# Load the tweet spreadsheet from its URL into a DataFrame and display it.
url = 'https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/data/appleTweets.xlsx'
appleTweets = pd.read_excel(io=url)
appleTweets

In [None]:
# Concatenate every tweet into one space-separated string for the word cloud.
# dropna()/astype(str) guard against blank or purely numeric spreadsheet cells,
# which str.join would otherwise reject with a TypeError.
text = " ".join(appleTweets['Tweet content'].dropna().astype(str))
# Preview only the first 5000 characters.
text[:5000]

In [None]:
# Word cloud
MAX_WORDS = 50
WIDTH, HEIGHT = 800, 600
BG_COLOR = "black" # white

# Build the cloud from the raw (uncleaned) tweet text, then draw it.
# NOTE(review): stopwords='' looks intended to switch off stop-word filtering
# for this first cloud — confirm against the WordCloud documentation.
rawCloud = WordCloud(
    max_words=MAX_WORDS,
    width=WIDTH,
    height=HEIGHT,
    stopwords='',
    background_color=BG_COLOR,
).generate(text)
plotWordCloud(rawCloud)

In [None]:
# Apply preprocess.process to every tweet and keep the result in a 'cleaned' column.
cleaned_tweets = appleTweets['Tweet content'].apply(preprocess.process)
appleTweets['cleaned'] = cleaned_tweets

# Show the original and cleaned text side by side.
appleTweets.loc[:, ['Tweet content', 'cleaned']]

In [None]:
# Concatenate the cleaned tweets into one space-separated string.
cleaned_text = " ".join(appleTweets['cleaned'])
# Preview only the first 5000 characters.
cleaned_text[:5000]

In [None]:
# application level stop words
# NOTE(review): 'ha' is presumably what preprocessing leaves of "has" — confirm.
STOP_WORDS = ['apple', 'aapl', 'http', 'co', 'inc', 'read', 'ha']

# Same cloud settings as before, now on the cleaned text and with the
# application-level stop words excluded.
cleanedCloud = WordCloud(
    max_words=MAX_WORDS,
    width=WIDTH,
    height=HEIGHT,
    stopwords=STOP_WORDS,
    background_color=BG_COLOR,
).generate(cleaned_text)
plotWordCloud(cleanedCloud)

In [None]:
# Save the word cloud to a file
# The relative path means the image lands in the notebook's current working directory.
cleanedCloud.to_file("wordcloud.png")

In [None]:
# Tutorial on WordCloud usage:
# https://www.datacamp.com/community/tutorials/wordcloud-python
# The leading "?" is IPython help syntax (not plain Python): it shows the
# documentation for WordCloud.
?WordCloud

In [None]:
# Escape the unsupported characters and save to a file.
# String cells have non-ASCII characters, control characters and backslashes
# replaced by backslash escape sequences; non-string cells are left untouched.
def _escapeUnsupported(value):
  """Return str values unicode-escaped; pass any other value through unchanged."""
  if isinstance(value, str):
    return value.encode('unicode_escape').decode('utf-8')
  return value

# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# fall back to applymap on older pandas versions.
_elementwise = 'map' if hasattr(pd.DataFrame, 'map') else 'applymap'

# Store the result under a new name instead of overwriting appleTweets:
# escaping is not idempotent (backslashes are doubled on every pass), so
# re-running this cell would otherwise alter the frame a bit more each time.
appleTweetsEscaped = getattr(appleTweets, _elementwise)(_escapeUnsupported)
appleTweetsEscaped.to_excel('appleTweetsCleaned.xlsx')