<img src="images/prepro.jpg" width="800">

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Text-gathering" data-toc-modified-id="Text-gathering-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Text gathering</a></span><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Web-Scrapping" data-toc-modified-id="Web-Scrapping-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Web Scrapping</a></span></li><li><span><a href="#Wikipedia-library" data-toc-modified-id="Wikipedia-library-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Wikipedia library</a></span></li><li><span><a href="#Data-visualization" data-toc-modified-id="Data-visualization-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Data visualization</a></span></li></ul></li><li><span><a href="#Text-preprocessing" data-toc-modified-id="Text-preprocessing-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Text preprocessing</a></span><ul class="toc-item"><li><span><a href="#Normalizing-the-data" data-toc-modified-id="Normalizing-the-data-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Normalizing the data</a></span><ul class="toc-item"><li><span><a href="#Lowercase" data-toc-modified-id="Lowercase-2.1.1"><span class="toc-item-num">2.1.1&nbsp;&nbsp;</span>Lowercase</a></span></li><li><span><a href="#Removing-selected-characters" data-toc-modified-id="Removing-selected-characters-2.1.2"><span class="toc-item-num">2.1.2&nbsp;&nbsp;</span>Removing selected characters</a></span></li></ul></li><li><span><a href="#Tokenizing" data-toc-modified-id="Tokenizing-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Tokenizing</a></span></li><li><span><a href="#Remove-Stop-Words" data-toc-modified-id="Remove-Stop-Words-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Remove Stop Words</a></span></li><li><span><a href="#Lemmatizing" data-toc-modified-id="Lemmatizing-2.4"><span 
class="toc-item-num">2.4&nbsp;&nbsp;</span>Lemmatizing</a></span></li><li><span><a href="#Stemming" data-toc-modified-id="Stemming-2.5"><span class="toc-item-num">2.5&nbsp;&nbsp;</span>Stemming</a></span></li><li><span><a href="#All-together" data-toc-modified-id="All-together-2.6"><span class="toc-item-num">2.6&nbsp;&nbsp;</span>All together</a></span></li><li><span><a href="#Analyze" data-toc-modified-id="Analyze-2.7"><span class="toc-item-num">2.7&nbsp;&nbsp;</span>Analyze</a></span></li></ul></li><li><span><a href="#Information-Extraction" data-toc-modified-id="Information-Extraction-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Information Extraction</a></span><ul class="toc-item"><li><span><a href="#Part-of-Speech-tagging-(POS)" data-toc-modified-id="Part-of-Speech-tagging-(POS)-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Part of Speech tagging (POS)</a></span></li><li><span><a href="#Chunking" data-toc-modified-id="Chunking-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Chunking</a></span></li><li><span><a href="#Name-Entity-Recognition" data-toc-modified-id="Name-Entity-Recognition-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Name Entity Recognition</a></span></li><li><span><a href="#Using-spacy-for-different-tasks" data-toc-modified-id="Using-spacy-for-different-tasks-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Using spacy for different tasks</a></span></li></ul></li><li><span><a href="#Text-Representation" data-toc-modified-id="Text-Representation-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Text Representation</a></span><ul class="toc-item"><li><span><a href="#Count-vectorizer" data-toc-modified-id="Count-vectorizer-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Count vectorizer</a></span></li></ul></li></ul></div>

# Text gathering

## Imports

In [2]:
import pandas as pd
pd.set_option('max_colwidth',500)

%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [16, 6]

# web scrapping
from bs4 import BeautifulSoup
import requests

# Terminal / Anaconda Prompt: pip install wikipedia
import wikipedia

# WordClouds
# Terminal / Anaconda Prompt: pip install wordcloud
from wordcloud import WordCloud
from nltk.corpus import stopwords


## Web Scrapping
Download the text content of a web page (the helper below targets Wikipedia articles).

In [3]:
# URLs that have been downloaded so far, in download order
names = []

# Scrapes article text from en.wikipedia.org
def url_to_transcript(url):
    '''Download a Wikipedia article and return its paragraph text.

    The page at `url` is fetched and the text of every <p> element found
    inside the element with class "mw-parser-output" is joined with single
    spaces.

    Side effects: prints a confirmation line and appends `url` to the
    module-level `names` list.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page has no "mw-parser-output" element.
    '''
    # A timeout keeps the cell from hanging forever on a stalled connection
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")
    content = soup.find(class_="mw-parser-output")
    if content is None:
        raise ValueError('No "mw-parser-output" element found at %s' % url)

    text = ' '.join(p.text for p in content.find_all('p'))
    print(url, 'article successfully transcripted')
    names.append(url)
    return text

In [4]:
# Wikipedia article URLs to download (one entry per article)
urls = ['https://en.wikipedia.org/wiki/Chess', 
        'https://en.wikipedia.org/wiki/Go_(game)']

In [5]:
# Fetch every article in `urls`, keeping the texts in the same order
transcripts = list(map(url_to_transcript, urls))

https://en.wikipedia.org/wiki/Chess article successfully transcripted
https://en.wikipedia.org/wiki/Go_(game) article successfully transcripted


In [6]:
# One row per downloaded article, labelled by its URL.
# `transcripts` was built from `urls` in order, so the two always line up.
# A flat list is passed as the index: wrapping it in another list would
# create a one-level MultiIndex instead of a plain Index.
df = pd.DataFrame(transcripts, index=urls, columns=['transcript'])
df

Unnamed: 0,transcript
https://en.wikipedia.org/wiki/Chess,"\n \n Chess is a two-player strategy board game played on a checkered board with 64 squares arranged in an 8×8 grid.[1] The game is played by millions of people worldwide. Chess is believed to be derived from the Indian game chaturanga sometime before the 7th century. Chaturanga is also the likely ancestor of the Eastern strategy games xiangqi (Chinese chess), janggi (Korean chess), and shogi (Japanese chess). Chess reached Europe by the 9th century, due to the Umayyad conquest of Hispania. ..."
https://en.wikipedia.org/wiki/Go_(game),"Go is an abstract strategy board game for two players, in which the aim is to surround more territory than the opponent. The game was invented in China more than 2,500 years ago and is believed to be the oldest board game continuously played to the present day.[1][2] \nA 2016 survey by the International Go Federation's 75 member nations found that there are over 46 million people worldwide who know how to play Go and over 20 million current players, the majority of whom live in East Asia.[3]..."


## Wikipedia library
A dedicated Python package for downloading Wikipedia content.

In [7]:
# Take the top search hit for each query as the article title
chessWikiName, goWikiName = [
    wikipedia.search(query)[0] for query in ('Chess', 'go game')
]

In [8]:
# Show the article title that the 'go game' search resolved to
goWikiName

'Go (game)'

In [9]:
# The titles come straight from wikipedia.search(), so they are already exact.
# auto_suggest=False stops the library from replacing them with a "suggested"
# title, which can silently load a different article.
chess_page = wikipedia.page(chessWikiName, auto_suggest=False)
content_chess = chess_page.content

go_page = wikipedia.page(goWikiName, auto_suggest=False)
content_go = go_page.content

In [10]:
# One row per article title, with a single 'transcript' column holding the text
transcript_by_title = pd.Series(
    [content_chess, content_go],
    index=[chessWikiName, goWikiName],
    name='transcript',
)
df = transcript_by_title.to_frame()
df.T

Unnamed: 0,Chess,Go (game)
transcript,"Chess is a two-player strategy board game played on a checkered board with 64 squares arranged in an 8×8 grid. The game is played by millions of people worldwide. Chess is believed to be derived from the Indian game chaturanga sometime before the 7th century. Chaturanga is also the likely ancestor of the Eastern strategy games xiangqi (Chinese chess), janggi (Korean chess), and shogi (Japanese chess). Chess reached Europe by the 9th century, due to the Umayyad conquest of Hispania. The piece...","Go is an abstract strategy board game for two players, in which the aim is to surround more territory than the opponent. The game was invented in China more than 2,500 years ago and is believed to be the oldest board game continuously played to the present day. \nA 2016 survey by the International Go Federation's 75 member nations found that there are over 46 million people worldwide who know how to play Go and over 20 million current players, the majority of whom live in East Asia.The playi..."


## Data visualization
Visualizing the data is the first step in analyzing the text you'll be using!

In [11]:
# Let's make some word clouds!

# Load the English stop words so that they are left out of the word cloud
stop_words = set(stopwords.words('english'))

wordcloud_options = dict(
    stopwords=stop_words,
    background_color="white",
    colormap="Dark2",
    max_font_size=150,
    random_state=42,
)
wc = WordCloud(**wordcloud_options)


In [None]:
# Panel titles, in the same order as the rows of `df`
articles = ['Chess', 'Go']

# Draw one word cloud per article, side by side
n_panels = len(df.index)
for index, article in enumerate(df.index):
    # generate() rebuilds the cloud in place from this article's text
    wc.generate(df.loc[article, 'transcript'])

    plt.subplot(1, n_panels, index + 1)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(articles[index])

plt.show()

# Text preprocessing

In [13]:
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer

# Download all the packages
#nltk.download()

In [14]:
# Install spaCy first if needed:
#!pip install spacy
import spacy
# Uncomment and run the next line once to download the en_core_web_sm model:
#!python -m spacy download en_core_web_sm
spacy_nlp = spacy.load('en_core_web_sm')

In [15]:
# Sample sentence (the opening of the Chess article) used to demonstrate each preprocessing step
text = "\n\nChess is a two-player strategy board game played on a checkered board with 64 squares arranged in an 8×8 grid.[1]"

## Normalizing the data


### Lowercase

In [16]:
# Lowercase everything so that 'Chess' and 'chess' count as the same word
text = text.lower()
text

'\n\nchess is a two-player strategy board game played on a checkered board with 64 squares arranged in an 8×8 grid.[1]'

### Removing selected characters
Usually done with regular expressions (you can try patterns out at https://regexr.com/).

In [17]:
print("Punctuation to erase:", re.escape(string.punctuation))

# Remove punctuation: each punctuation character becomes a space, so a
# hyphenated word such as "two-player" is split rather than glued together
text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
print("\nText without punctuation:\n\t",text)

# Remove numbers. The pattern is a raw string so that \d reaches the regex
# engine untouched, and it uses + so that it only matches actual digits
# (\d* also matches the empty string at every position).
text = re.sub(r'\d+', '', text)
# Alternative: use r'\w*\d\w*' to drop whole words that contain a digit
print("\nText without numbers:\n\t",text)

# Remove returns. Deleting them is fine for this single sentence; on
# multi-line text replace them with a space so adjacent words are not merged.
text = re.sub(r'\n', '', text)
print("\nText without returns:\n\t",text)

Punctuation to erase: !"\#\$%\&'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}\~

Text without punctuation:
	 

chess is a two player strategy board game played on a checkered board with 64 squares arranged in an 8×8 grid  1 

Text without numbers:
	 

chess is a two player strategy board game played on a checkered board with  squares arranged in an × grid   

Text without returns:
	 chess is a two player strategy board game played on a checkered board with  squares arranged in an × grid   


## Tokenizing

In [18]:
# Split the cleaned sentence into a list of word tokens
tokenized_text = word_tokenize(text)
print(tokenized_text)

['chess', 'is', 'a', 'two', 'player', 'strategy', 'board', 'game', 'played', 'on', 'a', 'checkered', 'board', 'with', 'squares', 'arranged', 'in', 'an', '×', 'grid']


## Remove Stop Words

In [19]:
# NLTK's English stop-word list, stored as a set for fast membership tests
stop_words = set(stopwords.words('english')) 
print(stop_words)

{'me', 'at', "hasn't", 'wouldn', "that'll", "needn't", 'when', 'myself', 'theirs', 'we', 'hadn', 'by', 'he', 'are', 'had', 'shan', 'were', 'below', 'now', "isn't", "shouldn't", 'which', 'during', 'does', 'no', 'hers', 'ain', "you're", "weren't", 'so', 'do', 'his', 'where', 't', 'our', 'mightn', 'of', 'its', "don't", 'i', 're', 'didn', 'weren', "doesn't", 'to', 'if', 'each', 'other', 'such', 'o', 'again', 'too', 'in', "wasn't", 'aren', "it's", 'these', 'did', 'm', 'wasn', 'don', 'between', 'until', 'but', 'this', 'only', 'own', 've', 'those', 'before', 'needn', "shan't", 'was', 'about', 'she', 'am', 'some', 'because', 'just', 'couldn', 'how', 'herself', 'been', 'mustn', 'won', 'why', 'is', 'itself', 'both', 'my', 'here', 'few', 'after', 'any', 'as', "mightn't", 'yourselves', 'above', 'more', 'can', "you've", "should've", "wouldn't", 'further', 'ours', 'll', 'their', 'while', 'who', 'd', 'shouldn', 'doesn', 'it', 'have', 'or', 'off', 'what', 'isn', 'haven', 'out', 'on', 'will', "couldn't

In [20]:
# Keep only the tokens that are not in the stop-word list
stopped_text = [token for token in tokenized_text if token not in stop_words]

print(stopped_text)

# Tokens that were dropped
set(tokenized_text) - set(stopped_text)

['chess', 'two', 'player', 'strategy', 'board', 'game', 'played', 'checkered', 'board', 'squares', 'arranged', '×', 'grid']


{'a', 'an', 'in', 'is', 'on', 'with'}

## Lemmatizing

In [21]:
# Re-join the remaining tokens and let spaCy parse them, then read each token's lemma
spacy_text = spacy_nlp(' '.join(stopped_text))
[token.lemma_ for token in spacy_text]

['chess',
 'two',
 'player',
 'strategy',
 'board',
 'game',
 'play',
 'checker',
 'board',
 'square',
 'arrange',
 '×',
 'grid']

In [22]:
# WordNet lemmatizer, reached through its documented location in nltk.stem
lemma = nltk.stem.WordNetLemmatizer()
# pos='v' asks WordNet to treat every token as a verb (e.g. 'played' -> 'play')
lemmatized_text = [lemma.lemmatize(w, pos='v') for w in stopped_text]
lemmatized_text

['chess',
 'two',
 'player',
 'strategy',
 'board',
 'game',
 'play',
 'checker',
 'board',
 'square',
 'arrange',
 '×',
 'grid']

## Stemming

In [23]:
# English Snowball stemmer, applied to every lemma in turn
snow = nltk.stem.SnowballStemmer('english')

snow_text = list(map(snow.stem, lemmatized_text))
snow_text

['chess',
 'two',
 'player',
 'strategi',
 'board',
 'game',
 'play',
 'checker',
 'board',
 'squar',
 'arrang',
 '×',
 'grid']

## All together

In [24]:
# Apply a first round of text cleaning techniques
snow = nltk.stem.SnowballStemmer('english')
lemma = nltk.stem.WordNetLemmatizer()

def clean_text(text):
    '''Normalize a raw text and return it as a space-separated string of stems.

    Steps, in order:
      1. lowercase the text
      2. replace every punctuation character with a space
      3. remove numbers and words containing numbers
      4. replace newlines with a space
      5. tokenize and drop the tokens found in the module-level `stop_words`
      6. lemmatize each remaining token as a verb (WordNet)
      7. stem each lemma (Snowball, English)

    Relies on the module-level `stop_words`, `lemma` and `snow` objects.
    '''
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # Raw strings so the regex escapes are not read as string escapes
    text = re.sub(r'\w*\d\w*', '', text)
    # A space (not '') so words separated only by a newline are not merged
    text = re.sub(r'\n', ' ', text)

    word_tokens = word_tokenize(text)
    kept_tokens = [w for w in word_tokens if w not in stop_words]
    lemmas = [lemma.lemmatize(w, 'v') for w in kept_tokens]
    stems = [snow.stem(w) for w in lemmas]

    return ' '.join(stems)

In [25]:
# Run the whole pipeline on the sample sentence
cleaned_text = clean_text(text)
cleaned_text

'chess two player strategi board game play checker board squar arrang × grid'

In [26]:
# Clean every article; the result keeps the same index and the 'transcript' column name
data_clean = df.transcript.apply(clean_text).to_frame()

In [27]:
# Raw article text, for comparison with the cleaned version
df.T

Unnamed: 0,Chess,Go (game)
transcript,"Chess is a two-player strategy board game played on a checkered board with 64 squares arranged in an 8×8 grid. The game is played by millions of people worldwide. Chess is believed to be derived from the Indian game chaturanga sometime before the 7th century. Chaturanga is also the likely ancestor of the Eastern strategy games xiangqi (Chinese chess), janggi (Korean chess), and shogi (Japanese chess). Chess reached Europe by the 9th century, due to the Umayyad conquest of Hispania. The piece...","Go is an abstract strategy board game for two players, in which the aim is to surround more territory than the opponent. The game was invented in China more than 2,500 years ago and is believed to be the oldest board game continuously played to the present day. \nA 2016 survey by the International Go Federation's 75 member nations found that there are over 46 million people worldwide who know how to play Go and over 20 million current players, the majority of whom live in East Asia.The playi..."


In [28]:
# The same articles after clean_text
data_clean.T

Unnamed: 0,Chess,Go (game)
transcript,chess two player strategi board game play checker board squar arrang × grid game play million peopl worldwid chess believ deriv indian game chaturanga sometim centuri chaturanga also like ancestor eastern strategi game xiangqi chines chess janggi korean chess shogi japanes chess chess reach europ centuri due umayyad conquest hispania piec assum current power spain late centuri modern rule standard centuri play involv hide inform player begin piec one king one queen two rook two knight two bi...,go abstract strategi board game two player aim surround territori oppon game invent china year ago believ oldest board game continu play present day survey intern go feder member nation find million peopl worldwid know play go million current player major live east asia play piec call stone one player use white stone black player take turn place stone vacant intersect point board place board stone may move stone remov board captur captur happen stone group stone surround oppos stone orthogon...


## Analyze

In [29]:
# Word cloud generator for the cleaned texts
wc = WordCloud(
    stopwords=stop_words,
    background_color="white",
    colormap="Dark2",
    max_font_size=150,
    random_state=42,
)

In [None]:
# Panel titles, in the same order as the rows of `data_clean`
articles = ['Chess', 'Go']

# Draw one word cloud per cleaned article, side by side
n_panels = len(data_clean.index)
for index, article in enumerate(data_clean.index):
    # generate() rebuilds the cloud in place from this article's cleaned text
    wc.generate(data_clean.loc[article, 'transcript'])

    plt.subplot(1, n_panels, index + 1)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(articles[index])

plt.show()

# Information Extraction

## Part of Speech tagging (POS) 

In [31]:
# Example text

# Two candidate sentences: the second assignment overwrites the first, so only
# 'we saw the yellow dog.' is analysed below. Remove or reorder a line to try the other one.
example = 'today is a sunny day for mr james and mrs ana.'
example = 'we saw the yellow dog.'

# Tokenize
example_token = word_tokenize(example)

In [32]:
# Tag each token with its part of speech; the result is a list of (token, tag) pairs
nltk.pos_tag(example_token)

[('we', 'PRP'),
 ('saw', 'VBD'),
 ('the', 'DT'),
 ('yellow', 'JJ'),
 ('dog', 'NN'),
 ('.', '.')]

## Chunking

In [33]:
# Chunk grammar for nltk.RegexpParser: each rule names a chunk type (NP, PP, ...)
# and gives a pattern over POS tags describing the token sequences to group.
grammar = r"""
  NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
  PP: {<IN><NP>}               # Chunk prepositions followed by NP
  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
  CLAUSE: {<NP><VP>}           # Chunk NP, VP
  """

# POS-tag the example tokens, then parse the tagged sentence with the grammar
tag = nltk.pos_tag(example_token)
cp = nltk.RegexpParser(grammar)
result = cp.parse(tag)
print(result)
# NOTE(review): draw() appears to open the tree in a separate GUI window rather
# than inline — confirm it works in your environment (e.g. not on a headless server)
result.draw()

(S we/PRP saw/VBD (NP the/DT yellow/JJ dog/NN) ./.)


## Name Entity Recognition

In [34]:
# Run NLTK's named-entity chunker on the POS-tagged example sentence
ne_tree = nltk.ne_chunk(tag)
print(ne_tree)

(S we/PRP saw/VBD the/DT yellow/JJ dog/NN ./.)


## Using spacy for different tasks
* **Text:** The original word text.
* **Lemma:** The base form of the word.
* **POS:** The simple part-of-speech tag.
* **Tag:** The detailed part-of-speech tag.
* **Dep:** Syntactic dependency, i.e. the relation between tokens.
* **Shape:** The word shape – capitalization, punctuation, digits.
* **is alpha:** Does the token consist only of alphabetic characters?
* **is stop:** Is the token part of a stop list, i.e. the most common words of the language?

In [35]:
# Alternative sample sentence to try instead of `text`:
#   "Apple is looking at buying U.K. startup for $1 billion"
doc = spacy_nlp(text)

token_columns = ['token.text', 'token.lemma_', 'token.pos_', 'token.tag_', 'token.dep_',
                 'token.shape_', 'token.is_alpha', 'token.is_stop']

# Collect one row per token and build the frame in a single call; growing a
# DataFrame row by row with .loc copies the data on every insertion.
token_rows = [
    [token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
     token.shape_, token.is_alpha, token.is_stop]
    for token in doc
]
spacydf = pd.DataFrame(token_rows, columns=token_columns)

spacydf

Unnamed: 0,token.text,token.lemma_,token.pos_,token.tag_,token.dep_,token.shape_,token.is_alpha,token.is_stop
0,chess,chess,NOUN,NN,nsubj,xxxx,True,False
1,is,be,AUX,VBZ,ROOT,xx,True,True
2,a,a,DET,DT,det,x,True,True
3,two,two,NUM,CD,nummod,xxx,True,True
4,player,player,NOUN,NN,compound,xxxx,True,False
5,strategy,strategy,NOUN,NN,compound,xxxx,True,False
6,board,board,NOUN,NN,compound,xxxx,True,False
7,game,game,NOUN,NN,attr,xxxx,True,False
8,played,play,VERB,VBD,acl,xxxx,True,False
9,on,on,ADP,IN,prep,xx,True,True


In [36]:
# One row per named entity found in `doc`: its text, character span and label.
# The rows are collected first so that the frame is created in a single call.
entity_rows = [
    [ent.text, ent.start_char, ent.end_char, ent.label_]
    for ent in doc.ents
]
spacynerdf = pd.DataFrame(
    entity_rows,
    columns=['ent.text', 'ent.start_char', 'ent.end_char', 'ent.label_'],
)
spacynerdf

Unnamed: 0,ent.text,ent.start_char,ent.end_char,ent.label_
0,two,11,14,CARDINAL


In [39]:
from spacy import displacy

# Render the dependency parse of the example sentence inline
dep_options = {'distance': 120}
example_doc = spacy_nlp(str(example))
displacy.render(example_doc, style='dep', jupyter=True, options=dep_options)

In [40]:

# Highlight the named entities in the first 500 characters of the second article.
# .iloc selects by position; plain [1] on a string-labelled Series is a
# deprecated positional fallback in recent pandas versions.
second_article_excerpt = df.transcript.iloc[1][:500]
displacy.render(spacy_nlp(second_article_excerpt), jupyter=True, style='ent')


# Text Representation

In [41]:
from sklearn.feature_extraction.text import CountVectorizer


In [42]:
# Tiny toy corpus: three short documents used to illustrate the count vectorizer
training_data = ['this is good', 'this is bad', 'this is awesome']

## Count vectorizer
It converts a collection of text documents to a matrix of token counts.

In [43]:
# Cap the vocabulary at 1000 features
vectorizer = CountVectorizer(max_features=1000)
# Learn the vocabulary from the toy corpus
vectorizer.fit(training_data)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=1000, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [44]:
# The vocabulary learned by the count vectorizer.
# vocabulary_ maps each term to its column index in the count matrix, so
# ordering the terms by that index yields the column names. This avoids
# get_feature_names(), which newer scikit-learn releases no longer provide.
feature_names = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
print(feature_names)

['awesome', 'bad', 'good', 'is', 'this']


In [45]:
# Turn the corpus into a sparse document-term count matrix (one row per document)
X = vectorizer.transform(training_data)
# Reuse X rather than transforming the corpus a second time
print(X.toarray())

[[0 0 1 1 1]
 [0 1 0 1 1]
 [1 0 0 1 1]]


In [46]:
# Column names: the vocabulary terms ordered by their column index.
# Derived from vocabulary_ because get_feature_names() is no longer
# available in newer scikit-learn releases.
feature_names = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Document-term matrix: one row per document, one column per term
data_cv = pd.DataFrame(X.toarray(), columns=feature_names, index=training_data)
tdm = data_cv
tdm.head()

Unnamed: 0,awesome,bad,good,is,this
this is good,0,0,1,1,1
this is bad,0,1,0,1,1
this is awesome,1,0,0,1,1
