# Text Processing

## Capturing Text Data

### Plain Text

In [4]:
import os

# Read in a plain text file
with open(os.path.join("data","hieroglyph.txt"),"r") as f:
    text = f.read()
    print(text)

### Tabular Data

In [None]:
import pandas as pd

# Extract text column from a dataframe
df = pd.read_csv(os.path.join("data", "news.csv"))
df.head()[['publisher', 'title']]

# Convert text column to a lowercase
df["title"] = df["title"].str.lower()
df.head()[['publisher', 'title']]

### Online Resource

In [5]:
import requests
import json

# Fetch data from a REST API
r = requests.get("https://quotes.rest/qod.json")
res = r.json()
print(json.dumps(res,indent=4))

# Extract relevant object and field
q = res["contents"]["quotes"][0]
print(q["quote"], "\n--", q["author"])

### Cleaning

In [6]:
import requests

# Fetch a web page
r = requests.get("https://news.ycombinator.com")
print(r.text)

<html lang="en" op="news"><head><meta name="referrer" content="origin"><meta name="viewport" content="width=device-width, initial-scale=1.0"><link rel="stylesheet" type="text/css" href="news.css?wRBQvoWTalI3lUyNpVaE">
        <link rel="shortcut icon" href="favicon.ico">
          <link rel="alternate" type="application/rss+xml" title="RSS" href="rss">
        <title>Hacker News</title></head><body><center><table id="hnmain" border="0" cellpadding="0" cellspacing="0" width="85%" bgcolor="#f6f6ef">
        <tr><td bgcolor="#ff6600"><table border="0" cellpadding="0" cellspacing="0" width="100%" style="padding:2px"><tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img src="y18.gif" width="18" height="18" style="border:1px white solid;"></a></td>
                  <td style="line-height:12pt; height:10px;"><span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>
              <a href="newest">new</a> | <a href="front">past</a> | <a href=

In [7]:
import re

# Remove HTML tags using RegEx
pattern = re.compile(r'<.*?>')  # tags look like <...>
print(pattern.sub('',r.text))   # replace them with blank


        
          
        Hacker News
        
                  Hacker News
              new | past | comments | ask | show | jobs | submit            
                              login
                          
              

              
      1.      The phrase “welcome back” on a page causes Safari to autofill a password (github.com/livewire-ui)
        259 points by knorthfield 2 hours ago  | hide | 145&nbsp;comments              
      
                
      2.      Sandwell Bitcoin mine found stealing electricity (bbc.co.uk)
        52 points by frereubu 1 hour ago  | hide | 42&nbsp;comments              
      
                
      3.      QUIC is now RFC 9000 (fastly.com)
        411 points by blucell 11 hours ago  | hide | 118&nbsp;comments              
      
                
      4.      Boring Avatars – React library to generate custom avatars (boringavatars.com)
        201 points by arnklint 7 hours ago  | hide | 43&nbsp;comments              
      
    

In [11]:
from bs4 import BeautifulSoup

# Remove HTML tags using Beautiful Soup library
soup = BeautifulSoup(r.text, "html5lib")
print(soup.get_text())


        
          
        Hacker News
        
                  Hacker News
              new | past | comments | ask | show | jobs | submit            
                              login
                          
              

              
      1.      The phrase “welcome back” on a page causes Safari to autofill a password (github.com/livewire-ui)
        259 points by knorthfield 2 hours ago  | hide | 145 comments              
      
                
      2.      Sandwell Bitcoin mine found stealing electricity (bbc.co.uk)
        52 points by frereubu 1 hour ago  | hide | 42 comments              
      
                
      3.      QUIC is now RFC 9000 (fastly.com)
        411 points by blucell 11 hours ago  | hide | 118 comments              
      
                
      4.      Boring Avatars – React library to generate custom avatars (boringavatars.com)
        201 points by arnklint 7 hours ago  | hide | 43 comments              
      
                
      5

In [13]:
# Find all articles
summaries = soup.find_all("tr",class_="athing")
summaries[0]

<tr class="athing" id="27313284">
      <td align="right" class="title" valign="top"><span class="rank">1.</span></td>      <td class="votelinks" valign="top"><center><a href="vote?id=27313284&amp;how=up&amp;goto=news" id="up_27313284"><div class="votearrow" title="upvote"></div></a></center></td><td class="title"><a class="storylink" href="https://github.com/livewire-ui/spotlight/issues/25">The phrase “welcome back” on a page causes Safari to autofill a password</a><span class="sitebit comhead"> (<a href="from?site=github.com/livewire-ui"><span class="sitestr">github.com/livewire-ui</span></a>)</span></td></tr>

In [14]:
# Extract title
summaries[0].find("a", class_="storylink").get_text().strip()

'The phrase “welcome back” on a page causes Safari to autofill a password'

In [19]:
# Find all articles, extract titles

articles = []
# for i in range(len(summaries)):
#     articles.append(summaries[i].find("a", class_="storylink").get_text().strip())
summaries = soup.find_all("tr", class_="athing")
for summary in summaries:
    articles.append(summary.find("a", class_="storylink").get_text().strip())

In [20]:
articles

['The phrase “welcome back” on a page causes Safari to autofill a password',
 'Sandwell Bitcoin mine found stealing electricity',
 'QUIC is now RFC 9000',
 'Boring Avatars – React library to generate custom avatars',
 'Gnat 2021 GPL Community Edition Ada 202x compiler released',
 'The rise of crypto laundries: how criminals cash out of Bitcoin',
 'Cortex (YC W20) Is Hiring Founding Engineers',
 'Extracting Data from an Old iOS App Broken by iOS 14.5',
 'The Last Days of Tokyo’s Nakagin Capsule Tower',
 'Mars Helicopter Lands Safely After Serious In-Flight Anomaly',
 'Show HN: I Built Multiplayer Deathmatch Joust',
 'Call it a comeback: Turntable.fm raises $7.5M',
 'On the Road: The Woman and the Car (1909)',
 'Americans are on the move, but their stuff doesn’t always follow',
 'First Tasmanian Devils born in the wild of Australia mainland in 3k years',
 'ClickHouse: An open-source column-oriented database management system',
 'Oldest recovered TV images (2013)',
 '80M I/O Per Second wi

In [31]:
r = requests.get("https://scis.smu.edu.sg/master-it-business/analytics-track/curriculum")
print(r.text)

<!DOCTYPE html>
<!--[if lt IE 7]> <html class="ie6 ie" lang="en" dir="ltr"> <![endif]-->
<!--[if IE 7]>    <html class="ie7 ie" lang="en" dir="ltr"> <![endif]-->
<!--[if IE 8]>    <html class="ie8 ie" lang="en" dir="ltr"> <![endif]-->
<!--[if IE 9]>    <html class="ie9 ie" lang="en" dir="ltr"> <![endif]-->
<!--[if !IE]> --> <html lang="en" dir="ltr" ng-controller="SMUController"> <!-- <![endif]-->

<head>
<meta http-equiv="X-UA-Compatible" content="IE=Edge,chrome=1">
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="Generator" content="Drupal 7 (http://drupal.org)" />
<link rel="canonical" href="/master-it-business/analytics-track/curriculum" />
<link rel="shortlink" href="/node/30326" />
<link rel="shortcut icon" href="https://scis.smu.edu.sg/sites/all/themes/smu/images/smu_favicon.png" type="image/png" />
  <!-- Set the viewport width to device width for mobile -->
  <meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=

In [32]:
soup = BeautifulSoup(r.text, "html5lib")
print(soup.get_text())



  




  
  
   
    
    
  
    Analytics Track | School of Computing and Information Systems (SMU)  
  




  
















<!--//--><![CDATA[//><!--
try{Typekit.load();}catch(e){}
//--><!]]>











<!--//--><![CDATA[//><!--
jQuery.extend(Drupal.settings, {"basePath":"\/","pathPrefix":"","ajaxPageState":{"theme":"smu_pg","theme_token":"7UCl8c1CYvAWG2TZNt-fpoRLmsOXtZEWl4-m8TIHPfQ","js":{"sites\/all\/libraries\/respondjs\/respond.min.js":1,"sites\/all\/modules\/jquery_update\/replace\/jquery\/1.9\/jquery.min.js":1,"misc\/jquery-extend-3.4.0.js":1,"misc\/jquery-html-prefilter-3.5.0-backport.js":1,"misc\/jquery.once.js":1,"misc\/drupal.js":1,"sites\/all\/modules\/jquery_update\/replace\/ui\/ui\/minified\/jquery.ui.effect.min.js":1,"\/sites\/all\/libraries\/angular\/angular.js":1,"sites\/all\/modules\/back_to_top\/js\/back_to_top.js":1,"sites\/all\/libraries\/colorbox\/jquery.colorbox-min.js":1,"sites\/all\/modules\/colorbox\/js\/colorbox.js":1,"sites\/all\/modules\/colorbox\/st

In [51]:
course_names = soup.find_all("div",class_="curriculum-text")

In [53]:
course_names[3].get_text()

'Digital Transformation in Retail Banking Technology'

In [69]:
list_courses=[]
for course_name in course_names:
    list_courses.append(course_name.get_text().strip())

In [70]:
for index,name in enumerate(list_courses):
    print(f"Course {index+1}: {name}")

Course 1: Digital Banking & Trends
Course 2: Data Science in Financial Services*
Course 3: Corporate Banking & Blockchain*
Course 4: Digital Transformation in Retail Banking Technology
Course 5: Financial Markets Systems & Technology
Course 6: Digital Payments & Innovations
Course 7: Fintech Innovations & Startups*
Course 8: Quantum Computing in Financial Services*
Course 9: RiskTech & RegTech
Course 10: Data Management
Course 11: Data Analytics Lab
Course 12: Applied Statistical Analysis with R
Course 13: Python Programming & Data Analysis
Course 14: Customer Analytics & Applications* (SMU-X)
Course 15: Operations Analytics & Applications
Course 16: Big Data: Tools & Techniques
Course 17: Visual Analytics & Applications
Course 18: Text Analytics & Applications
Course 19: Social Analytics & Applications
Course 20: Process Analytics Using Simulation
Course 21: Applied Machine Learning*
Course 22: Data Science for Business*
Course 23: Introduction to Artificial Intelligence*
Course 24: A

### Normalization

In [84]:
# Sample text
text = "The first time you see The Second Renaissance it may look boring. Look at it at least twice and definitely watch part 2. It will change your view of the matrix. Are the human people the ones who started the war ? Is AI a bad thing ?"
print(text)

The first time you see The Second Renaissance it may look boring. Look at it at least twice and definitely watch part 2. It will change your view of the matrix. Are the human people the ones who started the war ? Is AI a bad thing ?


In [85]:
# Convert to lowercase
text = text.lower() 
print(text)

the first time you see the second renaissance it may look boring. look at it at least twice and definitely watch part 2. it will change your view of the matrix. are the human people the ones who started the war ? is ai a bad thing ?


### Punctuation Removal

In [86]:
import re

# Remove punctuation characters
text = re.sub(r"[^a-zA-Z0-9]"," ",text)
print(text)

the first time you see the second renaissance it may look boring  look at it at least twice and definitely watch part 2  it will change your view of the matrix  are the human people the ones who started the war   is ai a bad thing  


### Tokenization

In [89]:
# Split words into tokens(words)

words = text.split()
print(words)

['the', 'first', 'time', 'you', 'see', 'the', 'second', 'renaissance', 'it', 'may', 'look', 'boring', 'look', 'at', 'it', 'at', 'least', 'twice', 'and', 'definitely', 'watch', 'part', '2', 'it', 'will', 'change', 'your', 'view', 'of', 'the', 'matrix', 'are', 'the', 'human', 'people', 'the', 'ones', 'who', 'started', 'the', 'war', 'is', 'ai', 'a', 'bad', 'thing']


### NLTK: Natural Language ToolKit

In [90]:
import os
import nltk
nltk.data.path.append(os.path.join(os.getcwd(), "nltk_data"))

In [91]:
# Another sample text
text = "Dr. Smith graduated from the University of Washington. He later started an analytics firm called Lux, which catered to enterprise customers."
print(text)

Dr. Smith graduated from the University of Washington. He later started an analytics firm called Lux, which catered to enterprise customers.


In [93]:
from nltk.tokenize import word_tokenize

#split text into words using nltk
words = word_tokenize(text)
print(words)

['Dr.', 'Smith', 'graduated', 'from', 'the', 'University', 'of', 'Washington', '.', 'He', 'later', 'started', 'an', 'analytics', 'firm', 'called', 'Lux', ',', 'which', 'catered', 'to', 'enterprise', 'customers', '.']


In [94]:
words2 = text.split()
print(words2)

['Dr.', 'Smith', 'graduated', 'from', 'the', 'University', 'of', 'Washington.', 'He', 'later', 'started', 'an', 'analytics', 'firm', 'called', 'Lux,', 'which', 'catered', 'to', 'enterprise', 'customers.']


In [95]:
from nltk.tokenize import sent_tokenize

#split text into sentences using nltk
sentences = sent_tokenize(text)
print(sentences)

['Dr. Smith graduated from the University of Washington.', 'He later started an analytics firm called Lux, which catered to enterprise customers.']


In [96]:
#List stop words
from nltk.corpus import stopwords
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### Proper way of doing it 

In [99]:
# Reset text
text = "The first time you see The Second Renaissance it may look boring. Look at it at least twice and definitely watch part 2. It will change your view of the matrix. Are the human people the ones who started the war ? Is AI a bad thing ?"

# Normalize it
text = re.sub(r"[^a-zA-Z0-9]"," ", text.lower())

# Tokenize it
words = word_tokenize(text)
#words= text.split()

# Remove stopwords
words = [word for word in words if word not in stopwords.words("english")]
print(words)

['first', 'time', 'see', 'second', 'renaissance', 'may', 'look', 'boring', 'look', 'least', 'twice', 'definitely', 'watch', 'part', '2', 'change', 'view', 'matrix', 'human', 'people', 'ones', 'started', 'war', 'ai', 'bad', 'thing']


### Parts of words tagging

In [121]:
from nltk import pos_tag

#tag parts of speech (pos)

words = word_tokenize("I really want to drink coffee and have some bread in the morning")
pos_tag(words)

[('I', 'PRP'),
 ('really', 'RB'),
 ('want', 'VBP'),
 ('to', 'TO'),
 ('drink', 'VB'),
 ('coffee', 'NN'),
 ('and', 'CC'),
 ('have', 'VBP'),
 ('some', 'DT'),
 ('bread', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('morning', 'NN')]

### Sentence Parsing

In [122]:
import nltk
# Define a custom grammar

my_grammer = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
"""
)

parser = nltk.ChartParser(my_grammer)

In [123]:
# Parse a sentence
sentence = word_tokenize("I shot an elephant in my pajamas")
for tree in parser.parse(sentence):
    print(tree)

(S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
(S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))


In [126]:
# Visualize parse trees
for tree in parser.parse(sentence):
    tree.draw()

### Named entity recognition

#### You have first to tokenize and tag parts of speech

In [127]:
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize

# Recognize named entities in a tagged sentence
ne_chunk(pos_tag(word_tokenize("I really want to drink coffee and have some bread in the morning.")))

The Ghostscript executable isn't found.
See http://web.mit.edu/ghostscript/www/Install.htm
If you're using a Mac, you can try installing
https://docs.brew.sh/Installation then `brew install ghostscript`


LookupError: 

Tree('S', [('I', 'PRP'), ('really', 'RB'), ('want', 'VBP'), ('to', 'TO'), ('drink', 'VB'), ('coffee', 'NN'), ('and', 'CC'), ('have', 'VBP'), ('some', 'DT'), ('bread', 'NN'), ('in', 'IN'), ('the', 'DT'), ('morning', 'NN'), ('.', '.')])

### Stemming

### Reduce the word to its root form

In [128]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [130]:
# Reduce words to their stems
stemmed = [PorterStemmer().stem(w) for w in words]
print(stemmed)

['I', 'realli', 'want', 'to', 'drink', 'coffe', 'and', 'have', 'some', 'bread', 'in', 'the', 'morn']


### Lemmatization
### Resulting form is also a meaningful word

In [135]:
from nltk.stem.wordnet import WordNetLemmatizer

# Reduce words to their root form
lemmed = [WordNetLemmatizer().lemmatize(w) for w in words]
print(lemmed)

['I', 'really', 'want', 'to', 'drink', 'coffee', 'and', 'have', 'some', 'bread', 'in', 'the', 'morning']


In [136]:
# Lemmatize verbs by specifying pos
lemmed = [WordNetLemmatizer().lemmatize(w, pos='v') for w in lemmed]
print(lemmed)

['I', 'really', 'want', 'to', 'drink', 'coffee', 'and', 'have', 'some', 'bread', 'in', 'the', 'morning']
