#### Noise Removal

In [1]:
import re 

headline_one = '<h1>Nation\'s Top Pseudoscientists Harness High-Energy Quartz Crystal Capable Of Reversing Effects Of Being Gemini</h1>'

tweet = '@fat_meats, veggies are better than you think.'


headline_no_tag = re.sub(r'<.?h1>', '', headline_one)

tweet_no_at = re.sub(r'@', '', tweet)






try:
  print(headline_no_tag)
except:
  print('No variable called `headline_no_tag`')
try:
  print(tweet_no_at)
except:
  print('No variable called `tweet_no_at`')

Nation's Top Pseudoscientists Harness High-Energy Quartz Crystal Capable Of Reversing Effects Of Being Gemini
fat_meats, veggies are better than you think.


#### Tokenization

In [2]:
from nltk import word_tokenize, sent_tokenize

ecg_text = 'An electrocardiogram is used to record the electrical conduction through a person\'s heart. The readings can be used to diagnose cardiac arrhythmias.'


tokenized_by_word = word_tokenize(ecg_text)
tokenized_by_sentence = sent_tokenize(ecg_text)






try:
  print('Word Tokenization:')
  print(tokenized_by_word)
except:
  print('Expected a variable called `tokenized_by_word`')
try:
  print('Sentence Tokenization:')
  print(tokenized_by_sentence)
except:
  print('Expected a variable called `tokenized_by_sentence`')

Word Tokenization:
['An', 'electrocardiogram', 'is', 'used', 'to', 'record', 'the', 'electrical', 'conduction', 'through', 'a', 'person', "'s", 'heart', '.', 'The', 'readings', 'can', 'be', 'used', 'to', 'diagnose', 'cardiac', 'arrhythmias', '.']
Sentence Tokenization:
["An electrocardiogram is used to record the electrical conduction through a person's heart.", 'The readings can be used to diagnose cardiac arrhythmias.']


#### Normalization 
##### Upper or lower casing

In [3]:
brands = 'Salvation Army, YMCA, Boys & Girls Club of America'

brands_lower = brands.lower()

brands_upper = brands.upper()










try:
  print(f'Lowercased brands: {brands_lower}')
except:
  print('Expected a variable called `brands_lower`')
try:
  print(f'Uppercased brands: {brands_upper}')
except:
  print('Expected a variable called `brands_upper`')

Lowercased brands: salvation army, ymca, boys & girls club of america
Uppercased brands: SALVATION ARMY, YMCA, BOYS & GIRLS CLUB OF AMERICA


##### Stop words removal

In [5]:
from nltk.corpus import stopwords

survey_text = 'A YouGov study found that American\'s like Italian food more than any other country\'s cuisine.'

stop_words = set(stopwords.words('english'))

tokenized_survey = word_tokenize(survey_text)

text_no_stops = [word for word in tokenized_survey if word not in stop_words]
print(text_no_stops)

['A', 'YouGov', 'study', 'found', 'American', "'s", 'like', 'Italian', 'food', 'country', "'s", 'cuisine', '.']


##### Stemming 

In [6]:
from nltk.stem import PorterStemmer
populated_island = 'Java is an Indonesian island in the Pacific Ocean. It is the most populated island in the world, with over 140 million people.'

stemmer = PorterStemmer()
island_tokenized = word_tokenize(populated_island)

stemmed = [stemmer.stem(token) for token in island_tokenized]





try:
  print('A stemmer exists:')
  print(stemmer)
except:
  print('Expected a variable called `stemmer`')
try:
  print('Words Tokenized:')
  print(island_tokenized)
except:
  print('Expected a variable called `island_tokenized`')
try:
  print('Stemmed Words:')
  print(stemmed)
except:
  print('Expected a variable called `stemmed`')
  

A stemmer exists:
<PorterStemmer>
Words Tokenized:
['Java', 'is', 'an', 'Indonesian', 'island', 'in', 'the', 'Pacific', 'Ocean', '.', 'It', 'is', 'the', 'most', 'populated', 'island', 'in', 'the', 'world', ',', 'with', 'over', '140', 'million', 'people', '.']
Stemmed Words:
['java', 'is', 'an', 'indonesian', 'island', 'in', 'the', 'pacif', 'ocean', '.', 'It', 'is', 'the', 'most', 'popul', 'island', 'in', 'the', 'world', ',', 'with', 'over', '140', 'million', 'peopl', '.']


##### Lemmatization

In [7]:
from nltk.stem import WordNetLemmatizer
populated_island = 'Indonesia was founded in 1945. It contains the most populated island in the world, Java, with over 140 million people.'

lemmatizer = WordNetLemmatizer()
tokenized_string = word_tokenize(populated_island)

lemmatized_words = [lemmatizer.lemmatize(token) for token in tokenized_string]






try:
  print(f'A lemmatizer exists: {lemmatizer}')
except:
  print('Expected a variable called `lemmatizer`')
try:
  print(f'Words Tokenized: {tokenized_string}')
except:
  print('Expected a variable called `tokenized_string`')
try:
  print(f'Lemmatized Words: {lemmatized_words}')
except:
  print('Expected a variable called `lemmatized_words`')
  

A lemmatizer exists: <WordNetLemmatizer>
Words Tokenized: ['Indonesia', 'was', 'founded', 'in', '1945', '.', 'It', 'contains', 'the', 'most', 'populated', 'island', 'in', 'the', 'world', ',', 'Java', ',', 'with', 'over', '140', 'million', 'people', '.']
Lemmatized Words: ['Indonesia', 'wa', 'founded', 'in', '1945', '.', 'It', 'contains', 'the', 'most', 'populated', 'island', 'in', 'the', 'world', ',', 'Java', ',', 'with', 'over', '140', 'million', 'people', '.']


##### Part of Speech Tagging

In [8]:
import nltk
from nltk.corpus import wordnet
from collections import Counter

def get_part_of_speech(word):
  probable_part_of_speech = wordnet.synsets(word)
  
  pos_counts = Counter()

  pos_counts["n"] = len(  [ item for item in probable_part_of_speech if item.pos()=="n"]  )
  pos_counts["v"] = len(  [ item for item in probable_part_of_speech if item.pos()=="v"]  )
  pos_counts["a"] = len(  [ item for item in probable_part_of_speech if item.pos()=="a"]  )
  pos_counts["r"] = len(  [ item for item in probable_part_of_speech if item.pos()=="r"]  )
  
  most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
  return most_likely_part_of_speech

In [9]:
lemmatizer = WordNetLemmatizer()

populated_island = 'Indonesia was founded in 1945. It contains the most populated island in the world, Java, with over 140 million people.'

tokenized_string = word_tokenize(populated_island)
lemmatized_pos = [lemmatizer.lemmatize(token, get_part_of_speech(token)) for token in tokenized_string]


try:
  print(f'The lemmatized words are: {lemmatized_pos}')
except:
  print('Expected a variable called `lemmatized_pos`')

The lemmatized words are: ['Indonesia', 'be', 'found', 'in', '1945', '.', 'It', 'contain', 'the', 'most', 'populate', 'island', 'in', 'the', 'world', ',', 'Java', ',', 'with', 'over', '140', 'million', 'people', '.']


In [11]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from part_of_speech import get_part_of_speech
import re

lemmatizer = WordNetLemmatizer()

oprah_wiki = '<p>Working in local media, she was both the youngest news anchor and the first black female news anchor at Nashville\'s WLAC-TV. </p>'

oprah_edited_text = re.sub(r'</?p>', '', oprah_wiki)
text_no_periods = re.sub(r'\.', '', oprah_edited_text).lower()

tokenized_string = word_tokenize(text_no_periods)

lemmatized_pos = [lemmatizer.lemmatize(token, get_part_of_speech(token)) for token in tokenized_string]