## Lowercasing

In [5]:
# Fold every casing variant of the same word onto a single lowercase form.
texts = ["CANADA", "Canada", "canadA", "canada"]
lower_words = list(map(str.lower, texts))
lower_words

['canada', 'canada', 'canada', 'canada']

## Stemming

In [1]:
import nltk
import pandas as pd
from nltk.stem import PorterStemmer

# Single stemmer instance; later cells call porter_stemmer.stem as well.
porter_stemmer = PorterStemmer()

# Inflected forms of "trouble" to run through the Porter stemmer.
words = ["trouble", "troubling", "troubled", "troubles"]
stemmed_words = list(map(porter_stemmer.stem, words))

# Side-by-side table: each input word next to its stem.
stemdf = pd.DataFrame(
    {
        'original_word': words,
        'stemmed_word': stemmed_words,
    }
)
stemdf

Unnamed: 0,original_word,stemmed_word
0,trouble,troubl
1,troubling,troubl
2,troubled,troubl
3,troubles,troubl


In [2]:
# Variants of "connect" to run through the same stemmer.
words = ["connect", "connected", "connection", "connections", "connects"]
stemmed_words = list(map(porter_stemmer.stem, words))
print("stemmed_words=", stemmed_words)

# Side-by-side table: each input word next to its stem.
stemdf = pd.DataFrame(
    {
        'original_word': words,
        'stemmed_word': stemmed_words,
    }
)
stemdf

stemmed_words= ['connect', 'connect', 'connect', 'connect', 'connect']


Unnamed: 0,original_word,stemmed_word
0,connect,connect
1,connected,connect
2,connection,connect
3,connections,connect
4,connects,connect


In [90]:
# NOTE(review): "troublemsome" looks like a typo for "troublesome"; it is
# kept unchanged because the recorded output of this cell reflects it.
words = ["trouble", "troubled", "troubles", "troublemsome"]
stemmed_words = list(map(porter_stemmer.stem, words))
print("stemmed_words=", stemmed_words)

# Side-by-side table: each input word next to its stem.
stemdf = pd.DataFrame(
    {
        'original_word': words,
        'stemmed_word': stemmed_words,
    }
)
stemdf

stemmed_words= ['troubl', 'troubl', 'troubl', 'troublemsom']


Unnamed: 0,original_word,stemmed_word
0,trouble,troubl
1,troubled,troubl
2,troubles,troubl
3,troublemsome,troublemsom


## Lemmatization

In [34]:
from nltk.stem import WordNetLemmatizer

# Make sure the 'wordnet' NLTK package is present before lemmatizing.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

words = ["trouble", "troubling", "troubled", "troubles"]

# pos='v' makes the lemmatizer treat each word as a verb.
lemmatized_words = list(map(lambda w: lemmatizer.lemmatize(w, pos='v'), words))

# Build the table with the column order fixed explicitly.
lemmatizeddf = pd.DataFrame(
    {'original_word': words, 'lemmatized_word': lemmatized_words},
    columns=['original_word', 'lemmatized_word'],
)
lemmatizeddf

[nltk_data] Downloading package wordnet to /Users/kavgan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,original_word,lemmatized_word
0,trouble,trouble
1,troubling,trouble
2,troubled,trouble
3,troubles,trouble


In [33]:
# Singular and irregular plural of the same noun.
words = ["goose", "geese"]

# pos='n' makes the lemmatizer treat each word as a noun.
lemmatized_words = list(map(lambda w: lemmatizer.lemmatize(w, pos='n'), words))

# Build the table with the column order fixed explicitly.
lemmatizeddf = pd.DataFrame(
    {'original_word': words, 'lemmatized_word': lemmatized_words},
    columns=['original_word', 'lemmatized_word'],
)
lemmatizeddf

Unnamed: 0,original_word,lemmatized_word
0,goose,goose
1,geese,goose


## Stop Word Removal

In [48]:
# Small hand-written stop-word list for this demo.
# 'that' was previously misspelled as 'thah', so it could never match.
stopwords = ['this', 'that', 'and', 'a', 'we', 'it', 'to', 'is', 'of', 'up', 'need']
text = "this is a text full of content and we need to clean it up"

# Tokenise on single spaces.
words = text.split(" ")

# Keep only the tokens that are not stop words, preserving their order.
shortlisted_words = [w for w in words if w not in stopwords]

print("original sentence = ", text)
print("stop words removed sentences = ", ' '.join(shortlisted_words))

original sentence =  this is a text full of content and we need to clean it up
stop words removed sentences =  text full content clean


## Noise Removal


In [82]:
import nltk
import pandas as pd
import re
from nltk.stem import PorterStemmer

  
porter_stemmer = PorterStemmer()

# Stem the tokens exactly as given, with the surrounding punctuation,
# markup and digits still attached.
raw_words = ["..trouble..", "trouble<", "trouble!", "<a>trouble</a>", '1.trouble']
stemmed_words = list(map(porter_stemmer.stem, raw_words))

# Side-by-side table: each raw token next to its stem.
stemdf = pd.DataFrame(
    {
        'raw_word': raw_words,
        'stemmed_word': stemmed_words,
    }
)
stemdf

Unnamed: 0,raw_word,stemmed_word
0,..trouble..,..trouble..
1,trouble<,trouble<
2,trouble!,trouble!
3,<a>trouble</a>,<a>trouble</a>
4,1.trouble,1.troubl


In [86]:
def scrub_words(text):
    """Strip markup, punctuation and digits from a piece of text.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string with leading and trailing whitespace removed.
        Each removed non-word character or digit is replaced by a single
        space, so removals in the middle of the text leave spaces behind.
    """
    # Drop HTML-style tags; the non-greedy match stops at the first ">".
    text = re.sub(r"<.*?>", "", text)

    # Replace every non-word character (punctuation, symbols, whitespace)
    # and every digit with a space. For str patterns \w is Unicode-aware,
    # so non-ASCII letters such as "é" are kept, not removed.
    text = re.sub(r"\W|\d", " ", text)

    # Trim only the ends; interior spaces are left untouched.
    return text.strip()

# Scrub each raw token first, then stem the scrubbed result.
cleaned_words = list(map(scrub_words, raw_words))
cleaned_stemmed_words = list(map(porter_stemmer.stem, cleaned_words))

# Build the table with the column order fixed explicitly.
stemdf = pd.DataFrame(
    {
        'raw_word': raw_words,
        'cleaned_word': cleaned_words,
        'stemmed_word': cleaned_stemmed_words,
    },
    columns=['raw_word', 'cleaned_word', 'stemmed_word'],
)
stemdf

Unnamed: 0,raw_word,cleaned_word,stemmed_word
0,..trouble..,trouble,troubl
1,trouble<,trouble,troubl
2,trouble!,trouble,troubl
3,<a>trouble</a>,trouble,troubl
4,1.trouble,trouble,troubl
