<font color=#505050 size=3 face='arial black'>
<p><font color=#13577F size=5 face='arial black'> Text Analysis in Python </font></p>

<p>Python is a powerful programming language that's especially well suited to text analysis. In this workshop, we will cover some of the state-of-the-art Python packages for processing text.</p>

<p>Compared to the rest of Python, these packages are much less developed. They require a higher tolerance for non-intuitive interfaces and experimental, incomplete, or imperfect features. However, if you stick with it, you can do many very cool things.</p>

</font>

<font color=#505050 size=3 face='arial black'>
<p><font color=#13577F size=4 face='arial black'> Quick Discussion of Cleaning </font></p>
</font>

In [None]:
# Read the whole book into memory. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open.
with open('alice_in_wonderland.txt','r') as alice_file:
    alice = alice_file.read()

In [None]:
print(alice[:335])

In [None]:
raw_paragraph=alice[:335]
print(raw_paragraph)

In [None]:
paragraph = raw_paragraph.lower()
print(paragraph)

In [None]:
print(paragraph.split())

In [None]:
import string
print(string.punctuation)

In [None]:
punctuation_list = list(string.punctuation)+['’']
print(punctuation_list)

In [None]:
empty_list = ['']*len(punctuation_list)
print(empty_list)

In [None]:
zipped_list = list(zip(punctuation_list,empty_list))
print(zipped_list)

In [None]:
substitution_dictionary=dict(zipped_list)
print(substitution_dictionary)

In [None]:
translation=str.maketrans(dict(zipped_list))
print(translation)

In [None]:
paragraph

In [None]:
paragraph.translate(translation)

In [None]:
def simple_punctuation_cleaner(text):
    """Return `text` with every punctuation character removed.

    The characters stripped are those in ``string.punctuation`` plus the
    typographic apostrophe (’), i.e. the same punctuation list that the
    step-by-step cells build before this function is defined.
    """
    # The third argument of str.maketrans lists characters to delete, so no
    # explicit character -> '' mapping has to be assembled.
    translation = str.maketrans('', '', string.punctuation + '’')
    return text.translate(translation)

In [None]:
no_punctuation_text = simple_punctuation_cleaner(paragraph)
print(no_punctuation_text)

In [None]:
print(no_punctuation_text.split())

In [None]:
from nltk import word_tokenize, sent_tokenize

print(word_tokenize(paragraph))

In [None]:
# RegexpTokenizer has to be imported before it is used in this cell.
from nltk import RegexpTokenizer

# r'\w+' keeps runs of word characters, so punctuation is dropped while
# the text is being split into tokens.
tokenizer = RegexpTokenizer(r'\w+').tokenize
tokenized_text = tokenizer(paragraph)
print(tokenized_text)

In [None]:
from nltk import word_tokenize, sent_tokenize

print(sent_tokenize(paragraph))

In [None]:
def has_no_numbers(inputString):
    """Return True when no character of `inputString` is a digit."""
    for character in inputString:
        if character.isdigit():
            # One digit is enough to reject the string.
            return False
    return True

print( has_no_numbers('aa'))

In [None]:
list(filter(has_no_numbers, ['cat','k33p','dog','mouse','1221']))

In [None]:
def filter_numbers_from_word_list(word_list):
    """Return a list of the words in `word_list` that pass has_no_numbers."""
    return [word for word in word_list if has_no_numbers(word)]

filter_numbers_from_word_list(['cat','k33p','dog','mouse','1221'])

In [None]:
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
porter_stemmer.stem('president')

<font color=#505050 size=3 face='arial black'>
<p><font color=#13577F size=4 face='arial black'> Preparing Our Data </font></p>
</font>

In [None]:
import json
import pandas as pd
shakespeare = pd.read_csv('will_play_text.data',delimiter=';',index_col=0,names=['play','act','index','speaker','dialogue'])
shakespeare.head(6)

In [None]:
shakespeare = shakespeare[shakespeare['speaker'].notnull()]
shakespeare.head(6)

In [None]:
shakespeare = shakespeare[['speaker','dialogue']]
shakespeare.head(6)

In [None]:
speech = shakespeare.groupby('speaker').apply(lambda x: " ".join(x['dialogue']))
speech.head()

In [None]:
characters = pd.DataFrame()
characters['dialogue']=speech
characters

In [None]:
corpus = characters['dialogue'].tolist()
print(corpus[0:2])

In [None]:
from nltk.corpus import stopwords
stopword_list = list(stopwords.words('english'))

In [None]:
#solution 1
#from nltk.corpus import stopwords
#nltk.download('stopwords')
#stopword_list = list(stopwords.words('english'))

In [None]:
#solution 2
#import pickle
#stopword_list = pickle.load(open('stopwords.pickle','rb'))

<font color=#505050 size=3 face='arial black'>
<p><font color=#13577F size=4 face='arial black'> Cleaning Our Data </font></p>
</font>

In [None]:

def clean_and_tokenize(raw_text,custom_stopwords=None):
    """Turn a raw string into a list of cleaned, stemmed word tokens.

    Steps, in order: split on runs of word characters, lowercase, drop
    stopwords (the module-level `stopword_list` plus `custom_stopwords`),
    drop tokens containing digits, Porter-stem, and drop tokens of two
    characters or fewer.

    Parameters
    ----------
    raw_text : str
        Text to clean.
    custom_stopwords : iterable of str, optional
        Extra words to remove in addition to `stopword_list`.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    # Imported locally so the function works regardless of which earlier
    # cells have already been run.
    from nltk import RegexpTokenizer
    from nltk.stem.porter import PorterStemmer

    if custom_stopwords is None:
        custom_stopwords = []

    # Build the lookup set once, not once per token. Lowercasing both sides
    # makes the stopword match case-insensitive.
    stopword_set = {word.lower() for word in custom_stopwords}
    stopword_set.update(word.lower() for word in stopword_list)

    #tokenize
    words = RegexpTokenizer(r'\w+').tokenize(raw_text)

    # Lowercase BEFORE the stopword check; otherwise capitalised stopwords
    # such as "The" or "And" are not recognised and survive the filter.
    words = [word.lower() for word in words]

    #filter
    words = [word for word in words if word not in stopword_set]

    #filter digits
    words = filter_numbers_from_word_list(words)

    porter_stemmer = PorterStemmer()
    words = [porter_stemmer.stem(word) for word in words]

    # Very short tokens are dropped after stemming.
    words = [word for word in words if len(word) > 2]

    return words

In [None]:
cleaned_corpus = list(map(clean_and_tokenize, corpus))

In [None]:
cleaned_corpus[0]

In [None]:
# Install gensim into the interpreter that is running this notebook.
# pip has no supported in-process API (pip.main was removed in pip 10), so
# pip is run as a subprocess of the current interpreter instead.
import subprocess
import sys
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'gensim'])
import gensim

In [None]:
print(" ".join(cleaned_corpus[5]))

<font color=#505050 size=3 face='arial black'>
<p><font color=#13577F size=4 face='arial black'> Topic Analysis with Gensim</font></p>
</font>

In [None]:
from gensim import corpora
dictionary = corpora.Dictionary(cleaned_corpus)
vecs = [dictionary.doc2bow(document) for document in cleaned_corpus]

from gensim import models
tfidf = models.TfidfModel(vecs)
tfidf_vecs = tfidf[vecs]

In [None]:
from gensim.models.ldamodel import LdaModel
lda = LdaModel(corpus=tfidf_vecs,id2word=dictionary,num_topics=2,update_every=0,passes=1)

In [None]:
lda.show_topics()

<font color=#505050 size=3 face='arial black'>
<p><font color=#13577F size=4 face='arial black'> A Quick Tour of Advanced Text Manipulation </font></p>
</font>

<font color=#505050 size=3 face='arial black'>
<p><font color=#13577F size=3 face='arial black'> spellchecking </font></p>
</font>

In [None]:
# `import pip` is kept only so the name stays defined for cells that
# still reference it.
import pip
import subprocess
import sys
# pip has no supported in-process API (pip.main was removed in pip 10), so
# pip is run as a subprocess of the current interpreter instead.
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pyenchant'])

In [None]:
import enchant
eng = enchant.Dict("en_US")
eng.check("Apple")

<font color=#505050 size=3 face='arial black'>
<p><font color=#13577F size=3 face='arial black'> TextBlob </font></p>
</font>

In [None]:
#install TextBlob
# `import pip` is kept only so the name stays defined for cells that
# still reference it.
import pip
import subprocess
import sys
# pip has no supported in-process API (pip.main was removed in pip 10), so
# pip is run as a subprocess of the current interpreter instead.
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'TextBlob'])

In [None]:
from textblob import TextBlob

raw_paragraph = \
"I have a cat. I think she is okay. She thinks milk is awesome. Usually, the milk is cold. I hate cold milk."

blob = TextBlob(raw_paragraph)

for sentence in blob.sentences:
    print("original:",sentence)
    print("grammar:", sentence.tags)
    print("polarity:",sentence.sentiment.polarity)
    print("subjective:",sentence.sentiment.subjectivity)
    print(sentence.words)
    print()

In [None]:
from urllib.request import urlopen
# Fetch the NPR front page. The context manager closes the HTTP response
# once the body has been read instead of leaving the connection open.
with urlopen('http://www.npr.org/') as npr:
    npr_html = npr.read()
npr_html[0:500]

<font color=#505050 size=3 face='arial black'>
<p><font color=#13577F size=3 face='arial black'> webpages </font></p>
</font>

In [None]:
# pip has no supported in-process API (pip.main was removed in pip 10), so
# pip is run as a subprocess of the current interpreter instead.
import subprocess
import sys
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'bs4'])

In [None]:
#http://stackoverflow.com/questions/1936466/beautifulsoup-grab-visible-webpage-text

from bs4 import BeautifulSoup
from bs4.element import Comment
import re


def remove_(element):
    """Return True if the text node `element` should be kept as visible text.

    Text whose parent is <style>, <script>, <head>, <title>, or the document
    root is rejected, and so are HTML comments.
    """
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    # bs4 yields comments as Comment nodes whose string form does not
    # include the <!-- --> delimiters, so test the node type rather than
    # matching a regex against the text.
    if isinstance(element, Comment):
        return False
    return True


def clean_html(html):
    """Return the visible text of the document `html`, joined by spaces.

    Parameters
    ----------
    html : str or bytes
        Raw HTML markup to parse.
    """
    # Parse the argument that was passed in, not the global page.
    soup = BeautifulSoup(html, 'html.parser')
    texts = soup.findAll(text=True)
    # remove_ is the predicate defined above; it keeps only visible nodes.
    visible_texts = filter(remove_, texts)
    text = " ".join(visible_texts)
    return(text)

cleaned_html = clean_html(npr_html)
print(cleaned_html[:300])

<font color=#505050 size=3 face='arial black'>
<p><font color=#13577F size=3 face='arial black'> n-grams </font></p>
</font>

In [None]:
from nltk import bigrams, ngrams
tokens = word_tokenize('I had a cat.')
bgs = bigrams(tokens)
print(list(bgs))

In [None]:
list(ngrams(tokens,n=3))

In [None]:
# Earlier cells only import individual names from nltk, so the module name
# `nltk` itself is not defined; import FreqDist directly.
from nltk import FreqDist
FreqDist(tokens)

<font color=#505050 size=3 face='arial black'>
<p><font color=#13577F size=3 face='arial black'> grammar </font></p>
</font>

In [None]:
from nltk import pos_tag
from nltk import RegexpTokenizer

tokenize = RegexpTokenizer(r'\w+').tokenize

tokens = tokenize('I have a cat. His name was Bob. Bob had a ball.')
tagged_sentence = pos_tag(tokens)
tagged_sentence

In [None]:
result = list(zip(*tagged_sentence))
print(result)

In [None]:
_,grammar = zip(*tagged_sentence)

In [None]:
grammar

<font color=#505050 size=3 face='arial black'>
<p><font color=#13577F size=3 face='arial black'> inflect </font></p>
</font>

In [None]:
# `import pip` is kept only so the name stays defined for cells that
# still reference it.
import pip
import subprocess
import sys
# pip has no supported in-process API (pip.main was removed in pip 10), so
# pip is run as a subprocess of the current interpreter instead.
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'inflect'])

In [None]:
#https://pypi.python.org/pypi/inflect
import inflect
english = inflect.engine()
english.plural('wolf')

In [None]:
english.singular_noun('oxen')

In [None]:
english.number_to_words(1234)

In [None]:
english.ordinal(106)

In [None]:
english.join(["apple", "banana", "carrot"])