# NLP 101

In [2]:
!pip install textblob nltk seaborn

In [3]:
# import packages
import os
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.corpus import cmudict
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
import nltk
import re
from textblob import TextBlob
import string
import seaborn as sns
import matplotlib.pyplot as plt
# Download every NLTK resource this notebook uses.
# 'stopwords' and 'cmudict' back stopwords.words('english') and cmudict.dict() further down.
# NOTE(review): recent NLTK releases appear to look up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead of the older names -- confirm against the
# installed version; requesting a name the index does not know only prints a message.
for resource in ('punkt', 'punkt_tab',
                 'wordnet',
                 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng',
                 'vader_lexicon',
                 'stopwords', 'cmudict'):
    nltk.download(resource)

ModuleNotFoundError: No module named 'textblob'

In [None]:
# check our working directory
os.getcwd()

In [None]:
#set our working directory to where our files are located
##os.chdir('C:\\PATH\\PATH\\PATH\\')

## Yelp Reviews

In [None]:
%ls

In [None]:
# create dataframe from Yelp reviews file
df = pd.read_csv('yelp.csv')

In [None]:
# inspect the file
print(df.describe())
print(df.columns)

In [None]:
# select column with the review text from dataframe and turn into a list
corpora = df['text'].tolist()

In [None]:
# view the first review
print(...)

In [None]:
# set exhibit to the first review
exhibit = ...

### Pre-processing: Cleaning

In [None]:
# lower case
docClean = " ".join(x.lower() for x in exhibit.split())
docClean

In [None]:
# remove numbers
docClean = re.sub('[0-9]', '', docClean).strip()
docClean

### Pre-processing: Contractions

In [None]:
# Lower-case contraction -> expansion lookup used by expand_contractions().
contractions_dict = {'didn\'t': 'did not','don\'t': 'do not',
                     'wouldn\'t': 'would not', 'won\'t': 'will not',
                    'can\'t': 'cannot', 'i\'ve': 'i have', 'i\'m': 'i am'}
def expand_contractions(s, contractions_dict=contractions_dict):
    """Replace every contraction found in ``s`` with its expansion.

    Parameters
    ----------
    s : str
        Text to process. Matching is case-sensitive, so lower-case the text
        first when using the default (lower-case) mapping.
    contractions_dict : dict, optional
        Mapping of contraction -> expansion. Defaults to the module-level
        ``contractions_dict``.

    Returns
    -------
    str
        ``s`` with each occurrence of a key replaced by its value.
    """
    # An empty mapping would compile to '()', which matches the empty string
    # at every position; there is nothing to expand, so return early.
    if not contractions_dict:
        return s

    # Longest keys first so an overlapping shorter key cannot win the alternation;
    # re.escape keeps regex metacharacters in keys literal.
    keys = sorted(contractions_dict, key=len, reverse=True)
    contractions_re = re.compile('(%s)' % '|'.join(re.escape(k) for k in keys))

    # Look the match up in the mapping that was passed in, not the global one.
    return contractions_re.sub(lambda match: contractions_dict[match.group(0)], s)

def replace(match):
    """Return the expansion for a regex match using the module-level mapping.

    Kept for backward compatibility; expand_contractions() no longer calls it.
    """
    return contractions_dict[match.group(0)]

In [None]:
expand_contractions('won\'t')

In [None]:
docClean = expand_contractions(docClean)

In [None]:
docClean

### Pre-processing: Tokenization

In [None]:
#tokenize into sentences
#from nltk.tokenize import sent_tokenize
sentences = sent_tokenize(docClean)
sentences[0:3]

In [None]:
#tokenize into words
#from nltk.tokenize import word_tokenize
words = word_tokenize(docClean)
words[:10]

### Pre-processing: Stop Words

In [None]:
#from nltk.corpus import stopwords
stopWords = list(set(stopwords.words('english')))

#customize stop word list
stopWords.append('food')

#remove stop words from text
cleanWords = [w for w in words if not w in stopWords]
cleanWords[0:5]

### Pre-processing: Special Characters & Punctuation

In [None]:
# special characters
cleanWords = [w for w in cleanWords if re.search('^[a-zA-Z]+', w)]
cleanWords[-5:]

In [None]:
# remove punctuation
#import string
cleanWords = list(map(lambda x: x.translate(str.maketrans('', '', string.punctuation)), cleanWords))
cleanWords[-5:]

### Checkpoint: Difference between Raw and Clean Words

In [None]:
pd.DataFrame({'rawWords': words[:10], 'cleanWords': cleanWords[:10]})

### Pre-processing: Stemming & Lemmatization

In [None]:
#establish stemmer
#from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
#stem words
stemmedWords = [stemmer.stem(w) for w in cleanWords]
pd.DataFrame({'cleanWords': cleanWords[:10], 'stemmedWords': stemmedWords[:10]})

In [None]:
# convert pos_tag output to lemmatizer.lemmatize-friendly input ('NN' -> 'n', 'JJ' -> 'a')
def wnpos(e):
    """Map a tag string from pos_tag to the single-letter POS used by the lemmatizer.

    Tags starting with 'J' map to 'a'; tags starting with 'N', 'R' or 'V' map
    to that letter in lower case; anything else falls back to 'n'.
    """
    first = e[:1].lower()
    # The adjective check must come before the n/r/v check, otherwise 'J' tags
    # can never reach it and fall through to the 'n' default.
    if first == 'j':
        return 'a'
    return first if first in ('n', 'r', 'v') else 'n'
wnpos('NN')

In [None]:
#establish lemmatizer
#from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
#lemmatize words
lemWords = [lemmatizer.lemmatize(w, wnpos(pos_tag(w.split())[0][1])) for w in cleanWords]

In [None]:
pd.DataFrame({'cleanWords': cleanWords[:10], 'stemmedWords': stemmedWords[:10], 'lemWords': lemWords[:10]})

In [None]:
#function to apply all elements of text cleaning
def clean_text(document):
    """Apply the full cleaning pipeline to one document and return a string.

    Steps, in order: cast to str, replace newlines / drop carriage returns,
    lower-case, expand contractions, word-tokenize, drop stop words, strip
    non-alphanumeric characters from each token, and join the surviving
    tokens with single spaces.

    Relies on the notebook-level ``expand_contractions``, ``word_tokenize``
    and ``stopWords``.
    """
    document = str(document)
    docClean = document.replace('\n', ' ').replace('\r', '')  ## Newline removal
    docClean = " ".join(x.lower() for x in docClean.split()) ## Lowercase
    docClean = expand_contractions(docClean) # expand contractions
    tokens = word_tokenize(docClean) #Tokenize
    tokens = [w for w in tokens if w not in stopWords] #Drop Stop words
    tokens = [re.sub(r'\W+', '', w) for w in tokens] #Remove non alphanumeric chars
    # Pure-punctuation tokens are now empty strings; dropping them here keeps
    # runs of spaces out of the result, so splitting on ' ' later yields no
    # empty "words".
    return ' '.join(w for w in tokens if w)

In [None]:
df['cleanText'] = df['text'].apply(clean_text)

### N-Grams

In [None]:
def ngrams(text, maxNGram):
    """Return every run of ``maxNGram`` consecutive tokens in ``text``.

    ``text`` is split on single spaces; each n-gram is returned as one
    space-joined string. A text with fewer than ``maxNGram`` tokens yields
    an empty list.
    """
    tokens = text.split(' ')
    window_count = len(tokens) - maxNGram + 1
    return [' '.join(tokens[start:start + maxNGram]) for start in range(window_count)]

def flat_list(x):
    """Flatten one level of nesting: an iterable of iterables becomes a single list."""
    flattened = []
    for sublist in x:
        flattened.extend(sublist)
    return flattened

In [None]:
# create unigrams
df['unigrams'] = df['cleanText'].apply(lambda x: ngrams(x, 1))
# create bigrams
df['bigrams'] = ...
# create trigrams
df['trigrams'] = ...

In [None]:
pd.Series(flat_list(df['unigrams'].tolist())).value_counts()[1:10]

In [None]:
pd.Series(flat_list(df['bigrams'].tolist())).value_counts()[0:10]

In [None]:
pd.Series(flat_list(df['trigrams'].tolist())).value_counts()[1:10]

## Feature Engineering

### Sentiment

In [None]:
def get_sentiment(document):
    """Calculate the sentiment (polarity) of a body of text.

    Parameters
    ----------
    document : str
        Raw text. Any non-string value (e.g. a missing review) gives ``None``.

    Returns
    -------
    float or None
        TextBlob polarity score, scaled (-1, 1); higher == more positive.
    """
    # Non-text cells cannot be scored; report them as missing instead of
    # hiding every possible error behind a bare except.
    if not isinstance(document, str):
        return None

    # Lower-case, then blank out everything except letters, digits and the
    # punctuation kept for the analyser (. ? ! ' ,).
    document = re.sub(r"[^a-z0-9.?!',]", ' ', document.lower())
    return TextBlob(document).sentiment.polarity

In [None]:
#apply function to df to calculate sentiment
df['sentiment'] =...

In [None]:
df['sentiment'].describe()

In [None]:
# visualize with a histogram (histplot replaces the deprecated distplot)
sns.histplot(df['sentiment'], kde=True, stat='density')

In [None]:
#Lowest sentiment statement
# Series.min() ignores missing scores; the builtin min() is not NaN-safe.
df.text[df.sentiment == df.sentiment.min()].iloc[0]

In [None]:
#Highest sentiment


### Subjectivity

In [None]:
def get_subjectivity(document):
    """Calculate the subjectivity (modality) of a body of text.

    Parameters
    ----------
    document : str
        Raw text. Any non-string value (e.g. a missing review) gives ``None``.

    Returns
    -------
    float or None
        TextBlob subjectivity score, scaled (0, 1); higher == more subjective.
    """
    # Non-text cells cannot be scored; report them as missing instead of
    # hiding every possible error behind a bare except.
    if not isinstance(document, str):
        return None

    # Lower-case, then blank out everything except letters, digits and the
    # punctuation kept for the analyser (. ? ! ' ,).
    document = re.sub(r"[^a-z0-9.?!',]", ' ', document.lower())
    return TextBlob(document).sentiment.subjectivity

In [None]:
#apply function to df to calculate subjectivity
df['subjectivity'] = ...

In [None]:
df['subjectivity'].describe()

In [None]:
# visualize with a histogram (histplot replaces the deprecated distplot)
sns.histplot(df['subjectivity'], kde=True, stat='density')

In [None]:
df[['text', 'sentiment', 'subjectivity']].head()

In [None]:
#Let's find the review with the lowest subjectivity
df.text[...]

In [None]:
#Now the highest subjectivity


#### Scatterplot of Sentiment v. Subjectivity

In [None]:
# visualize with a scatterplot
plt.scatter(df['sentiment'], df['subjectivity'])
plt.xlabel('Sentiment')
plt.ylabel('Subjectivity')

## Readability

In [None]:
#functions to calculate number of syllables
#cmu dictionary
d = cmudict.dict()

#function to determine the number of syllables in a word or a whole document,
#with backup function for words not found in cmu dictionary
def nsyl(word):
    """Return the total number of syllables in ``word`` as an int.

    ``word`` may be a single word or a longer text: it is lower-cased and
    split into alphabetic tokens (internal apostrophes kept), and the counts
    of all tokens are summed. For a token present in the CMU dictionary ``d``
    the first listed pronunciation is used and every phoneme ending in a
    digit is counted; any other token is counted by ``syllables``.

    Always returning an int keeps the dictionary path and the fallback path
    consistent, so the result can be used directly in arithmetic.
    """
    total = 0
    for token in re.findall(r"[^\W\d_]+(?:'[^\W\d_]+)*", word.lower()):
        pronunciations = d.get(token)
        if pronunciations:
            total += sum(1 for phoneme in pronunciations[0] if phoneme[-1].isdigit())
        else:
            #if token not found in cmudict
            total += syllables(token)
    return total

#backup function to determine syllables if word not found in cmu dictionary
def syllables(word):
    """Estimate the syllable count of ``word`` from its vowel groups.

    Each position where a vowel (a, e, i, o, u, y) follows a non-vowel, or
    opens the word, counts once. A trailing 'e' subtracts one, a trailing
    'le' adds one back, and a result of zero is bumped to one. The empty
    string yields 0.
    """
    vowels = 'aeiouy'
    word = word.lower()
    if not word:
        return 0

    # Leading vowel opens a group of its own.
    count = 1 if word[0] in vowels else 0
    # Every later vowel that follows a non-vowel opens a new group.
    for previous, current in zip(word, word[1:]):
        if current in vowels and previous not in vowels:
            count += 1

    if word.endswith('e'):
        count -= 1
    if word.endswith('le'):
        count += 1
    return count if count != 0 else 1

In [None]:
#function to calculate total number of words
def total_words(document):
    """Return the number of word tokens in ``document``.

    The text is split with ``word_tokenize``; tokens that contain no letter
    or digit (stand-alone punctuation) are not counted as words.
    """
    #from nltk.tokenize import word_tokenize
    return sum(1 for token in word_tokenize(document)
               if any(ch.isalnum() for ch in token))

#function to calculate total number of sentences
def total_sentences(document):
    """Return how many sentences ``sent_tokenize`` finds in ``document``."""
    #from nltk.tokenize import sent_tokenize
    return len(sent_tokenize(document))

In [None]:
# create column for total words
df['total_words'] = df['text'].apply(total_words)
# create column for total sentences
df['total_sentences'] = df['text'].apply(total_sentences)
# create column for total syllables
df['total_syllables'] = df['text'].apply(nsyl)

#### Flesch Reading Ease
$$\text{FRES} = 206.835 - 1.015\left(\frac{\text{total words}}{\text{total sentences}}\right) - 84.6\left(\frac{\text{total syllables}}{\text{total words}}\right)$$

In [None]:
#calculate Flesch Reading Ease across the df
def FRES(document):
    """Return the Flesch Reading Ease score of ``document``.

    Computed as 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
    using the notebook's ``total_words``, ``total_sentences`` and ``nsyl``.

    Returns ``None`` when ``document`` is not a string or when it contains no
    words or no sentences (the ratios are undefined).
    """
    if not isinstance(document, str):
        return None

    numWord = total_words(document)
    numSent = total_sentences(document)
    # Guard the two divisions explicitly instead of swallowing every error.
    if not numWord or not numSent:
        return None

    numSyll = nsyl(document)
    # Tolerate a list of per-pronunciation counts from nsyl: use the first.
    if isinstance(numSyll, list):
        numSyll = numSyll[0]

    wordsPerSentence = numWord / numSent
    syllablesPerWord = numSyll / numWord
    return 206.835 - 1.015*(wordsPerSentence) - 84.6*(syllablesPerWord)

In [None]:
# apply FRES function
df['reading_ease'] = ...

In [None]:
df['reading_ease'].describe()

In [None]:
# visualize with histogram
sns.histplot(df['reading_ease'], kde=True, stat='density')

In [None]:
#Lowest reading ease
# Series.min() ignores missing scores; the builtin min() is not NaN-safe.
df.text[df.reading_ease == df.reading_ease.min()].iloc[0]

#### Flesch-Kincaid Grade
$$\text{FKGL} = 0.39\left(\frac{\text{total words}}{\text{total sentences}}\right) + 11.8\left(\frac{\text{total syllables}}{\text{total words}}\right) - 15.59$$

In [None]:
#calculate Flesch-Kincaid grade level across the df
def FKR(document):
    """Return the Flesch-Kincaid grade level of ``document``.

    Computed as 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59
    using the notebook's ``total_words``, ``total_sentences`` and ``nsyl``.

    Returns ``None`` when ``document`` is not a string or when it contains no
    words or no sentences (the ratios are undefined).
    """
    if not isinstance(document, str):
        return None

    numWord = total_words(document)
    numSent = total_sentences(document)
    # Guard the two divisions explicitly instead of swallowing every error.
    if not numWord or not numSent:
        return None

    numSyll = nsyl(document)
    # Tolerate a list of per-pronunciation counts from nsyl: use the first.
    if isinstance(numSyll, list):
        numSyll = numSyll[0]

    wordsPerSentence = numWord / numSent
    syllablesPerWord = numSyll / numWord
    return .39*(wordsPerSentence) + 11.8*(syllablesPerWord) - 15.59

In [None]:
# apply FKR function


In [None]:
df['grade_level'].describe()

In [None]:
# visualize with histogram (histplot replaces the deprecated distplot)
sns.histplot(df['grade_level'], kde=True, stat='density')

# Exercises

Using the provided _Literature_ dataset, complete the below exercises to conduct an exploratory analysis.

## Exercise 1

In [1]:
%ls

 Volume in drive C is OSDisk
 Volume Serial Number is E878-394F

 Directory of C:\Users\jheuer\Desktop\auNLPTraining\NLP 101

07/24/2020  11:59 AM    <DIR>          .
07/24/2020  11:59 AM    <DIR>          ..
07/24/2020  12:53 AM    <DIR>          .ipynb_checkpoints
04/30/2019  02:54 PM             2,048 literature.csv
07/24/2020  11:59 AM            26,591 NLP 101.ipynb
07/23/2020  07:57 PM        10,750,489 NLP 101.pptx
04/30/2019  02:54 PM            15,386 NLP 101.py
07/24/2020  11:59 AM            26,018 NLP 101-key.ipynb
04/30/2019  02:54 PM               546 README.md
04/30/2019  02:54 PM         1,606,396 yelp.csv
               7 File(s)     12,427,474 bytes
               3 Dir(s)  248,026,525,696 bytes free


In [6]:
# Load the Literature dataset into a DataFrame.
# The file on disk is 'literature.csv' (lower case); matching the exact name
# keeps the read working on case-sensitive file systems as well.
dfLit = pd.read_csv('literature.csv')
# Show which columns the dataset provides
dfLit.columns

Index(['text'], dtype='object')

In [7]:
dfLit['text'].iloc[0]

'It was the best of times it was the worst of times it was the age of wisdom it was the age of foolishness it was the epoch of belief it was the epoch of incredulity it was the season of light it was the season of darkness it was the spring of hope it was the winter of despair.'

In [None]:
# how many words are in your corpus? (hint: print the words)


In [None]:
# generate and print the clean text


In [None]:
# print the bigrams of your corpus


## Exercise 2

Find the 10 most frequently occurring words in your corpus and plot them.

In [None]:
# get the top words in the corpus


# get just the top 10 words and their counts


# create plot of these words and their counts


## Exercise 3

Return a sentiment score from each document

In [None]:
# get sentiment scores for each document


In [None]:
# plot the sentiment distribution of the documents in the corpus
