# Text Analysis
In this module, we will use the Natural Language Toolkit Library (NLTK) to look at individual words
and sentences in a text and clean unneccessary features from the text data to prepare for sentiment analysis. Then using the textblob library, we will analyze the 
sentiment of opinioned data to give a numerical value for use in a predictive model.

In [9]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#this is sample data
from nltk.corpus import names  

from string import punctuation

#if the next cell does not work
#remove number symbol on following lines and re-run this cell
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('names')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\GBTC408004ur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\GBTC408004ur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package names to
[nltk_data]     C:\Users\GBTC408004ur\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\GBTC408004ur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\GBTC408004ur\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [16]:
#create an empty list here to hold the tokens at the end

princess = []

with open ('datasets_12dancingprincesses.txt', 'r') as f:
    for line in f:
        cline = line.strip() #get rid of newline character
        if cline == '': pass  #this will skip over lines that only had a newline and are now blank
        else:
            tknls = word_tokenize(cline)
        for token in tknls:
            princess.append(token)
        

In [17]:
princess

['THE',
 'TWELVE',
 'DANCING',
 'PRINCESSES',
 'THE',
 'TWELVE',
 'DANCING',
 'PRINCESSES',
 'There',
 'was',
 'a',
 'king',
 'who',
 'had',
 'twelve',
 'beautiful',
 'daughters',
 '.',
 'They',
 'slept',
 'in',
 'twelve',
 'beds',
 'all',
 'in',
 'one',
 'room',
 ';',
 'and',
 'when',
 'they',
 'went',
 'to',
 'bed',
 ',',
 'the',
 'doors',
 'were',
 'shut',
 'and',
 'locked',
 'up',
 ';',
 'but',
 'every',
 'morning',
 'their',
 'shoes',
 'were',
 'found',
 'to',
 'be',
 'quite',
 'worn',
 'through',
 'as',
 'if',
 'they',
 'had',
 'been',
 'danced',
 'in',
 'all',
 'night',
 ';',
 'and',
 'yet',
 'nobody',
 'could',
 'find',
 'out',
 'how',
 'it',
 'happened',
 ',',
 'or',
 'where',
 'they',
 'had',
 'been',
 '.',
 'could',
 'find',
 'out',
 'how',
 'it',
 'happened',
 ',',
 'or',
 'where',
 'they',
 'had',
 'been',
 '.',
 'Then',
 'the',
 'king',
 'made',
 'it',
 'known',
 'to',
 'all',
 'the',
 'land',
 ',',
 'that',
 'if',
 'any',
 'person',
 'could',
 'discover',
 'the',
 'secre

In [24]:
#the NLTK FreqDist gives a count for how often each part of the text occurs
fd_wct = FreqDist(princess)
fd_wct


FreqDist({'!': 2,
          '(': 1,
          ')': 1,
          ',': 107,
          '--': 2,
          '.': 50,
          ':': 9,
          ';': 35,
          '?': 3,
          'A': 1,
          'After': 1,
          'All': 1,
          'And': 6,
          'As': 3,
          'At': 1,
          'But': 3,
          'DANCING': 2,
          'He': 2,
          'However': 1,
          'I': 18,
          'In': 1,
          'It': 2,
          'Just': 1,
          'Now': 2,
          'On': 1,
          'One': 1,
          'PRINCESSES': 2,
          'So': 1,
          'THE': 2,
          'TWELVE': 2,
          'That': 1,
          'The': 4,
          'Then': 8,
          'There': 3,
          'They': 2,
          'This': 1,
          'Well': 1,
          'When': 4,
          'Where': 1,
          'With': 1,
          'You': 2,
          'a': 21,
          'able': 1,
          'about': 2,
          'adventure': 1,
          'afraid': 1,
          'after': 4,
          'afterwards': 1,
          '

In [25]:
#shows the top 10 words in the text
fd_wct.most_common(10)

[('the', 138),
 (',', 107),
 ('and', 74),
 ('.', 50),
 ('to', 47),
 (';', 35),
 ('he', 34),
 ('they', 31),
 ('’', 31),
 ('of', 29)]

In [26]:
#number of tokens in list before puntuation removal
len(princess)

1970

In [27]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [34]:
#remove the puntuation tokens from the list
for word in princess:
    if word in punctuation:
        princess.remove(word)

In [35]:
#number of tokens in list before puntuation removal
len(princess)

1763

In [36]:
#list of english stopwords
eng_stopwords = stopwords.words('english')
eng_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [37]:
rm_count = 0
new_words = []  #list to hold new words

for word in princess:
    if word not in eng_stopwords:
        new_words.append(word)
    else: rm_count += 1

In [38]:
rm_count

927

In [39]:
len(new_words)

836

Now let's see the new top 10 words in this text.

In [40]:
fd_nw = FreqDist(new_words)
fd_nw.most_common(10)

[('’', 31),
 ('‘', 22),
 ('soldier', 19),
 ('I', 18),
 ('princesses', 17),
 ('said', 16),
 ('king', 15),
 ('went', 11),
 ('twelve', 10),
 ('came', 10)]