## Explore movie description  
This notebook takes a first look at the text data and makes some plots and figures to help understand what's in it.

To execute a cell, click on it and then press `Shift-Enter`.  You can insert
another code or markdown cell above or below any cell by clicking on `Insert` in the menu bar above.

In [None]:
# first a function to read in a file given a filename

def read_the_file(fname):
    '''Read a text file and return its entire contents as one string.

    Parameters
    ----------
    fname : str
        Path to the file to read - should have a .txt extension.

    Returns
    -------
    str
        A text string containing the entire description.

    Raises
    ------
    OSError
        If the file cannot be opened (for example, it does not exist).
    '''
    # the with-block closes the file handle even if read() raises,
    # so no open handle is left behind in the kernel
    with open(fname, 'r') as f:
        return f.read()

In [None]:
# relative path to the description file (resolved against the kernel's
# current working directory)
path_including_filename = "../data/train_to_busan_description.txt"

# load the whole description into one string
text = read_the_file(path_including_filename)

In [None]:
# display the raw string; newlines show up as \n escapes in this view
text

In [None]:
# the text has \n (newlines) in it.  print() renders them as real line
# breaks instead of showing the escape sequence
print(text)

### Clean the text

In [None]:
# number of leading characters to preview after each cleaning step
# (arbitrary; later cells reuse it)
nc = 160

# lowercase the text so differently-cased occurrences of a word are
# treated as the same word when counting later on
text_lc = text.lower()
text_lc[:nc]  # show first nc characters

In [None]:
# remove punctuation
from string import punctuation

# build a translation table that deletes every character found in
# string.punctuation, then apply it to the lowercased text in one pass
text_np = text_lc.translate(str.maketrans('', '', punctuation))
text_np[:nc]

In [None]:
# replace each newline character with a single space so that words on
# adjacent lines stay separated once the text is one long line
text_nnl = text_np.replace('\n', ' ')
text_nnl[:nc]

In [None]:
# split into words on single spaces; wherever two spaces are adjacent
# this yields an empty string ('') entry, which a later cell removes
words = text_nnl.split(' ')
words

In [None]:
# a function to help summarize the output

def print_word_stats(words):
    '''Print the total and the distinct number of words in a collection.

    words: a sized collection of hashable items (e.g. a list of strings).
    Returns None; the two summary lines are written to standard output.
    '''
    total = len(words)
    # a set keeps one copy of each word, so its size is the distinct count
    distinct = len(set(words))
    print(f"The number of words in the description is {total}.")
    print(f"The number of unique words in the description is {distinct}.")

In [None]:
# word statistics before removing stopwords (baseline for comparison)
print_word_stats(words)

In [None]:
# remove stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# scikit-learn's built-in collection of English stop words
stopwords = ENGLISH_STOP_WORDS

# keep only the words that do not appear in the stop word collection
words_nsw = [word for word in words if word not in stopwords]

In [None]:
# word statistics after removing stopwords, to compare with the
# numbers printed before the removal
print_word_stats(words_nsw)

In [None]:
# inspect the word list after stopword removal
words_nsw

In [None]:
# it looks like '' occurs as a word in many places - remove it
# (compare by value with != ; `is not` tests object identity, which is not a
# reliable way to compare strings and triggers a SyntaxWarning on Python 3.8+)
words_cleaned = [word for word in words_nsw if word != '']

In [None]:
# inspect the word list with the empty strings removed
words_cleaned

In [None]:
# let's get a count of each word in the description
# a Counter does just what it sounds like.  It's a dictionary subclass with
# each distinct word as a key, and its number of occurrences as the value.

from collections import Counter

word_counts = Counter(words_cleaned)

In [None]:
# display the word -> count mapping
word_counts

In [None]:
# let's find the 20 most common, and plot them
# most_common(num) returns a list of (word, count) tuples ordered from the
# highest count down
num = 20
most_common = word_counts.most_common(num)

In [None]:
# display the (word, count) pairs selected above
most_common

### Make a bar chart of the most common words

In [None]:
# pre-processing: separate the (word, count) pairs into two parallel lists,
# one with the words (bar labels) and one with the counts (bar heights)
labels = [word for word, _ in most_common]
counts = [count for _, count in most_common]
print(labels)
print(counts)

In [None]:
# use matplotlib to make a bar chart
import matplotlib.pyplot as plt

# choose a nice matplotlib style: apply the 'ggplot' style sheet to the
# figures created from here on
plt.style.use('ggplot')

In [None]:
# change the default text size to 14 for subsequent figures
# (the default is usually too small)
plt.rcParams.update({'font.size': 14})

In [None]:
# from "A simple bar chart" in your Matplotlib.pdf cheatsheet
# (the ticker import lives in this cell so the cell brings its own dependency)
from matplotlib.ticker import MaxNLocator

N = len(labels)
fig, ax = plt.subplots(figsize=(12, 6))
width = 0.8
ticklocations = list(range(N))
ax.bar(ticklocations, counts, width, linewidth=4.0, align='center')
ax.set_xticks(ticks=ticklocations)
ax.set_xticklabels(labels, rotation=90)
ax.set_xlim(min(ticklocations)-0.6, max(ticklocations)+0.6)
# scale the y-axis from the data: the tallest bar is max(counts), which is
# unrelated to the number of bars N, so using N as the limit could clip
# tall bars or leave most of the plot empty
ax.set_ylim(0, max(counts) + 1)
# counts are whole numbers, so only place ticks at integer values
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
ax.yaxis.grid(True)
ax.set_title(f'{N} most common words')
ax.set_xlabel('word')
ax.set_ylabel('counts');

### Wordcloud visualization
A wordcloud is another nice way to visualize the frequency or importance of
words in text data. Switch to your Unix/Linux terminal and install the `wordcloud`
package for Python from the command line:
```bash
$ conda install -c conda-forge wordcloud
```

In [None]:
# join the cleaned words into one space-separated string, which is the
# input format the wordcloud utility works from
cleaned_text = ' '.join(words_cleaned)
cleaned_text[:nc]

In [None]:
from wordcloud import WordCloud

# build the cloud on a square white canvas from the cleaned text
wordcloud = WordCloud(
    background_color="white",
    width=960,
    height=960,
    margin=8,
).generate(cleaned_text)

# draw it on a square figure with the axis decorations hidden
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
ax.margins(x=0, y=0)
plt.show()