# Text Visualization of Hansel & Gretel

In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import math

In [None]:
# Load the whole tale as one lower-cased string, with newlines turned into
# spaces. The context manager closes the file handle as soon as it is read.
with open('hansel_gretel.txt') as tale_file:
    text = tale_file.read().replace('\n', " ").lower()

## Lexical dispersion plot of the most significant words in Hansel & Gretel

### Yellowbrick library

In [None]:
#pip install yellowbrick

Library used: https://pypi.org/project/yellowbrick/

In [None]:
from yellowbrick.text import DispersionPlot, dispersion # For the dispersion plot
import collections # To order dict

In [None]:
# Characters removed from the text before it is split into words.
# Raw string: the backslash is a literal member of the set, and writing it as
# "\," in a non-raw literal is an invalid escape sequence that recent Python
# versions warn about.
punctuations = r'''!()-[]{};:'"\,<>/?@#$%^&*._~'''

# Delete every punctuation character in a single pass instead of rebuilding
# the string one character at a time.
text_clean = text.translate(str.maketrans('', '', punctuations))

In [None]:
# Tokenise the cleaned text on single spaces. The word list is wrapped in an
# outer list because the plotting library is fed a list of documents.
text_words = [text_clean.split(' ')]
#text_words

In [None]:
# Let's see most frequent words
# Not necessary

# Counter tallies every word in one pass over the list (calling list.count
# for each word rescans the whole list every time). Keys keep first-occurrence
# order, and the result is converted back to a plain dict.
frequencies_dict = dict(collections.Counter(text_words[0]))

#collections.OrderedDict(sorted(frequencies_dict.items(), key=lambda t: t[1], reverse = True))

In [None]:
# Words whose occurrences in the text will be plotted
target_words = [
    'hansel',
    'gretel',
    'witch',
    'bread',
    'forest',
]

# Build the dispersion-plot visualizer, fit it on the tokenised text and show it
visualizer = DispersionPlot(target_words, color='red')
visualizer.fit(text_words)
visualizer.show()

In [None]:
# Number of tokens in the word list (the value is displayed as the cell output)
len(text_words[0])

### Seaborn library 

In [None]:
#pip install seaborn

Library used: https://pypi.org/project/seaborn/

In [None]:
import pandas as pd
import seaborn as sns
# Configure seaborn globally: 'ticks' style and the Avenir font
# NOTE(review): Avenir must be installed on the machine — confirm the fallback is acceptable
sns.set(style = 'ticks', font = 'Avenir')

In [None]:
# For every target word, collect the 0-based word indices at which it occurs
# in the text; each inner list is the data for one row of the strip plot.
#
# The index comes from enumerate so that it advances on every word, matches
# included. A counter that is only incremented on non-matching words records
# the k-th occurrence k positions too early.
target_values = [
    [position for position, word in enumerate(text_words[0]) if word == target]
    for target in target_words
]

print(target_values)

In [None]:
#print(target_words)
#print(text_words[0])

In [None]:
# Set up the figure and the axes the strip plot is drawn on
f, ax = plt.subplots()

pal = sns.color_palette("husl", 8)

# Horizontal strip plot of the word positions, diamond markers, no jitter
sns.stripplot(
    data=target_values,
    orient='h',
    palette=pal,
    marker='d',
    size=6,
    alpha=.6,
    jitter=0,
    dodge=True,
    zorder=1,
)

# Label the rows with the target words and name both axes
ax.set_yticklabels(target_words, size=14)
ax.set_ylabel("target words", size=12)
ax.set_xlabel("nth words", size=12)

plt.title('Dispersion plot of significative words in Hansel & Gretel', size=18, va='bottom')

## Heat map of Hansel & Gretel sentence length

In [None]:
#pip install plotly

Libraries used: https://pypi.org/project/plotly/, https://pypi.org/project/plotly-express/

In [None]:
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Break the text into sentences at every full stop, then drop the final piece
# (whatever follows the last full stop)
sentences = text.split(".")
sentences.pop()

In [None]:
# List of sentences with the punctuation removed

# Characters stripped from each sentence (the full stop is not listed here;
# it was already consumed when the text was split into sentences).
# Raw string: the backslash is a literal member of the set, and "\," in a
# non-raw literal is an invalid escape sequence that recent Python versions
# warn about.
punctuations = r'''!()-[]{};:'"\,<>/?@#$%^&*_~'''

# Build the deletion table once and reuse it for every sentence, instead of
# rebuilding each sentence one character at a time.
_punctuation_table = str.maketrans('', '', punctuations)

sentences_clean = [sentence.translate(_punctuation_table) for sentence in sentences]

In [None]:
# Sanity check: sentence count, then the third sentence before and after the
# punctuation was removed
print("Number of sentences:", len(sentences_clean))
for sentence_list in (sentences, sentences_clean):
    print("Third sentence:", sentence_list[2])

In [None]:
# Length of every cleaned sentence, measured in whitespace-separated words.
# len(sentence.split()) is the word count directly; there is no need to build
# an intermediate list of word lengths or to reuse the loop variable's name
# inside a nested comprehension.
sentences_count = [len(sentence.split()) for sentence in sentences_clean]

print("Third sentence length:", sentences_count[2])

In [None]:
# Square root of the number of sentences, computed once and then reported
sqrt = np.sqrt(len(sentences_count))
print("Squared root: ", sqrt)

In [None]:
# Print how many sentence lengths were collected
print(len(sentences_count))

In [None]:
# Shallow copy of the sentence lengths, kept separate from sentences_count
sentences_count_zero = list(sentences_count)

In [None]:
# Pad the sentence lengths with NaN so that they fill a square matrix

# Number of empty cells left when the lengths are laid out on the smallest
# square grid that holds them all
n = math.ceil(sqrt)**2 - len(sentences_count)

# Rebuild the padded list from the unpadded lengths so that re-running this
# cell never pads twice, and pad with real float NaNs rather than "NaN"
# strings so the list holds numbers only.
sentences_count_zero = sentences_count + [np.nan] * n

print(len(sentences_count_zero))

In [None]:
#print(sentences_count)

In [None]:
#print(sentences_count_zero)

In [None]:
# Show the padded list of lengths, its size, and the last sentence of the text
print("Array of lengths: ", sentences_count_zero)
print("Length: ", len(sentences_count_zero))
# Index from the end: a fixed position is only the last sentence for one
# particular text and raises IndexError on a shorter one.
print("Last sentence: ", sentences[-1])

In [None]:
# Lay the padded lengths out as a square float matrix, filled row by row
side = math.ceil(sqrt)
matrix = np.array(sentences_count_zero, dtype=float).reshape(side, side)
print(matrix)

In [None]:
# Shortest, longest and average sentence length (in words)
min_value, max_value = min(sentences_count), max(sentences_count)
mean_value = np.mean(sentences_count)

# Fixed steps of 20 words, bracketed by the observed extremes
bounds = [min_value, 20, 40, 60, 80, max_value]

print("min: ", min_value, " max: ", max_value," mean: ", mean_value)

In [None]:
# Interactive heat map: one cell per sentence, coloured by its length in words.
# The colour-bar label spelling is corrected here ("Length", not "Lenght").
fig = px.imshow(matrix, labels = dict(color="Length in words"), color_continuous_scale='Oranges')
# The cell coordinates carry no meaning of their own, so hide the axis tick labels
fig.update_xaxes(showticklabels=False).update_yaxes(showticklabels=False)

# Style the heatmap
fig.update_layout(
    title_text = "Sentence length of Hansel & Gretel",
    title_x = 0.5,
    #coloraxis_reversescale = True,
    font = dict(
        family = "Avenir",
        size = 16,
        color = "#7f7f7f"
    ),
    # Colour-bar ticks are placed at the values listed in `bounds`
    coloraxis_colorbar = dict(tickvals = bounds, title_side = 'right', x = 0.8),
)

# Show only the sentence length when hovering over a cell
fig.update_traces(hovertemplate = "Length: %{z}")

fig.show()

In [None]:
# Static matplotlib rendering of the same sentence-length matrix

fig, axs = plt.subplots()
fig.suptitle('Sentence length of Hansel and Gretel')

# Draw the matrix with the reversed coolwarm colour map and hide the axes
plot = axs.imshow(matrix, cmap='coolwarm_r')
axs.axis('off')

# Vertical colour bar without ticks or outline
cb = fig.colorbar(plot, ax=axs, orientation='vertical', ticks=[], aspect=8)
#cb.set_label('Length in words', labelpad = 12)
cb.outline.set_visible(False)

# Hand-placed text labels added on the colour-bar axes
for x_pos, y_pos, caption in ((23, 9, 'short'), (25, 99, 'long')):
    cb.ax.text(x=x_pos, y=y_pos, s=caption, color='white', ha='left')

plt.show()