In [None]:
# HOWTO install and create a conda environment with Python 2 with the required packages for Linux in a bash terminal.
# 1. Download and install miniconda2 from
#    https://conda.io/en/latest/miniconda.html
# 2. Activate the base environment for conda
# 3. Create a new environment
#    $ conda create --name py2_zipf_music python=2
# 4. Activate the new environment
#    $ conda activate py2_zipf_music
# 5. Install the required Python packages
#    $ conda install -c anaconda numpy
#    $ conda install -c conda-forge matplotlib
#    $ conda install -c anaconda jupyter 
#    $ conda install -c anaconda nltk
# 6. Run Jupyter notebook
#    $ jupyter notebook
# 7. and open py2_zipf_gutenberg_example.ipynb
# 8. The code automatically downloads the text example from Project Gutenberg.

In [None]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 

In [None]:
# Default size (width, height), in inches, of every figure drawn below.
plt.rcParams.update({"figure.figsize": (13, 9)})

In [None]:
from collections import defaultdict
import numpy as np
import urllib
import string
import nltk as nl
import unicodedata

In [None]:
# The n-gramCA

def seq_ngram(seq, n, t):
    """Return the n-gram of ``seq`` that starts at index ``t`` as one string.

    The items ``seq[t:t + n]`` are concatenated with ``'%'`` as separator.
    When fewer than ``n`` items remain after ``t``, only those are joined.
    Every joined item must be a string.
    """
    window = seq[t:t + n]
    separator = "%"
    return separator.join(window)

def it_ngram(seq, n):
    """Yield every n-gram of ``seq``, in order of starting position.

    One string is produced per start index ``t`` in
    ``0 .. len(seq) - n`` by calling ``seq_ngram(seq, n, t)``; nothing is
    yielded when ``seq`` holds fewer than ``n`` items.

    ``n`` must be greater than 1, otherwise an ``AssertionError`` is raised.
    Because this is a generator, the check runs when iteration starts, not
    when the function is called.
    """
    assert n > 1
    # range() instead of the Python-2-only xrange() so the generator also
    # runs on Python 3.
    for t in range(len(seq) - n + 1):
        yield seq_ngram(seq, n, t)
    
def n_gramCA(_seq, n_max=20, safety=None, verbose=0):
    """Compress a token sequence by repeatedly merging its most repeated n-gram.

    The n-gram size ``n`` starts at ``n_max`` and works down to 2.  On every
    pass the non-overlapping occurrences of each n-gram of the current size
    are recorded.  If some n-gram occurs more than once, every recorded
    occurrence of the most frequent one (ties broken by the largest n-gram
    string) is replaced by a single token, namely its parts joined with
    ``'%'``, and the same ``n`` is tried again.  Otherwise ``n`` is decreased
    by one.

    Parameters
    ----------
    _seq : iterable of str
        Input tokens.  The input is copied, never modified.  Empty strings
        are used internally as placeholders, so ``""`` tokens are dropped
        whenever a merge happens.
    n_max : int
        Largest n-gram size tried.
    safety : int or None
        Maximum number of passes of the main loop (each merge and each
        decrease of ``n`` counts as one pass).  ``None`` means
        ``len(_seq) * n_max``.  When the budget runs out the sequence
        compressed so far is returned.
    verbose : int
        Progress lines are printed when greater than 0.

    Returns
    -------
    list of str
        The compressed sequence.

    Raises
    ------
    AssertionError
        If a repeated n-gram is found before any n-gram size has been found
        free of repetitions, i.e. ``n_max`` is too small for this input.
    """
    # Materialise first so that len() also works for generators/iterators.
    seq = list(_seq)
    if safety is None:
        safety = len(seq) * n_max
    large_enough = False
    n = n_max
    while n > 1 and safety > 0:
        if verbose > 0:
            # Single-argument print(...) gives the same output on Python 2 and 3.
            print('### n %s' % n)
        ngram_idxs = defaultdict(list)
        for t, ngram in enumerate(it_ngram(seq, n)):
            list_t = ngram_idxs[ngram]
            # Skip occurrences that overlap the previously recorded one.
            if list_t and t - list_t[-1] < n:
                continue
            list_t.append(t)
        repeated = [(len(list_t), ngram, list_t)
                    for ngram, list_t in ngram_idxs.items()
                    if len(list_t) > 1]
        if not repeated:
            if verbose > 0:
                print('### large_enough n %s' % n)
            large_enough = True
            n -= 1
        else:
            if not large_enough:
                # Explicit raise (same exception type and message as the
                # former assert) so the guard is not stripped by ``python -O``.
                raise AssertionError("ERROR : n_max is too small...")
            # n-gram keys are unique, so max() selects the same entry as a
            # full descending sort followed by taking the first element.
            dummy, ngram, list_t = max(repeated)
            # Start indices were appended in increasing order and do not
            # overlap; each replacement keeps the list length unchanged, so
            # the remaining indices stay valid.
            for t in list_t:
                seq[t:t + n] = [ngram] + [""] * (n - 1)
            seq = [s for s in seq if s != ""]
        safety -= 1
    return seq

## Download the .txt data

In [None]:
# Download the Metamorphosis .txt file.
# urlopen lives in urllib.request on Python 3 and in urllib on Python 2.
try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen
from contextlib import closing

url = "http://www.gutenberg.org/cache/epub/5200/pg5200.txt"
# closing() releases the connection once the body has been read; it is used
# because the Python 2 response object is not a context manager itself.
with closing(urlopen(url)) as response:
    raw = response.read().decode('utf8')
raw[:100]

## Curate the text and convert it into a sequence of lower-case words without punctuation

In [None]:
# Cut the non-original parts of the text by keeping a fixed character window
# of the raw download.
# NOTE(review): the offsets below are hard-coded, presumably for one specific
# revision of the downloaded file; check that the printed beginning and end
# still bracket the story.
CUT_START = 871
CUT_LENGTH = 121115
cut = raw[CUT_START:CUT_START + CUT_LENGTH]
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print('BEGINNING...')
print(cut[:20])
print('END...')
print(cut[-20:])

In [None]:
# Convert the Unicode text to plain ASCII: NFKD normalisation decomposes
# characters into their compatibility form, and encoding with 'ignore' then
# drops every character that has no ASCII representation.
# The trailing decode() turns the encoded bytes back into text, so that on
# Python 3 the following cells receive a string rather than a bytes object.
cut_ascii = unicodedata.normalize('NFKD', cut).encode('ascii', 'ignore').decode('ascii')
cut_ascii[:30]

In [None]:
# Tokenize the ASCII text with NLTK's word tokenizer.
tokens = nl.tokenize.word_tokenize(cut_ascii)
tokens[:10]

In [None]:
# Wrap the token list in an NLTK Text object.
words = nl.text.Text(tokens)
words[:10]

In [None]:
# Lower-case every token; from here on ``words`` is a plain list.
words = list(map(lambda token: token.lower(), words))
words[:10]

In [None]:
# Strip leading and trailing punctuation from every token and drop the tokens
# that end up empty.  Punctuation inside a token (e.g. the apostrophe of a
# contraction) is left untouched.
word_sequence = [
    stripped
    for stripped in (str(token).strip(string.punctuation) for token in words)
    if stripped
]
word_sequence[:10]

In [None]:
# Run the n-gram compression on the cleaned word sequence.
compressed_word_sequence = n_gramCA(
    _seq=word_sequence,
    n_max=20,
    safety=None,
    verbose=0,
)
compressed_word_sequence[:20]

In [None]:
# Longer preview: the first 200 tokens of the compressed sequence
# (merged n-grams show up as single tokens whose parts are joined by '%').
compressed_word_sequence[:200]

In [None]:
# Compute the refragmented word sequence: every '%' joint created by the
# n-gram compression is turned back into a word boundary with probability eta.
eta = 0.4
# A dedicated, seeded generator makes the random splits reproducible on every
# run of this cell without touching NumPy's global random state.
REFRAGMENT_SEED = 0
rng = np.random.RandomState(REFRAGMENT_SEED)
tmptext = ' '.join(compressed_word_sequence)
new_tmptext = []
for s in tmptext:
    # A random number is drawn only for '%' characters.
    if s == '%' and rng.random_sample() < eta:
        new_tmptext.append(' ')
    else:
        new_tmptext.append(s)
refragmented_word_sequence = ''.join(new_tmptext).split()
refragmented_word_sequence[:10]

## Plot the rank-frequency distributions of the original, compressed and refragmented word sequences

In [None]:
def rank_frequency(sequence):
    """Return ``(ranks, frequencies)`` for the distinct tokens of ``sequence``.

    ``frequencies`` holds the number of occurrences of every distinct token
    as a float, sorted in decreasing order; ``ranks`` is the matching list
    ``1, 2, ..., len(frequencies)``.
    """
    counts = defaultdict(float)
    for token in sequence:
        counts[token] += 1.0
    frequencies = sorted(counts.values(), reverse=True)
    ranks = list(range(1, len(frequencies) + 1))
    return ranks, frequencies

# Rank-frequency distribution of the original word sequence.
word_ranks, word_frequencies = rank_frequency(word_sequence)

# Rank-frequency distribution of the compressed word sequence.
compressed_word_ranks, compressed_word_frequencies = rank_frequency(compressed_word_sequence)

# Rank-frequency distribution of the refragmented word sequence.
refragmented_word_ranks, refragmented_word_frequencies = rank_frequency(refragmented_word_sequence)

# Plot curves in log-log
plt.title("Metamorphosis")
plt.xlabel("r")
plt.ylabel("f")
plt.loglog()

# Reference curve f = 3500 / r, drawn for ranks 10 to 1999.
# The range is turned into a list so it plots the same on Python 2 and 3.
zipf_law_ranks = list(range(10, 2000))
zipf_law_frequencies = [3500.0 / r for r in zipf_law_ranks]

plt.plot(word_ranks, word_frequencies, label="words")
plt.plot(compressed_word_ranks, compressed_word_frequencies, label="compressed words")
plt.plot(refragmented_word_ranks, refragmented_word_frequencies, label="refragmented words")
plt.plot(zipf_law_ranks, zipf_law_frequencies, label="Zipf's law")

plt.legend()
plt.show()