In [None]:
from collections import Counter
import pandas as pd
import re
import stopwordsiso
import glob
import numpy as np

from PIL import Image
from matplotlib.colors import LinearSegmentedColormap
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer,CountVectorizer
from nltk.corpus import stopwords
from wordcloud import WordCloud
import nltk
import matplotlib.pyplot as plt

In [None]:
# Stop words for language code 'es' from the stopwordsiso package; the same
# collection is passed to the vectorizers and to the word cloud further down.
sp_stopwords = stopwordsiso.stopwords('es')

In [None]:
# Add extra tokens to the stop-word collection in place.
# NOTE(review): these look like interjections / colloquial fillers — confirm
# against the corpus that they should be excluded.
sp_stopwords.update(("pa", "na", "ah", "yeah", "uh", "yeh"))

In [None]:
# Sanity check: size of the stop-word collection after the extra tokens were added.
len(sp_stopwords)

738

## Choose corpus: "21Century" or "coronaversos"

In [None]:
# Corpus selector: used as the file stem ("<CORPUS_PATH>.csv") when the corpus is read.
# Swap which of the two lines is commented out to switch corpora.
# CORPUS_PATH = "coronaversos"
CORPUS_PATH = "21Century"

## Create TF-IDF vectorizer

In [None]:
# TF-IDF vectorizer that ignores the Spanish stop words.
# scikit-learn validates `stop_words` as "english", a list, or None; a set is
# rejected by recent releases when the estimator is fitted, so pass a list.
# Sorting keeps the list deterministic between runs.
tfIdfVectorizer = TfidfVectorizer(use_idf=True, stop_words=sorted(sp_stopwords))

## Get the top n most frequent terms

In [None]:
def get_top_n_words(corpus, n=None):
    """
    List the top n words in a vocabulary according to occurrence in a text corpus.

    Stop words are taken from the notebook-level ``sp_stopwords`` collection.

    Parameters
    ----------
    corpus : iterable of str
        One string per document (the notebook passes a DataFrame text column).
    n : int, optional
        Number of words to keep. ``None`` keeps the whole vocabulary.

    Returns
    -------
    dict
        Maps each word to its total count over the whole corpus (plain ``int``),
        ordered from most to least frequent.
    """
    # CountVectorizer validates `stop_words` as "english", a list, or None; a set
    # is rejected by recent scikit-learn releases at fit time, so pass a list.
    vec = CountVectorizer(stop_words=sorted(sp_stopwords))
    bag_of_words = vec.fit_transform(corpus)
    # Column sums of the document-term matrix, flattened to 1-D: one total per term.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(word, int(totals[idx])) for word, idx in vec.vocabulary_.items()]
    # The sort is stable, so words with equal counts keep their vocabulary order.
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return dict(words_freq[:n])

# Load corpus

In [None]:
# Read the selected corpus; the file is tab-separated despite its .csv extension.
# Later cells rely on the `text` and `lemmatized_text` columns.
df = pd.read_csv(f"{CORPUS_PATH}.csv", sep="\t")

### Set all text to lowercase

In [None]:
# Lowercase both text columns with the vectorized string accessor.
# Unlike a per-row lambda, `.str.lower()` leaves missing values as NaN instead
# of raising AttributeError, and bracket access avoids attribute-style column
# assignment.
df["text"] = df["text"].str.lower()
df["lemmatized_text"] = df["lemmatized_text"].str.lower()

# Lemmas

In [None]:
# Preview the first five rows to check the lowercased text columns.
df.head(5)

Unnamed: 0,poem,text,lemmatized_text
0,"Segarra,Iván:Idilio_Sideral.txt","como misterios ensortijados de vida,\nun eco q...","como misterio ensortijado de vida , \n uno eco..."
1,"Gil_Segura,F._Javier:Si_Te_Echo_De_Menos.txt",sabes a metal\ncuando no soy lo primero.\nme s...,saber a metal \n cuando no ser él primero . \n...
2,"Sierra,Iván_Francisco:De_Ida_Y_Vuelta.txt","ayer,\nmientras réplicas baratas nos aventábam...","ayer , \n mientras réplica barato yo aventár a..."
3,"Gregori,Diego_Alberto_De:Sin_Poder_Dormir.txt","el hombre es mito,\nreligión y tormento.\ntus ...","el hombre ser mito , \n religión y tormento . ..."
4,"Corredor_Cuervo,Héctor_José:¡Oh!_Patria_Bella_...",¡oh! patria bella de luto revestida\nbajo somb...,¡ oh ! patria bello de luto revestido \n bajo ...


#### Most frequent words

In [None]:
# The 10 most frequent lemmas (stop words excluded) mapped to their corpus-wide counts.
most_common_dict = get_top_n_words(df["lemmatized_text"], 10)

In [None]:
# Lemma names only, still ordered from most to least frequent.
most_common = list(most_common_dict)

In [None]:
# Display the selected lemmas.
most_common

['amor',
 'vida',
 'querer',
 'noche',
 'alma',
 'dejar',
 'ojo',
 'mano',
 'luz',
 'corazón']

In [None]:
# Alias for the lemmatized-text column.
# NOTE(review): not referenced by any other cell visible here — confirm it is still needed.
test_df = df["lemmatized_text"]

## Calculate occurrences per document

### Occurrences of the most frequent words

In [None]:
# Empty frame with one column per top lemma; it is populated in the next cell.
occurrences_df = pd.DataFrame(columns=most_common)

In [None]:
# Build a 0/1 presence matrix: one row per poem, one column per top lemma.
# A lemma counts as present when it appears as a whole whitespace-delimited token.
# Each lemma is escaped so regex metacharacters in a word cannot alter the match,
# and each pattern is compiled once rather than once per row. Rows are collected
# in a list and the frame is built in a single call instead of being grown row by
# row with `.loc`, which also makes the cell safe to re-run.
lemma_patterns = [
    re.compile(fr"(?<!\S){re.escape(word)}(?!\S)") for word in most_common
]
occurrence_rows = [
    [1 if pattern.search(text) else 0 for pattern in lemma_patterns]
    for text in df["lemmatized_text"]
]
occurrences_df = pd.DataFrame(occurrence_rows, index=df.index, columns=most_common)

In [None]:
# Inspect the per-poem presence matrix (1 = lemma found in the poem, 0 = not found).
occurrences_df

In [None]:
# Column sums of the 0/1 matrix: number of poems in which each lemma was found.
occurrences_df.sum()

In [None]:
# Corpus-wide counts of the same lemmas, for comparison with the per-poem totals.
most_common_dict

## Word frequency

In [None]:
# Share of poems (in percent) that contain each lemma at least once.
occ_percentage = (occurrences_df > 0.0).sum() * 100 / len(occurrences_df)

In [None]:
# Document frequency: number of poems that contain each lemma at least once.
occ_document_freq = (occurrences_df > 0.0).sum()

In [None]:
# Corpus-wide counts of the top lemmas, in the same order as `most_common`
# (both come from `most_common_dict`). This is a live dict view, not a copy.
occ_freq = most_common_dict.values()

In [None]:
# NOTE(review): repeats the per-lemma column sums already displayed in an earlier cell.
occurrences_df.sum()

In [None]:
# Empty summary table; its four columns are filled in the next cell.
stats_occ_df = pd.DataFrame(columns=["Lemma", "Frequency", "Document frequency", "Relative document frequency"])

In [None]:
# Fill the summary table column by column. Every source is derived from
# `most_common`, so all four columns share the same lemma order.
stats_occ_df["Lemma"] = occ_document_freq.index
stats_occ_df["Frequency"] = occ_freq
stats_occ_df["Document frequency"] = occ_document_freq.values
stats_occ_df["Relative document frequency"] = occ_percentage.values

In [None]:
# Display the per-lemma summary: corpus frequency, document frequency and its percentage.
stats_occ_df

## Word cloud

In [None]:
# Concatenate every lemmatized poem into one string: tokens are lowercased and
# separated by single spaces, with one trailing space after each poem.
# Built with str.join instead of repeated `+=`, which can re-copy the growing
# string on every iteration.
# NOTE(review): `all_text` is reassigned from the top-200 counts a few cells
# further down, so this value does not reach the word cloud.
all_text = "".join(
    " ".join(token.lower() for token in str(poem).split()) + " "
    for poem in df.lemmatized_text
)

In [None]:
# The 200 most frequent lemmas with their corpus-wide counts; input for the word cloud.
top200 = get_top_n_words(df["lemmatized_text"], 200)

In [None]:
# Print one line per lemma, with the lemma repeated once per corpus occurrence.
for lemma, count in top200.items():
    print(f"{lemma} " * count)

In [None]:
# Word-cloud input text: every top-200 lemma written once per occurrence,
# each occurrence on its own line.
all_text = "".join(f"{lemma}\n" * count for lemma, count in top200.items())

In [None]:
def grayscale_cmap(cmap):
    """Return a grayscale version of the given colormap.

    Parameters
    ----------
    cmap : str or matplotlib.colors.Colormap
        Colormap name (e.g. ``"copper"``) or colormap object.

    Returns
    -------
    matplotlib.colors.LinearSegmentedColormap
        New colormap named ``"<original name>_gray"`` with the same number of
        entries, where each colour's R, G and B channels are replaced by its
        perceived luminance. The alpha channel is left untouched.
    """
    # `plt.cm.get_cmap` was deprecated in Matplotlib 3.7 and later removed;
    # `plt.get_cmap` is available on both old and new releases.
    cmap = plt.get_cmap(cmap)
    # Sample every entry of the colormap as an (N, 4) RGBA array.
    colors = cmap(np.arange(cmap.N))

    # convert RGBA to perceived grayscale luminance
    # cf. http://alienryderflex.com/hsp.html
    RGB_weight = [0.299, 0.587, 0.114]
    luminance = np.sqrt(np.dot(colors[:, :3] ** 2, RGB_weight))
    # Write the same luminance into all three colour channels.
    colors[:, :3] = luminance[:, np.newaxis]

    return LinearSegmentedColormap.from_list(cmap.name + "_gray", colors, cmap.N)

In [None]:
# Load the word-cloud mask image as an array. The context manager closes the
# image file as soon as the pixel data has been copied into the array.
with Image.open('nlp2.png') as mask_image:
    mask = np.array(mask_image)

In [None]:
# Render the word cloud from the repeated-lemma text, shaped by `mask` and
# coloured with a grayscale version of the "copper" colormap.
# `random_state` seeds the layout and colour choice so that Restart & Run All
# reproduces the same figure.
# NOTE(review): per the wordcloud documentation, `width`/`height` are ignored
# when a mask is supplied — confirm and drop them if so.
final_wordcloud = WordCloud(
    width=1600, height=1000,
    mode="RGBA",
    mask=mask,
    background_color=None,
    colormap=grayscale_cmap("copper"),
    stopwords=sp_stopwords,
    collocations=False,  # count single words only, no bigrams
    min_font_size=10,
    random_state=42,
).generate(all_text)

In [None]:
# Show the word cloud on a large square canvas with the axes hidden.
fig, ax = plt.subplots(figsize=(30, 30), facecolor=None)
ax.imshow(final_wordcloud)
ax.axis("off")
plt.show()