In [38]:
# import some things
import os
import pandas as pd
from textblob import TextBlob
pd.set_option("display.max_rows", 20)
textfolder = '../corpora/gertrude'
metapath = '../corpora/ovidio/ovidio.csv'

In [67]:
# Loop over each of the filenames

def make_dtm(text_folder,n_top_words=1000,normalize=True):
    # get stopwords
    from nltk.corpus import stopwords
    stopwords=set(stopwords.words('spanish'))
    morestops = ['aun','si','dice','así','pues','mientras','ahora']
    for w in morestops:
        stopwords.add(w)
    # make an empty results list
    all_results = []

    # make a count for all words
    from collections import Counter
    all_counts = Counter()

    # for each filename
    filenames=sorted(os.listdir(text_folder))
    for i,fn in enumerate(filenames):
        if not i%10: print('>> looping through #',i,'of',len(filenames),'files:',fn)
        # make sure is a text file
        if not fn.endswith('.txt'): continue
        
        # full path
        full_path = os.path.join(text_folder,fn)

        # open the file
        with open(full_path) as file:
            txt=file.read()

        # make a blob
        blob = TextBlob(txt.lower())

        # make a result dictionary
        text_result = {}

        # set the filename
        text_result['fn']=fn

        # loop over the word counts
        num_words = len(blob.words)

        # for each word,count pair in the blob.word_counts dictionary...
        for word,count in blob.word_counts.items():
            # is the word in the stopwords?
            if word in stopwords: continue  

            # is the word a punctuation?
            if not word[0].isalpha(): continue
            
            # set the normalized version
            if normalize:
                # get the term frequency (count divided by number of words)
                tf = count / num_words

                # set the term frequency result to the key 'word' in the text_result dictionary
                text_result[word] = tf
            else:
                # set the count as a result
                text_result[word] = count

            # add the count to the dictionary of counts for all words
            all_counts[word]+=count

        # add results
        all_results.append(text_result)
    
    # Get the most frequent words
    most_common_words_plus_counts = all_counts.most_common()[:-20:-1]
    
    # Get only the words
    word_columns = []
    for word,count in most_common_words_plus_counts:
        word_columns.append(word)
    
    # Get columns
    columns=[]
    columns.append('fn')
    columns.extend(word_columns)
    
    # Make dataframe
    df = pd.DataFrame(all_results, columns=columns).set_index('fn').fillna(0)
    
    # return dataframe
    return df

In [68]:
make_dtm(text_folder = textfolder)

>> looping through # 0 of 19 files: libro-1.txt
>> looping through # 10 of 19 files: libro-5.txt


Unnamed: 0_level_0,concurren,sociales,votó,alegraos,acicalados,albor,sistro,sonable,crepitó,fulgieron,remedan,inscribí,memorativo,distribuidos,honras,mareóticos,paretonio,isis,detrae
fn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
libro-1.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
libro-10.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
libro-11.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
libro-12.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
libro-13.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
libro-14.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
libro-15.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
libro-2.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
libro-3.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
libro-4.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
df_meta = pd.read_csv(metapath).set_index('fn')
dtm = make_dtm(textfolder,normalize=True)
dtm_meta=df_meta.merge(dtm,on='fn')
dtm_meta
#dtm_meta.sort_values("sentence",ascending=False)

>> looping through # 0 of 19 files: libro-1.txt
>> looping through # 10 of 19 files: libro-5.txt


Unnamed: 0_level_0,Book,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,concurren,sociales,...,fulgieron,remedan,inscribí,memorativo,distribuidos,honras,mareóticos,paretonio,isis,detrae
fn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [73]:
def concordance(text_folder, word,width=100,lines=1000):
    # Get the path
    for f in os.listdir(text_folder):
    # Open the file
        with open(f) as file:
            text_txt=file.read()

    # make nltk version of the text (useful for concordance)
        import nltk
        text_words = nltk.word_tokenize(text_txt)
        text_nltk = nltk.text.Text(text_words)

    # get concordance
        text_nltk.concordance(word,width=width,lines=lines)
    
    
concordance(textfolder,"isis")

FileNotFoundError: [Errno 2] No such file or directory: 'ovidio.csv'