# Libraries

In [1]:
import numpy as np
import pandas as pd
import os
import spacy

# French spaCy pipeline; used further down to tokenize and lemmatize each book
nlp_fr = spacy.load('fr_core_news_sm')
# allow texts of up to 2,000,000 characters, since a whole book is passed to the pipeline as one string
nlp_fr.max_length = 2000000

# Make frequency data

In [2]:
# Load the reference word table; the file is tab-separated.
# NOTE(review): presumably the Lexique 3.83 French lexical database, judging by the file name — confirm.
freq_list = pd.read_csv('Lexique383.tsv', sep='\t')

In [3]:
# Order the rows by the 'freqlemlivres' column, largest value first
freq_list = freq_list.sort_values('freqlemlivres', ascending=False)

In [4]:
# Preview the first rows of the sorted table
freq_list.head()

Unnamed: 0,ortho,phon,lemme,cgram,genre,nombre,freqlemfilms2,freqlemlivres,freqfilms2,freqlivres,...,orthrenv,phonrenv,orthosyll,cgramortho,deflem,defobs,old20,pld20,morphoder,nbmorph
35278,de,d°,de,PRE,,,25220.86,38928.92,25220.86,38928.92,...,ed,°d,de,"ART:def,PRE",,,1.0,,de,1
75471,la,la,la,ART:def,f,s,14946.48,23633.92,14946.48,23633.92,...,al,al,la,"ART:def,NOM,PRO:per",,,1.0,1.0,la-:def,2
52787,et,e,et,CON,,,12909.08,20879.73,12909.08,20879.73,...,te,e,et,CON,,,1.05,1.0,et,1
137701,à,a,à,PRE,,,12190.4,19209.05,12190.4,19209.05,...,à,a,à,PRE,,,1.0,1.0,à,1
76448,le,l°,le,ART:def,m,s,13652.76,18310.95,13652.76,18310.95,...,el,°l,le,"ART:def,PRO:per",,,1.0,,le-:def,2


In [5]:
# Keep only the lemma and its frequency column; the remaining columns are not needed.
# Lemmas are used because books contain many unusual word forms (e.g. the passé simple).
freq_list = freq_list.loc[:, ['lemme', 'freqlemlivres']]

In [6]:
# Renumber the rows 0..n-1 so the index follows the current row order; the old index is discarded
freq_list = freq_list.reset_index(drop=True)

In [7]:
# Keep only the first row encountered for each lemma
freq_list = freq_list.drop_duplicates('lemme', keep='first')

In [8]:
# Renumber the rows 0..n-1 again to close the gaps left by the removed duplicates
freq_list = freq_list.reset_index(drop=True)

In [9]:
# Rename the two remaining columns: 'lemme' -> 'word', 'freqlemlivres' -> 'frequency'
freq_list.columns = ['word', 'frequency']

In [10]:
# Defensive clean-up: discard every row that has a missing value in any column
freq_list = freq_list.dropna(axis=0, how='any')

In [11]:
# Final renumbering 0..n-1: the per-book lists built later are sorted by this index
# and write it out as a column, so it should be gap-free and follow the frequency order
freq_list = freq_list.reset_index(drop=True)

In [12]:
# Preview the first ten rows of the cleaned word/frequency table
freq_list.head(10)

Unnamed: 0,word,frequency
0,de,38928.92
1,la,23633.92
2,et,20879.73
3,à,19209.05
4,le,18310.95
5,il,15832.09
6,être,15085.47
7,les,14662.3
8,ne,13841.89
9,un,13550.68


In [13]:
# Save the cleaned table to disk; the index is not written, only the 'word' and 'frequency' columns
freq_list.to_csv('freq_list.csv', index=False)

# Make a dataframe for each book

Each book becomes a dataframe with one row per chapter: a column for the chapter number and a column for all the text contained in that chapter.

In [14]:
# Directory of each of the seven books, numbered 1 to 7 (every path keeps its trailing slash)
books = ['Harry Potter/' + str(volume) + '/' for volume in range(1, 8)]

In [15]:
# Build one CSV per book ('book_df.csv' inside the book's directory) with a
# 'Chapter' column (chapter number) and a 'Text' column (the chapter's full text).
for book in books:

    # One record per chapter file. Chapter files are the directory entries whose
    # name consists only of digits; everything else (including a 'book_df.csv'
    # written by an earlier run) is skipped.
    chapters = []

    for chap in os.listdir(book):

        if not chap.isdigit():
            continue

        # `with` closes the file even if reading fails part-way.
        # NOTE(review): the file is decoded with the platform default encoding —
        # confirm the chapter files match it (consider an explicit encoding=...).
        with open(book + chap) as f:
            # get rid of '\n' (and surrounding whitespace) on every line and
            # put a single space after each line
            text = ''.join(line.strip() + ' ' for line in f)

        chapters.append({'Chapter': int(chap), 'Text': text})

    # Build the frame in one go instead of growing it row by row.
    df = pd.DataFrame(chapters, columns=['Chapter', 'Text'])

    df = df.sort_values(by='Chapter', ascending=True)

    # Sort and write once per book, after all chapters were read (previously this
    # ran for every directory entry and not at all for an empty directory).
    df.to_csv(book + 'book_df.csv', index=False)

# Make frequency list for each book

In [16]:
# For every book, write 'freq_list.csv' into the book's directory: the rows of the
# global frequency list whose word occurs (as a lemma) somewhere in the book.
for book in books:

    print(book)

    df = pd.read_csv(book + 'book_df.csv')

    # collect all the text from the entire book;
    # an empty chapter is read back from the CSV as NaN, so treat it as ''
    all_text = ''.join(df['Text'].fillna(''))

    # find all the unique lemmas in the (lower-cased) book
    all_words = nlp_fr(all_text.lower())
    unique_words = {word.lemma_ for word in all_words}

    # build a frequency list of all the words in the book:
    # keep the rows of the global list whose word appears in the book, in one
    # vectorized filter (words missing from the global list are simply ignored).
    # This replaces DataFrame.append in a loop, which was removed in pandas 2.0
    # and copied the whole frame on every iteration.
    freq_list_book = freq_list[freq_list['word'].isin(unique_words)]

    # order by the index of the global frequency list
    freq_list_book = freq_list_book.sort_index()

    # the index is written as well, as the first column
    freq_list_book.to_csv(book + 'freq_list.csv', index=True)

Harry Potter/1/
Harry Potter/2/
Harry Potter/3/
Harry Potter/4/
Harry Potter/5/
Harry Potter/6/
Harry Potter/7/
