In [None]:
# Now, let's process the text to get it ready for analysis. We'll remove stopwords and save a new file with
# the cleaned text.

## LOWERCASE AND REMOVE STOPWORDS AND PUNCTUATION

In [2]:
import codecs
import nltk
from nltk.corpus import stopwords

# NLTK's default French stopwords (all lowercase)
default_stopwords = set(nltk.corpus.stopwords.words('french'))

input_file = 'beauverie.txt'

# Read the source text as UTF-8. The context manager closes the file handle
# as soon as the text has been read instead of leaving it open for the rest
# of the kernel session.
with codecs.open(input_file, 'r', 'utf-8') as fp:
    raw_text = fp.read()

# Tokenize with the French Punkt model; word_tokenize defaults to English,
# whose sentence-splitting rules do not know French abbreviations.
words = nltk.word_tokenize(raw_text, language='french')

# Remove single-character tokens (mostly punctuation)
words = [word for word in words if len(word) > 1]

# Remove numbers
words = [word for word in words if not word.isnumeric()]

# Lowercase all words (default_stopwords are lowercase too)
words = [word.lower() for word in words]

# Remove stopwords
words = [word for word in words if word not in default_stopwords]

# Save our new file as 'cleaned_text.txt', one token per line. The encoding is
# set explicitly so accented French words are written the same way on every
# platform (the input above is read as UTF-8 as well).
with open('cleaned_text.txt', 'w', encoding='utf-8') as f:
    for item in words:
        f.write("%s\n" % item)

In [None]:
# We can remove words from a custom stopwords list, as well.

In [3]:
import re

# Custom stopwords to strip from the sample sentence.
stop_words_lst = ['know', 'this']
s = "this is a test, you know"

# Delete every whole-word occurrence of each custom stopword. Only the word
# itself is removed; the whitespace around it is left in place.
for w in stop_words_lst:
    # re.escape makes the stopword match literally, so an entry containing a
    # regex metacharacter (e.g. '.', '+', '(') cannot change or break the pattern.
    pattern = r'\b' + re.escape(w) + r'\b'
    s = re.sub(pattern, '', s)

print(s)

 is a test, you 


In [None]:
# Alternatively, we can use spaCy to remove stopwords.

In [None]:
import spacy
from spacy.lang.fr import French
from spacy.lang.fr.stop_words import STOP_WORDS

# Load the French tokenizer
nlp = French()

# Read our chosen file as UTF-8 and turn each newline into a space.
# Replacing '\n' with an empty string would glue the last word of one line
# onto the first word of the next line.
with open('beauverie.txt', 'r', encoding='utf-8') as file:
    text = file.read().replace('\n', ' ')

# Convert the text to a spaCy document
my_doc = nlp(text)

# Lowercased word tokens, skipping punctuation and whitespace-only tokens
# (runs of spaces, such as those left by blank lines, become their own tokens).
token_list = [
    token.text.lower()
    for token in my_doc
    if not token.is_punct and not token.is_space
]

# Keep only the tokens whose vocabulary entry is not flagged as a stopword
filtered_text = [word for word in token_list if not nlp.vocab[word].is_stop]

print(filtered_text)

# Save the result as 'cleaned_text_spacy.txt', one token per line, in UTF-8
with open('cleaned_text_spacy.txt', 'w', encoding='utf-8') as f:
    for item in filtered_text:
        f.write("%s\n" % item)