##### Now, let's process the text to get it ready for analysis. We'll tokenize the text into individual words and remove stopwords using [NLTK](https://github.com/ian-nai/Non-English-NLP-Tutorial/blob/master/Documentation%20Resources.md#nltk), then save a new file with the cleaned text.

In [2]:
import codecs
import nltk
from nltk.corpus import stopwords

# We'll use NLTK's default French stopwords (they are all lowercase)
default_stopwords = set(nltk.corpus.stopwords.words('french'))

# Our input file
input_file = 'beauverie.txt'

# Read the whole input file. The `with` block closes the file handle as soon as
# we are done with it, and the explicit encoding keeps accented characters
# intact regardless of the platform default.
# NOTE: assumes the file is UTF-8 encoded -- change the encoding if yours is not.
with open(input_file, 'r', encoding='utf-8') as fp:
    raw_text = fp.read()

# Tokenize the text into words. word_tokenize splits into sentences first, so we
# tell it to use the French sentence model instead of the English default.
words = nltk.word_tokenize(raw_text, language='french')

# Remove single-character tokens (mostly punctuation)
words = [word for word in words if len(word) > 1]

# Remove numbers
words = [word for word in words if not word.isnumeric()]

# Lowercase all words (default_stopwords are lowercase too)
words = [word.lower() for word in words]

# Remove stopwords
words = [word for word in words if word not in default_stopwords]

# Save our new file as 'cleaned_text.txt', one token per line
with open('cleaned_text.txt', 'w', encoding='utf-8') as f:
    f.writelines("%s\n" % item for item in words)

##### We can also use regular expressions to remove the words in a custom stopword list from a string.

In [3]:
import re


def remove_custom_stopwords(text, stop_words):
    """Delete every whole-word occurrence of each stopword from ``text``.

    Args:
        text: The string to clean.
        stop_words: Iterable of words to remove. Each one is matched literally.

    Returns:
        ``text`` with the stopwords cut out. Surrounding whitespace and
        punctuation are left untouched, so gaps may remain where words were.
    """
    for w in stop_words:
        # re.escape makes characters such as '+' or '.' match themselves rather
        # than being interpreted as regex syntax; \b restricts the match to
        # whole words so 'hello' does not clip 'hellos'.
        pattern = r'\b' + re.escape(w) + r'\b'
        text = re.sub(pattern, '', text)
    return text


stop_words_lst = ['hello', 'world']
s = 'hello world, this is a test'

s = remove_custom_stopwords(s, stop_words_lst)

print(s)

 , this is a test


##### Alternatively, we can use [spaCy](https://github.com/ian-nai/Non-English-NLP-Tutorial/blob/master/Documentation%20Resources.md#spacy) to remove stopwords.

In [None]:
import spacy
from spacy.lang.fr import French
from spacy.lang.fr.stop_words import STOP_WORDS

# Load the French tokenizer
nlp = French()

# Read our chosen file into the variable "text". Every run of whitespace
# (including line breaks) is collapsed into a single space: simply deleting
# '\n' would glue the last word of one line onto the first word of the next.
# NOTE: assumes the file is UTF-8 encoded -- change the encoding if yours is not.
with open('beauverie.txt', 'r', encoding='utf-8') as file:
    text = ' '.join(file.read().split())

# Convert the text to a spaCy document
my_doc = nlp(text)

# Tokenize our text into a new list, "token_list": skip punctuation and any
# whitespace-only tokens, and make the words lowercase
token_list = [
    token.text.lower()
    for token in my_doc
    if not token.is_punct and not token.is_space
]

# Create the list of word tokens that remain after removing stopwords.
# Each word is looked up in the vocabulary to read its stopword flag.
filtered_text = [word for word in token_list if not nlp.vocab[word].is_stop]

print(filtered_text)

# Save the file as 'cleaned_text_spacy.txt', one token per line
with open('cleaned_text_spacy.txt', 'w', encoding='utf-8') as f:
    f.writelines("%s\n" % item for item in filtered_text)