Import all necessary documentation. Choose text for use in THEME system. Text must be in .txt format, UTF-8 encoding. Ensure .txt file is located in locatable directory and that notebook is also in this directory. The user must also download TreeTagger and all associated French parameter files. See here for directions: https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/. TreeTagger was chosen for part of speech tagging because it reliably tags French POS and also provides a lemma for each tagged word. 

In [1]:
import os, re #to determine directories and use regex
os.getcwd()
os.chdir("/Users/kaylinland/Documents/RAshipSinclair/Fortier/TreeTagger")#navigate to user directory with TreeTagger files
import nltk
import sys
!sh install-tagger.sh #install TreeTagger
from treetagger import TreeTagger #call TreeTagger
from frenetic import * #Import API for using WOLF French wordnet in Python



mkdir: cmd: File exists
mkdir: lib: File exists
mkdir: bin: File exists
mkdir: doc: File exists

TreeTagger version for Mac OS-X installed.
Tagging scripts installed.
French parameter file installed.
Path variables modified in tagging scripts.

You might want to add /Users/kaylinland/Documents/RAshipSinclair/Fortier/TreeTagger/cmd and /Users/kaylinland/Documents/RAshipSinclair/Fortier/TreeTagger/bin to the PATH variable so that you do not need to specify the full path to run the tagging scripts.



In [2]:
TreeTaggerDirectory = "/Users/kaylinland/Documents/RAshipSinclair/Fortier/TreeTagger/"#User enters directory

def fileinput(filename, keywordList): #function allows user to input .txt file as filename and search list of word keywords
    file = open(filename, "r")
    filestring = file.read()
    
    regTokenizer = nltk.RegexpTokenizer(r'''\w'|\w+|[^\w\s]''')#French tokenizer accounts for liaisons
    tokens1 = regTokenizer.tokenize(filestring.lower())#tokenize all words in .txt file and make lowercase
    tokens2 = [word for word in tokens1 if word[0].isalpha()]#removes all non-alphabetical words like numbers
    
    tt = TreeTagger(TreeTaggerDirectory, language='french')#call TreeTagger
    treetags = tt.tag(tokens2)
    punctag = re.compile("PUN")#create regex object to filter out POS tags that are not needed in THEME system
    senttag = re.compile("SENT")#regex object for sentence tagger
    numtag = re.compile("NUM")#regex object for number tag
    symtag = re.compile("SYM")#regex object for symbol
    treetokens = [x for x in treetags if not (punctag.search(x[1]))
        or senttag.search(x[1] or numtag.search(x[1]) or symtag.search(x[1]))]#remove unneeded POS tags
    
    allconcordances = []
    for kw in keywordList:
        matchset = set()#create set 
        for i, (word, pos, lemma) in enumerate(treetokens): 
            #if lemma == [word for word in keywordList] and word != [word for word in keywordList]:
            if lemma == kw and word != kw:
                matchset.add(word)#create set if lemma from text matches input keyword 
    
        print("\nword matches for ", kw, ": ", matchset)#print results
    
        textObject = nltk.Text(tokens2)#create nltk object for list of tokens
    
        print("\nRunning Primary concordance for ", kw)
        primaryConcordance = textObject.concordance(kw, lines=10)#creates Primary Concordance from text. Primary concordance is all concordances in instances where keyword appears as lemma
        secondaryConcordance = [] #secondaryConcordance creates concordances that include all inflections of inputed keyword
        for lemmamatch in matchset: 
            print("\nRunning Secondary concordances for ", lemmamatch)
            secondaryConcordance.append(textObject.concordance(lemmamatch, lines=10))#Secondary concordance
            textConcordancePositions = nltk.ConcordanceIndex(kw)#add location of keywords in text
        allconcordances.append([kw, primaryConcordance, secondaryConcordance, textConcordancePositions])
    return(allconcordances)#return all concordances

This function utilizes the Python API for WOLF. Code can be found here: https://github.com/hardik-vala/FreNetic. 

In [3]:
FreNeticDirectory = "/Users/kaylinland/Documents/RAshipSinclair/Fortier/wolf-1.0b4.xml" #Enter location with WOLF .xml file

def synonyms(keyword): #create list of synonyms using French wordnet from chosen theme keyword
       
    fwn = FreNetic(FreNeticDirectory)
    synset = fwn.synsets(keyword)#create synset object
    #synonymlist = []
    synonymlist = set()
    for synx in synset: #get list of all literals for chosen synset
        for sy in synx.literals():
            #synonymlist += (str(sy).split(' ')[0])
            #synonymlist = synonymlist + (str(sy).split(' ')[0])
            #synonymlist.append(str(sy).split(' ')[0])
            synonymlist.add(str(sy).split(' ')[0])
        
    
    return(synonymlist)

In [4]:
synonyms("nuit")#test using Gide. 

{'Buongiorno',
 'Nox',
 'noirceur',
 'nuit',
 'nuitée',
 'obscurité',
 'soir',
 'soirée',
 'sombre'}

In [5]:
TreeTaggerDirectory = "/Users/kaylinland/Documents/RAshipSinclair/Fortier/TreeTagger/"#User enters directory

def fileinput2(filename, kyw): #function allows user to input .txt file as filename and search list of word keywords
    file = open(filename, "r")
    filestring = file.read()
    
    regTokenizer = nltk.RegexpTokenizer(r'''\w'|\w+|[^\w\s]''')#French tokenizer accounts for liaisons
    tokens1 = regTokenizer.tokenize(filestring.lower())#tokenize all words in .txt file and make lowercase
    tokens2 = [word for word in tokens1 if word[0].isalpha()]#removes all non-alphabetical words like numbers
    
    tt = TreeTagger(TreeTaggerDirectory, language='french')#call TreeTagger
    treetags = tt.tag(tokens2)
    punctag = re.compile("PUN")#create regex object to filter out POS tags that are not needed in THEME system
    senttag = re.compile("SENT")#regex object for sentence tagger
    numtag = re.compile("NUM")#regex object for number tag
    symtag = re.compile("SYM")#regex object for symbol
    treetokens = [x for x in treetags if not (punctag.search(x[1]))
        or senttag.search(x[1] or numtag.search(x[1]) or symtag.search(x[1]))]#remove unneeded POS tags
    
    keywordList = synonyms(kyw)
    
    allconcordances = []
    for kw in keywordList:
        matchset = set()#create set 
        for i, (word, pos, lemma) in enumerate(treetokens): 
            #if lemma == [word for word in keywordList] and word != [word for word in keywordList]:
            if lemma == kw and word != kw:
                matchset.add(word)#create set if lemma from text matches input keyword 
    
        print("\nword matches for ", kw, ": ", matchset)#print results
    
        textObject = nltk.Text(tokens2)#create nltk object for list of tokens
    
        print("\nRunning Primary concordance for ", kw)
        primaryConcordance = textObject.concordance(kw, lines=10)#creates Primary Concordance from text. Primary concordance is all concordances in instances where keyword appears as lemma
        secondaryConcordance = [] #secondaryConcordance creates concordances that include all inflections of inputed keyword
        for lemmamatch in matchset: 
            print("\nRunning Secondary concordances for ", lemmamatch)
            secondaryConcordance.append(textObject.concordance(lemmamatch, lines=10))#Secondary concordance
            textConcordancePositions = nltk.ConcordanceIndex(kw)#add location of keywords in text
        allconcordances.append([kw, primaryConcordance, secondaryConcordance, textConcordancePositions])
    return(allconcordances)#return all concordances

In [6]:
isabelleGide = "/Users/kaylinland/Documents/RAshipSinclair/Fortier/SampleDocs/Isabelle_Gide.txt" #define sample file
keywordList = ['aimer', 'faire']#sample list of keywords

In [7]:
p1test = fileinput(isabelleGide, keywordList)#test the function


word matches for  aimer :  {'aimait', 'aimaient', 'aimerais', 'aimez', 'aime', 'aimais'}

Running Primary concordance for  aimer
Displaying 2 of 2 matches:
ent occupe par l' attente pouvais je aimer vraiment isabelle non sans doute mai
nsais je est ce la comme elle savait aimer a present je ramassais les menus obj

Running Secondary concordances for  aimait
Displaying 1 of 1 matches:
' abord mon oncle est mort qui vous aimait bien et puis dimanche apres ma tant

Running Secondary concordances for  aimaient
Displaying 1 of 1 matches:
 ni la cuisiniere ni gratien ne m' aimaient mes avances reiterees n' avaient p

Running Secondary concordances for  aimerais
Displaying 1 of 1 matches:
ais enfin cher monsieur l' abbe j' aimerais bien savoir si ce n' est pas cette

Running Secondary concordances for  aimez
Displaying 2 of 2 matches:
ens les poetes les orateurs que vous aimez ni sur la forme de gouvernement que 
 autres habits et qu est ce que vous aimez lire les grands voyages puis tournan

In [8]:
p2test = fileinput2(isabelleGide, "nuit")


word matches for  sombre :  {'sombres'}

Running Primary concordance for  sombre
Displaying 3 of 3 matches:
us encombrant le ciel decouvert une sombre masse d' arbres c' etait une avenue
ames dans le jardin il faisait trop sombre pour que je pusse rien distinguer d
artit la piece a present paraissait sombre isabelle cependant s' etait relevee

Running Secondary concordances for  sombres
Displaying 1 of 1 matches:
int aureol seule parmi les costumes sombres elle etait vetue tout en blanc d' a

word matches for  Nox :  set()

Running Primary concordance for  Nox
no matches

word matches for  obscurité :  set()

Running Primary concordance for  obscurité
no matches

word matches for  soir :  {'soirs'}

Running Primary concordance for  soir
Displaying 10 of 15 matches:
 dans les grandes occasions comme ce soir on emprunte le cheval du fermier mon
te bien aimable et je le recrirai ce soir a mon excellent ami desnos mais vous
aureol l' abbe du moins la veille au soir aurait bien pu m' avert

In [9]:
p2test[:]

[['sombre', None, [None], <ConcordanceIndex for 6 tokens (6 types)>],
 ['Nox', None, [], <ConcordanceIndex for 6 tokens (6 types)>],
 ['obscurité', None, [], <ConcordanceIndex for 6 tokens (6 types)>],
 ['soir', None, [None], <ConcordanceIndex for 4 tokens (4 types)>],
 ['soirée', None, [], <ConcordanceIndex for 4 tokens (4 types)>],
 ['nuitée', None, [], <ConcordanceIndex for 4 tokens (4 types)>],
 ['Buongiorno', None, [], <ConcordanceIndex for 4 tokens (4 types)>],
 ['nuit', None, [None], <ConcordanceIndex for 4 tokens (4 types)>],
 ['noirceur', None, [], <ConcordanceIndex for 4 tokens (4 types)>]]

In [13]:
#rousseauText = "/Users/kaylinland/Documents/RAshipSinclair/Fortier/SampleDocs/rousseau.txt"#test second file
#fileinput2(rousseauText, "bonheur")

In [26]:
lImmoraliste = "/Users/kaylinland/Documents/RAshipSinclair/Fortier/SampleDocs/L'Immoraliste.text"
fileinput2(lImmoraliste, "bonheur")


word matches for  félicité :  set()

Running Primary concordance for  félicité
no matches

word matches for  joie :  {'joies'}

Running Primary concordance for  joie
Displaying 10 of 35 matches:
ichel nous a reçus sans témoigner de joie très simple il semblait craindre tou
urent sans rires mais non sans grave joie tant la paix qu en obtint mon père f
e qui me plaisait surtout c était la joie de marceline ma fatigue cependant de
 marceline cependant qui voyait avec joie ma santé enfin revenir commençait de
autre chose que de l amusement de la joie quand j eus laissé à moktir tout le 
jeux l aliment qu il fallait pour ma joie je retournai vers marceline l exalta
on de mon esprit et de mes sens à la joie qu elle en eut je m aperçus qu elle 
chaque pas m encombrait empêchant ma joie je ne pouvais voir un théâtre grec u
 me découvrais autre et j existais ô joie en dehors d elles en tant que spécia
arais aux palimpsestes je goûtais la joie du savant qui sous les écritures plu

Running Secon

[['bonheur', None, [], <ConcordanceIndex for 7 tokens (6 types)>]]

In [25]:
synonyms ("bonheur")

{'allégresse', 'bonheur', 'félicité', 'joie', 'plaisir'}

Known issues: synonyms that include unnecessary information after # and issues with UTF-8 with some French characters

In [38]:
from frenetic import * #Import API for using WOLF French wordnet in Python
fwn = FreNetic("/Users/kaylinland/Documents/RAshipSinclair/Fortier/wolf-1.0b4.xml")
synset = fwn.synsets("nuit")#create synset object
print(synset)


[Synset(Id: eng-30-09559769-n, Literals: [Buongiorno (lrec12mllexwn(100.000)), Nox (0/1:enwikipedia;gwa2012(0.24269904402753086914)), noirceur (0/1:enwiktionary), nuit (0/1:frwiktionary), nuitée (0/2:enwiktionary,frwiktionary), obscurité (0/1:enwiktionary), soirée (0/1:enwiktionary)], Def.: Roman goddess of night; daughter of Erebus; counterpart of Greek Nyx, Usages: [], POS: n, Hypernyms: ['eng-30-09552681-n'], Instance Hypernyms: ['eng-30-09552681-n']), Synset(Id: eng-30-13983717-n, Literals: [nuit (gwa2012(0.12777375123480166907);lrec12mllexwn(2.349)), soirée (lrec12mllexwn(1.344))], Def.: darkness, Usages: ['it vanished into the night'], POS: n, Hypernyms: ['eng-30-13983515-n']), Synset(Id: eng-30-15155747-n, Literals: [nuit (62/5:fr.csbgen,fr.csen,fr.rocsbgen,fr.rocsen,fr.roen;gwa2012(0.51938766316449336280);lrec12mllexwn(2.349)), soirée (lrec12mllexwn(1.344))], Def.: the dark part of the diurnal cycle considered a time unit, Usages: ['three nights later he collapsed'], POS: n, Hy

In [39]:
synset[0]

Synset(Id: eng-30-09559769-n, Literals: [Buongiorno (lrec12mllexwn(100.000)), Nox (0/1:enwikipedia;gwa2012(0.24269904402753086914)), noirceur (0/1:enwiktionary), nuit (0/1:frwiktionary), nuitée (0/2:enwiktionary,frwiktionary), obscurité (0/1:enwiktionary), soirée (0/1:enwiktionary)], Def.: Roman goddess of night; daughter of Erebus; counterpart of Greek Nyx, Usages: [], POS: n, Hypernyms: ['eng-30-09552681-n'], Instance Hypernyms: ['eng-30-09552681-n'])

In [40]:
str(synset[0].literals()[0]).split(' ')[0]#isolate literals 

'Buongiorno'

In [43]:
for synx in synset: #get list of all literals for chosen synset
    for sy in synx.literals():
        print(str(sy).split(' ')[0])

Buongiorno
Nox
noirceur
nuit
nuitée
obscurité
soirée
nuit
soirée
nuit
soirée
nuit
obscurité
sombre
nuit
nuit
nuit
nuit
soir
soirée
