# Load Data
First, we load all the data we need into pandas dataframes.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.wsd import lesk
import nltk

In [2]:
TRAIN_ENGLISH_WIKIPEDIA = "../cwishareddataset/traindevset/" + \
                           "english/Wikipedia_Train.tsv"
df = pd.read_csv(TRAIN_ENGLISH_WIKIPEDIA, sep = "\t")
df.columns = ['id', 'sentence', "start", "end", "target", 
              "nat", "non_nat", "nat_marked", "non_nat_marked", "binary", "prob"]

In [None]:
pd.options.display.max_colwidth = 200
df_trans = df.loc[df.length < 5, ["sentence", "target", "binary"]]
df_trans.loc[df_trans.binary == 1]

In [112]:
group = df.loc[:,['target', 'binary']].groupby(['target'], as_index=False)
grouped = group.filter(lambda x : (x['binary'].mean() > 0) & (x['binary'].mean() < 1))
targets = grouped['target'].unique()
targets

array(['future', 'community', 'country', 'connection', 'Aboriginal',
       'Strike', 'Trust', 'Aboriginal land rights', 'won', 'moved',
       'campus', 'address', 'variety', 'feature', 'Quito', 'Panecillo',
       'located', 'metres', 'built', 'addition', 'visible', 'Symphony',
       'key', 'Clarinet', 'major', 'Ain', 'dry', 'include', 'popular',
       'due', 'Emirates', 'roles', 'Shakespeare', 'stage', 'role',
       'Opera', 'Academy', 'Actress', 'comic', 'theater', 'military',
       'following', 'update', 'brought', 'founded', 'Austrian', 'Jaeg',
       'Arovell', 'information', 'compounds', 'coordination', 'refers',
       'bonds', 'Doncaster', 'recognized', 'international', 'character',
       'season', 'stayed', 'returned', 'Buffy', 'series', 'Dushku',
       'Slayer', 'covered', 'album', 'century', 'residence', 'historical',
       'famous', 'Mucha', 'decorated', 'Pavilion', 'roof', 'features',
       'Carondelet', 'composed', 'government', 'seat', 'Aemilianus',
       'Aem

In [176]:
df.loc[df['target'] == 'covered',['target', 'binary', 'id','sentence', 'ctx_length']]

Unnamed: 0,target,binary,id,sentence,ctx_length
701,covered,1,3WJGKMRWVIAGMQCINAUOJA1MQ5XDCL,"La India covered the song on her album , Sobre el Fuego as her third single from the album .",3.65
4433,covered,0,34D9ZRXCYRVYV0Y20MTM8EXYR5MSAS,"The skin is covered in chromatophores , which enable the squid to change color to suit its surroundings , making it practically invisible .",4.833333


In [140]:
import requests
from lxml import html

page = requests.get('http://sentence.yourdictionary.com/decomposition')
tree = html.fromstring(page.content)
sentences = tree.xpath("//div[@class='li_content']/text()")

for sentence in sentences:
    print(sentence)

This investigator held that the 
 of the sugar molecules takes place outside the cell wall.
Witt), and by the 
 of ortho-anilido-(-toluidido- &c.)-azo compounds with dilute acids.
The digestion of fat or oil has not been adequately investigated, but its 
 in germinating seeds has been found to be due to an enzyme, which has been called lipase.
Its heat of 
 into its elements.
The theories propounded may be divided into two groups, namely, those ascribing to petroleum an inorganic origin, and those which regard it as the result of the 
 of organic matter.
Brown and Morris in 1892 advanced strong reasons for thinking that cane-sugar, Ci2H22O11, is the first carbohydrate synthesized, and that the hexoses found in the plant result from the 
 of this.
The material and the energy go together, the 
 of the one in the cell setting free the other, which is used at once in the vital processes of the cell, being in fact largely employed in constructing protoplasm or storing various products.
Depo

In [146]:
df.loc[df['sentence'].str.contains('although'),['target', 'sentence', 'binary']]

Unnamed: 0,target,sentence,binary
2336,modern version,The modern version of this honor has been conferred on non-Japanese recipients beginning in 1981 ( although several foreigners were given the honor before World War II ) ; and women were awarded t...,1
2337,modern,The modern version of this honor has been conferred on non-Japanese recipients beginning in 1981 ( although several foreigners were given the honor before World War II ) ; and women were awarded t...,0
2338,version,The modern version of this honor has been conferred on non-Japanese recipients beginning in 1981 ( although several foreigners were given the honor before World War II ) ; and women were awarded t...,1
2339,honor,The modern version of this honor has been conferred on non-Japanese recipients beginning in 1981 ( although several foreigners were given the honor before World War II ) ; and women were awarded t...,1
2340,conferred,The modern version of this honor has been conferred on non-Japanese recipients beginning in 1981 ( although several foreigners were given the honor before World War II ) ; and women were awarded t...,1
2341,non-Japanese,The modern version of this honor has been conferred on non-Japanese recipients beginning in 1981 ( although several foreigners were given the honor before World War II ) ; and women were awarded t...,0
2342,non-Japanese recipients,The modern version of this honor has been conferred on non-Japanese recipients beginning in 1981 ( although several foreigners were given the honor before World War II ) ; and women were awarded t...,1
2343,recipients,The modern version of this honor has been conferred on non-Japanese recipients beginning in 1981 ( although several foreigners were given the honor before World War II ) ; and women were awarded t...,1
2344,foreigners,The modern version of this honor has been conferred on non-Japanese recipients beginning in 1981 ( although several foreigners were given the honor before World War II ) ; and women were awarded t...,1
2345,beginning,The modern version of this honor has been conferred on non-Japanese recipients beginning in 1981 ( although several foreigners were given the honor before World War II ) ; and women were awarded t...,0


In [169]:
df['ctx_length'] = df.sentence.apply(lambda sent : np.mean([len(t) for t in sent.split()]))
df['ctx_target_length_dev'] = df.length - df.ctx_length
df.loc[:,['target', 'length', 'ctx_length', 'ctx_target_length_dev', 'binary','sentence']]
grouped = df.loc[:,['sentence', 'binary', 'ctx_length']].groupby('sentence').agg(['sum']).eval('Col4 = Col2 / Col3')
grouped

Unnamed: 0_level_0,binary,ctx_length
Unnamed: 0_level_1,sum,sum
sentence,Unnamed: 1_level_2,Unnamed: 2_level_2
", also spelled adzuki or aduki ) is an annual vine , Vigna angularis , widely grown throughout East Asia and the Himalayas for its small ( approximately 5 mm ) bean .",3,69.030303
"2 ( Two ; i \/ ˈtuː \/ ) is a number , numeral , and glyph .",2,9.555556
"A bog is a mire that accumulates peat , a deposit of dead plant material-often mosses , and in a majority of cases , sphagnum moss .",9,58.888889
"A major ( or the key of A ) is a major scale based on A , with the pitches A , B , C ♯ , D , E , F ♯ , and G ♯ .",2,13.815789
A monument to the Virgin Mary is located on top of El Panecillo and is visible from most of the city of Quito .,3,40.333333
A single launch pad can be used for launching all Angara versions except Angara A7 .,7,43.125000
A1 Grand Prix ( A1GP ) was a ` single make ' open-wheel auto racing series .,4,42.352941
"ACF Fiorentina , commonly referred to as simply Fiorentina , is a professional Italian football club from Florence , Tuscany .",5,65.619048
"Aboriginal land rights in Australia are grants of land to Indigenous Australians by the Commonwealth , state or territory governments of Australia .",6,76.695652
"About Christmas 1894 , Mucha happened to go into a print shop where there was a sudden and unexpected need for a new advertising poster for a play featuring Sarah Bernhardt , the most famous actress in Paris , at the Théâtre de la Renaissance on the Boulevard Saint-Martin .",13,121.500000


In [180]:
df.loc[(df['ctx_target_length_dev'] > 2) & (df['binary'] == 0),['id','target', 'sentence', 'binary', 'ctx_target_length_dev']]

Unnamed: 0,id,target,sentence,binary,ctx_target_length_dev
6,3XU9MCX6VODXPI3L8I02CM94TFB2R7,community,"Normally , the land will be passed down to future generations in a way that recognizes the community 's traditional connection to that country .",0,4.200000
10,3XU9MCX6VODXPI3L8I02CM94TFB2R7,connection,"Normally , the land will be passed down to future generations in a way that recognizes the community 's traditional connection to that country .",0,5.200000
12,3XU9MCX6VODXPI3L8I02CM94TFB2R7,passing,"The passing of Aboriginal land rights legislaton in Australia was preceded by a number of important Aboriginal protests , including the 1946 Aboriginal Stockmen 's Strike , the 1963 Yolngu Bark Pe...",0,2.209677
19,3XU9MCX6VODXPI3L8I02CM94TFB2R7,Australia,"The passing of Aboriginal land rights legislaton in Australia was preceded by a number of important Aboriginal protests , including the 1946 Aboriginal Stockmen 's Strike , the 1963 Yolngu Bark Pe...",0,4.209677
22,3XU9MCX6VODXPI3L8I02CM94TFB2R7,important,"The passing of Aboriginal land rights legislaton in Australia was preceded by a number of important Aboriginal protests , including the 1946 Aboriginal Stockmen 's Strike , the 1963 Yolngu Bark Pe...",0,4.209677
24,3XU9MCX6VODXPI3L8I02CM94TFB2R7,Stockmen,"The passing of Aboriginal land rights legislaton in Australia was preceded by a number of important Aboriginal protests , including the 1946 Aboriginal Stockmen 's Strike , the 1963 Yolngu Bark Pe...",0,3.209677
25,3XU9MCX6VODXPI3L8I02CM94TFB2R7,protests,"The passing of Aboriginal land rights legislaton in Australia was preceded by a number of important Aboriginal protests , including the 1946 Aboriginal Stockmen 's Strike , the 1963 Yolngu Bark Pe...",0,3.209677
26,3XU9MCX6VODXPI3L8I02CM94TFB2R7,including,"The passing of Aboriginal land rights legislaton in Australia was preceded by a number of important Aboriginal protests , including the 1946 Aboriginal Stockmen 's Strike , the 1963 Yolngu Bark Pe...",0,4.209677
27,3XU9MCX6VODXPI3L8I02CM94TFB2R7,Aboriginal,"The passing of Aboriginal land rights legislaton in Australia was preceded by a number of important Aboriginal protests , including the 1946 Aboriginal Stockmen 's Strike , the 1963 Yolngu Bark Pe...",0,5.209677
38,3XU9MCX6VODXPI3L8I02CM94TFB2R7,Aboriginal,"The passing of Aboriginal land rights legislaton in Australia was preceded by a number of important Aboriginal protests , including the 1946 Aboriginal Stockmen 's Strike , the 1963 Yolngu Bark Pe...",0,5.209677


# Aggregation
Since many labels are multi-word expression, we first of all define some aggregation functions that aggregate feature values over multiple tokens. Implementing this seperately allows to easily exchange the used aggregation function and keeps the feature computation functions clean. These feature computation functions should only compute features for a single target word.

In [3]:
from nltk.tokenize import word_tokenize

def agg_feat_num_average(target, func_feature, *args):
    return np.mean([func_feature(token, *args) for token in word_tokenize(target)])

def agg_feat_num_median(target, func_feature, *args):
    return np.median([func_feature(token, *args) for token in word_tokenize(target)])

def agg_feat_num_max(target, func_feature, *args):
    return np.max([func_feature(token, *args) for token in word_tokenize(target)])

def agg_feat_num_min(target, func_feature, *args):
    return np.min([func_feature(token, *args) for token in word_tokenize(target)])

# Orthographic features
Here we start computing simple features like the length of the target word.

In [4]:
from nltk.tokenize import word_tokenize

def ratio_cap_letters(target):
    return np.sum([1 for letter in target if target.isupper()]) / len(target)

df['length'] = df.target.apply(lambda target : agg_feat_num_average(target, len))
df['num_words'] = df.target.apply(lambda target : len(word_tokenize(target)))
#Relative position of the target word based on tokens
df['relative_position'] = df[['sentence', 'target']].apply(lambda vals : 
            (nltk.word_tokenize(vals[0]).index(vals[1].split()[0])) / len((nltk.word_tokenize(vals[0]))), axis = 1)
# Relative positions of the target word based on character counting
df['relative_position_left'] = df[['sentence', 'start']].apply(lambda vals : vals[1] / len(vals[0]), axis = 1)
df['relative_position_centered'] = df[['sentence', 'start', 'end']].apply(lambda vals : 
            ((vals[1] + vals[2]) / 2) / len(vals[0]), axis = 1)
df['relative_position_right'] = df[['sentence', 'end']].apply(lambda vals : vals[1] / len(vals[0]), axis = 1)
df['ratio_cap_letters'] = df.target.apply(lambda target : agg_feat_num_average(target, ratio_cap_letters))
df['all_caps'] = df.ratio_cap_letters == 1
df['hyphenated'] = df.target.apply(lambda target : int('-' in target))

# Linguistic Features
Here we compute linguistic word features like the number of vowels the word has.

In [6]:
from nltk.corpus import cmudict
import numpy as np
import pronouncing as pnc

d = cmudict.dict()

def num_syllables_rule_based(target):
    vowels = "aeiouy"
    numVowels = 0
    lastWasVowel = False
    for wc in target:
        foundVowel = False
        for v in vowels:
            if v == wc:
                if not lastWasVowel: numVowels+=1  
                foundVowel = lastWasVowel = True
                break
        if not foundVowel:  
            lastWasVowel = False
    if len(target) > 2 and target[-2:] == "es":
        numVowels-=1
    elif len(target) > 1 and target[-1:] == "e":
        numVowels-=1
    return numVowels

def num_syllables(target):
    if target in d:
        return np.mean([len(list(y for y in x if y[-1].isdigit())) for x in d[target.lower()]])
    else:
        return num_syllables_rule_based(target)

def num_vowels(target):
    return np.sum([target.lower().count(vowel) for vowel in 'aeiouy'])

def num_pronounciations(target):
    return len(pnc.phones_for_word(target))

In [701]:
from googletrans import Translator
translator = Translator()
targets = [ngram.strip().lower() for target in df['target'].tolist() for ngram in target.split()]
languages = ['fr', 'de']
translations = defaultdict(list)
for index, word in enumerate(targets):
    translator = Translator()
    for lang in languages:
        trans_word = translator.translate(word, dest=lang)
        translations[word].append(trans_word)
        print(str(index) + " " + word + " " + trans_word.text)

0 passed passé
0 passed bestanden
1 land terre
1 land land
2 future avenir
2 future Zukunft
3 future avenir
3 future Zukunft
4 generations générations
4 generations Generationen
5 generations générations
5 generations Generationen
6 recognizes reconnaît
6 recognizes erkennt
7 community communauté
7 community Gemeinschaft
8 traditional traditionnel
8 traditional traditionell
9 traditional traditionnel
9 traditional traditionell
10 connection connexion
10 connection Verbindung
11 to à
11 to zu
12 that cette
12 that Das
13 country pays
13 country Land
14 country pays
14 country Land
15 connection connexion
15 connection Verbindung
16 aboriginal Aborigène
16 aboriginal Ureinwohner
17 passing qui passe
17 passing Vorbeigehen
18 aboriginal Aborigène
18 aboriginal Ureinwohner
19 land terre
19 land land
20 rights droits
20 rights Rechte
21 legislaton législation
21 legislaton Gesetzgebung
22 land terre
22 land land
23 rights droits
23 rights Rechte
24 legislaton législation
24 legislaton Geset

205 sea mer
205 sea Meer
206 level niveau
206 level Niveau
207 metres metres
207 metres Meter
208 ft ft
208 ft ft
209 sea mer
209 sea Meer
210 level niveau
210 level Niveau
211 axis axe
211 axis Achse
212 nerve nerf
212 nerve Nerv
213 independence indépendance
213 independence Unabhängigkeit
214 public public
214 public Öffentlichkeit
215 space espace
215 space Raum
216 colonial colonial
216 colonial kolonial
217 square carré
217 square Platz
218 plaza carré
218 plaza Quadrat
219 grande grande
219 grande groß
220 built construit
220 built gebaut
221 addition une addition
221 addition Zusatz
222 archbishop archevêque
222 archbishop Erzbischof
223 municipal municipal
223 municipal Gemeinde
224 palace palais
224 palace Palast
225 metropolitan métropolitain
225 metropolitan Metropolitan-
226 palace palais
226 palace Palast
227 hotel un hôtel
227 hotel hotel
228 plaza carré
228 plaza Quadrat
229 grande grande
229 grande groß
230 metropolitan métropolitain
230 metropolitan Metropolitan-
231 

420 heritage patrimoine
420 heritage Erbe
421 village village
421 village Dorf
422 village village
422 village Dorf
423 literally Littéralement
423 literally buchstäblich
424 al à
424 al zu
425 ain Nommé
425 ain Ernannt
426 arabic arabe
426 arabic Arabisch
427 al-ʿayn al-'ayn
427 al-ʿayn al-'ayn
428 greenery verdure
428 greenery Grün
429 spring printemps
429 spring Frühling
430 garden jardin
430 garden Garten
431 city ville
431 city Stadt
432 due dû
432 due fällig
433 emirate émirats
433 emirate emirate
434 largest plus grand
434 largest größten
435 city ville
435 city Stadt
436 abu abu
436 abu abu
437 dhabi dhabi
437 dhabi Dhabi
438 abu abu
438 abu abu
439 united uni
439 united vereinigt
440 arab arabe
440 arab arabisch
441 emirates émirats
441 emirates Emirate
442 dhabi dhabi
442 dhabi Dhabi
443 fourth Quatrième
443 fourth vierte
444 largest plus grand
444 largest größten
445 city ville
445 city Stadt
446 united uni
446 united vereinigt
447 emirates émirats
447 emirates Emirate
448 a

625 compounds Verbindungen
626 methylcobalamin méthylcobalamine
626 methylcobalamin Methylcobalamin
627 cobalt-methyl cobalt-méthyle
627 cobalt-methyl Kobalt-Methyl
628 form forme
628 form form
629 vitamin vitamine
629 vitamin Vitamin
630 b12 B.12
630 b12 B.12
631 cobalt-methyl cobalt-méthyle
631 cobalt-methyl Kobalt-Methyl
632 bond liaison
632 bond Bindung
633 organometallic organométallique
633 organometallic organometallisch
634 bond liaison
634 bond Bindung
635 true vrai
635 true wahr
636 complex complexe
636 complex Komplex
637 biology la biologie
637 biology Biologie
638 metalorganics organométalliques
638 metalorganics Metallorganika
639 term terme
639 term Begriff
640 metalorganics organométalliques
640 metalorganics Metallorganika
641 metal-containing contenant du métal
641 metal-containing metallhaltig
642 refers se réfère
642 refers bezieht sich
643 metal-containing contenant du métal
643 metal-containing metallhaltig
644 compounds composés
644 compounds Verbindungen
645 com

820 fuego Feuer
821 single unique
821 single Single
822 century siècle
822 century Jahrhundert
823 end fin
823 end Ende
824 tsarskoye Tsarskoïe
824 tsarskoye Zarskoje
825 tsarskoye Tsarskoïe
825 tsarskoye Zarskoje
826 selo timbre
826 selo Stempel
827 popular populaire
827 popular Beliebt
828 selo timbre
828 selo Stempel
829 summer été
829 summer Sommer-
830 residence résidence
830 residence Residenz
831 place place
831 place Ort
832 summer été
832 summer Sommer-
833 residence résidence
833 residence Residenz
834 nobility la noblesse
834 nobility Adel
835 pushkin Poussin
835 pushkin Pushkin
836 part part
836 part Teil
837 town ville
837 town Stadt, Dorf
838 world monde
838 world Welt
839 heritage patrimoine
839 heritage Erbe
840 site site
840 site Seite? ˅
841 world monde
841 world Welt
842 heritage patrimoine
842 heritage Erbe
843 groups groupes
843 groups Gruppen
844 of de
844 of von
845 monuments monuments
845 monuments Monumente
846 site site
846 site Seite? ˅
847 saint saint
847 sa

1017 level Niveau
1018 palace palais
1018 palace Palast
1019 presidential présidentiel
1019 presidential Präsidenten
1020 residence résidence
1020 residence Residenz
1021 residence résidence
1021 residence Residenz
1022 luxurious luxueux
1022 luxurious luxuriös
1023 luxurious luxueux
1023 luxurious luxuriös
1024 colonial-style style colonial
1024 colonial-style Kolonialstil
1025 colonial-style style colonial
1025 colonial-style Kolonialstil
1026 colonial-style style colonial
1026 colonial-style Kolonialstil
1027 apartment appartement
1027 apartment Wohnung
1028 apartment appartement
1028 apartment Wohnung
1029 live vivre
1029 live Leben
1030 president Président
1030 president Präsident
1031 family famille
1031 family Familie
1032 rafael rafael
1032 rafael Rafael
1033 correa sangle
1033 correa Gurt
1034 rafael rafael
1034 rafael Rafael
1035 considering considérant
1035 considering in Anbetracht
1036 correa sangle
1036 correa Gurt
1037 president Président
1037 president Präsident
1038 ca

1208 blocked bloqué
1208 blocked verstopft
1209 indefinitely indéfiniment
1209 indefinitely unbegrenzt
1210 policy politique
1210 policy Politik
1211 blocked bloqué
1211 blocked verstopft
1212 indefinitely indéfiniment
1212 indefinitely unbegrenzt
1213 regardless indépendamment
1213 regardless ungeachtet
1214 regardless indépendamment
1214 regardless ungeachtet
1215 of de
1215 of von
1216 editing édition
1216 editing Bearbeitung
1217 behavior comportement
1217 behavior Verhalten
1218 reviewing révision
1218 reviewing Überprüfung
1219 also aussi
1219 also ebenfalls
1220 reviewing révision
1220 reviewing Überprüfung
1221 administrator administrateur
1221 administrator Administrator
1222 administrator administrateur
1222 administrator Administrator
1223 concludes conclut
1223 concludes schließt ab
1224 justified justifié
1224 justified gerechtfertigt
1225 block bloc
1225 block Block
1226 unblocked débloqué
1226 unblocked nicht blockiert
1227 administrator administrateur
1227 administrator

1391 disaster catastrophe
1391 disaster Katastrophe
1392 world monde
1392 world Welt
1393 street rue
1393 street Straße
1394 art art
1394 art art
1395 debut début
1395 debut Debüt
1396 movie film
1396 movie Film
1397 made fabriqué
1397 made gemacht
1398 sundance Sundance
1398 sundance Sonnentanz
1399 sundance Sundance
1399 sundance Sonnentanz
1400 film film
1400 film Film
1401 festival festival
1401 festival Festival
1402 film film
1402 film Film
1403 festival festival
1403 festival Festival
1404 occasionally parfois
1404 occasionally gelegentlich
1405 rarely rarement
1405 rarely selten
1406 readers lecteurs
1406 readers Leser
1407 edited édité
1407 edited bearbeitet
1408 intention intention
1408 intention Absicht
1409 location location
1409 location Lage
1410 intention intention
1410 intention Absicht
1411 of de
1411 of von
1412 registering enregistrement
1412 registering Registrieren
1413 registering enregistrement
1413 registering Registrieren
1414 registering enregistrement
1414 re

1592 australia Australien
1593 indigenous indigène
1593 indigenous einheimisch
1594 land terre
1594 land land
1595 commonwealth Commonwealth
1595 commonwealth Commonwealth
1596 australians Australiens
1596 australians Australier
1597 territory territoire
1597 territory Gebiet
1598 state Etat
1598 state Zustand
1599 governments Gouvernements
1599 governments Regierungen
1600 australia Australie
1600 australia Australien
1601 land terre
1601 land land
1602 rights droits
1602 rights Rechte
1603 laws lois
1603 laws Gesetze
1604 different différent
1604 different anders
1605 types types
1605 types Arten
1606 land terre
1606 land land
1607 laws lois
1607 laws Gesetze
1608 rights droits
1608 rights Rechte
1609 exist exister
1609 exist existieren
1610 grant subvention
1610 grant gewähren
1611 australia Australie
1611 australia Australien
1612 allowing en permettant
1612 allowing erlaubt
1613 grant subvention
1613 grant gewähren
1614 of de
1614 of von
1615 land terre
1615 land land
1616 indigen

1788 extensive umfangreich
1789 artists artistes
1789 artists Künstler
1790 education éducation
1790 education Bildung
1791 program programme
1791 program Programm
1792 premières premières
1792 premières zuerst
1793 birtwistle birtwistle
1793 birtwistle Birtwistle
1794 works travaux
1794 works funktioniert
1795 britten britannique
1795 britten Briten
1796 midsummer milieu de l'été
1796 midsummer Hochsommer-
1797 night nuit
1797 night Nacht-
1798 dream rêver
1798 dream Traum
1799 death décès
1799 death Tod
1800 venice Venise
1800 venice Venedig
1801 harrison harrison
1801 harrison Harrison
1802 punch coup de poing
1802 punch schlagen
1803 io Je
1803 io ich
1804 judy judy
1804 judy Judy
1805 passion passion
1805 passion Leidenschaft
1806 corridor couloir
1806 corridor Gang
1807 elephants les éléphants
1807 elephants Elefanten
1808 molars molaires
1808 molars Backenzähne
1809 weighs pèse
1809 weighs wiegt
1810 kg kg
1810 kg kg
1811 measures les mesures
1811 measures Maße
1812 lb kg
1812 l

1996 dead mort
1996 dead tot
1997 plant plante
1997 plant Pflanze
1998 mosses mousses
1998 mosses Moose
1999 majority majorité
1999 majority Mehrheit
2000 of de
2000 of von
2001 cases cas
2001 cases Fälle
2002 majority majorité
2002 majority Mehrheit
2003 sphagnum sphaigne
2003 sphagnum Sphagnum
2004 cases cas
2004 cases Fälle
2005 sphagnum sphaigne
2005 sphagnum Sphagnum
2006 moss mousse
2006 moss Moos
2007 moss mousse
2007 moss Moos
2008 creatine créatine
2008 creatine Kreatin
2009 essential essentiel
2009 essential wesentlich
2010 essential essentiel
2010 essential wesentlich
2011 nutrient nutritif
2011 nutrient Nährstoff
2012 nutrient nutritif
2012 nutrient Nährstoff
2013 l-arginine l-arginine
2013 l-arginine L-Arginin
2014 manufactured fabriqué
2014 manufactured hergestellt
2015 human Humain
2015 human Mensch
2016 body corps
2016 body Karosserie
2017 glycine glycine
2017 glycine Glycin
2018 l-methionine l-méthionine
2018 l-methionine L-Methionin
2019 expatriates expatriés
2019 exp

2178 character personnage
2178 character Charakter
2179 intermediate intermédiaire
2179 intermediate mittlere
2180 ionic ionique
2180 ionic ionisch
2181 ionic ionique
2181 ionic ionisch
2182 and et
2182 and und
2183 covalent covalent
2183 covalent kovalent
2184 covalent covalent
2184 covalent kovalent
2185 tallest le plus grand
2185 tallest höchste
2186 tenth dixième
2186 tenth Zehntel
2187 building bâtiment
2187 building Gebäude
2188 california Californie
2188 california Kalifornien
2189 tallest le plus grand
2189 tallest höchste
2190 tallest le plus grand
2190 tallest höchste
2191 united uni
2191 united vereinigt
2192 states États
2192 states Zustände
2193 tallest le plus grand
2193 tallest höchste
2194 west Ouest
2194 west West-
2195 mississippi Mississippi
2195 mississippi Mississippi
2196 river rivière
2196 river Fluss
2197 pinnacle sommet
2197 pinnacle Höhepunkt
2198 building bâtiment
2198 building Gebäude
2199 world monde
2199 world Welt
2200 pinnacle sommet
2200 pinnacle Höhepu

2377 bar Bar
2378 law loi
2378 law Recht
2379 council conseil
2379 council Rat
2380 association association
2380 association Verband
2381 president Président
2381 president Präsident
2382 law loi
2382 law Recht
2383 council conseil
2383 council Rat
2384 australia Australie
2384 australia Australien
2385 co-author coauteur
2385 co-author Mitverfasser
2386 robert robert
2386 robert robert
2387 brooking étouffer
2387 brooking bröckelt
2388 robert robert
2388 robert robert
2389 tenancy location
2389 tenancy Mietvertrag
2390 brooking étouffer
2390 brooking bröckelt
2391 ao à la
2391 ao zum
2392 tenancy location
2392 tenancy Mietvertrag
2393 law loi
2393 law Recht
2394 edition édition
2394 edition Auflage
2395 law loi
2395 law Recht
2396 practice entraine toi
2396 practice trainieren
2397 victoria victoria
2397 victoria Victoria
2398 editor éditeur
2398 editor Editor
2399 succeeded réussi
2399 succeeded gelungen
2400 alexander Alexandre
2400 alexander alexander
2401 macedon macedon
2401 mace

2575 leather cuir
2575 leather Leder
2576 tanning bronzage
2576 tanning Bräunen
2577 leather cuir
2577 leather Leder
2578 tanning bronzage
2578 tanning Bräunen
2579 ceramics céramique
2579 ceramics Keramik
2580 manufacture fabrication
2580 manufacture Herstellung
2581 glass verre
2581 glass Glas
2582 preparation préparation
2582 preparation Vorbereitung
2583 preparation préparation
2583 preparation Vorbereitung
2584 of de
2584 of von
2585 extracts des extraits
2585 extracts Auszüge
2586 extracts des extraits
2586 extracts Auszüge
2587 liquors liqueurs
2587 liquors Liköre
2588 liquors liqueurs
2588 liquors Liköre
2589 aqua aqua
2589 aqua aqua
2590 vitae vie
2590 vitae Leben
2591 preparation préparation
2591 preparation Vorbereitung
2592 aqua aqua
2592 aqua aqua
2593 vitae vie
2593 vitae Leben
2594 experiment expérience
2594 experiment Experiment
2595 water eau
2595 water Wasser
2596 life la vie
2596 life Leben
2597 fairly équitablement
2597 fairly ziemlich
2598 popular populaire
2598 po

2766 pleiades Plejaden
2767 open ouvrir
2767 open öffnen
2768 cluster grappe
2768 cluster Cluster
2769 open ouvrir
2769 open öffnen
2770 cluster grappe
2770 cluster Cluster
2771 open ouvrir
2771 open öffnen
2772 cluster grappe
2772 cluster Cluster
2773 located situé
2773 located gelegen
2774 al à
2774 al zu
2775 ain Nommé
2775 ain Ernannt
2776 the la
2776 the das
2777 emirate émirats
2777 emirate emirate
2778 of de
2778 of von
2779 abu abu
2779 abu abu
2780 dhabi dhabi
2780 dhabi Dhabi
2781 emirate émirats
2781 emirate emirate
2782 emirate émirats
2782 emirate emirate
2783 of de
2783 of von
2784 abu abu
2784 abu abu
2785 dhabi dhabi
2785 dhabi Dhabi
2786 inland intérieur
2786 inland inland
2787 abu abu
2787 abu abu
2788 dhabi dhabi
2788 dhabi Dhabi
2789 border frontière
2789 border Rand
2790 oman Oman
2790 oman Oman
2791 a une
2791 a ein
2792 higher plus haute
2792 higher höher
2793 proportion proportion
2793 proportion Anteil
2794 al à
2794 al zu
2795 ain Nommé
2795 ain Ernannt
2796 h

2964 school Schule
2965 formation formation
2965 formation Formation
2966 thought pensée
2966 thought habe gedacht
2967 based basé
2967 based basierend
2968 physics la physique
2968 physics Physik
2969 interior intérieur
2969 interior Innere
2970 currently actuellement
2970 currently zur Zeit
2971 marketed commercialisé
2971 marketed vermarktet
2972 luxembourg luxembourg
2972 luxembourg Luxemburg
2973 belgium Belgique
2973 belgium Belgien
2974 brazil Brésil
2974 brazil Brasilien
2975 hong hong
2975 hong hong
2976 kong kong
2976 kong kong
2977 italy Italie
2977 italy Italien
2978 japan Japon
2978 japan japan
2979 majority majorité
2979 majority Mehrheit
2980 macau macau
2980 macau macau
2981 morocco Maroc
2981 morocco Marokko
2982 netherlands Pays-Bas
2982 netherlands Niederlande
2983 peru Pérou
2983 peru peru
2984 portugal portugal
2984 portugal portugal
2985 spain Espagne
2985 spain Spanien
2986 taiwan Taïwan
2986 taiwan Taiwan
2987 majority majorité
2987 majority Mehrheit
2988 of de


3147 huxley huxley
3147 huxley Huxley
3148 sir Monsieur
3148 sir Herr
3149 fielding en campagne
3149 fielding Feldung
3150 andrew Andrew
3150 andrew Andrew
3151 nobel nobel
3151 nobel Nobel
3152 prize-winning primé
3152 prize-winning preisgekrönte
3153 huxley huxley
3153 huxley Huxley
3154 om sur
3154 om über
3155 frs frs
3155 frs frs
3156 november novembre
3156 november november
3157 may mai
3157 may kann
3158 nobel nobel
3158 nobel Nobel
3159 physiologist physiologiste
3159 physiologist Physiologe
3160 prize-winning primé
3160 prize-winning preisgekrönte
3161 english Anglais
3161 english Englisch
3162 biophysicist biophysicien
3162 biophysicist Biophysiker
3163 shared partagé
3163 shared geteilt
3164 hodgkin hodgkin
3164 hodgkin Hodgkin
3165 huxley huxley
3165 huxley Huxley
3166 cited cité
3166 cited zitiert
3167 prize prix
3167 prize Preis-
3168 year an
3168 year Jahr
3169 john John
3169 john John
3170 eccles eccles
3170 eccles eccles
3171 research recherche
3171 research Forschung


3347 arms bras
3347 arms Waffen
3348 london Londres
3348 london London
3349 originated originaire
3349 originated entstanden
3350 introduced introduit
3350 introduced eingeführt
3351 grapefruit-flavored aromatisé au pamplemousse
3351 grapefruit-flavored Grapefruitgeschmack
3352 japan Japon
3352 japan japan
3353 sports sports
3353 sports Sport
3354 drink boisson
3354 drink Getränk
3355 sports sports
3355 sports Sport
3356 response réponse
3356 response Antwort
3357 drink boisson
3357 drink Getränk
3358 competitor concurrent
3358 competitor Wettbewerber
3359 brand marque
3359 brand Marke
3360 pocari pocari
3360 pocari Pocari
3361 sports sports
3361 sports Sport
3362 drink boisson
3362 drink Getränk
3363 called appelé
3363 called namens
3364 pocari pocari
3364 pocari Pocari
3365 sweat sweat
3365 sweat Schweiß
3366 sweat sweat
3366 sweat Schweiß
3367 introduced introduit
3367 introduced eingeführt
3368 official officiel
3368 official offiziell
3369 spain Espagne
3369 spain Spanien
3370 por

3542 hockey hockey
3542 hockey Eishockey
3543 forward vers l'avant
3543 forward nach vorne
3544 original original
3544 original Original
3545 six six
3545 six sechs
3546 teams équipes
3546 teams teams
3547 gaye Je suis allé
3547 gaye Ging
3548 stewart stewart
3548 stewart Stewart
3549 played joué
3549 played gespielt
3550 nhl nhl
3550 nhl nhl
3551 original original
3551 original Original
3552 except sauf
3552 except außer
3553 teams équipes
3553 teams teams
3554 boston Boston
3554 boston Boston
3555 bruins bruins
3555 bruins bruins
3556 competing en compétition
3556 competing konkurrierend
3557 due dû
3557 due fällig
3558 competing en compétition
3558 competing konkurrierend
3559 physical physique
3559 physical physisch
3560 effects effets
3560 effects Auswirkungen
3561 physical physique
3561 physical physisch
3562 physical physique
3562 physical physisch
3563 effects effets
3563 effects Auswirkungen
3564 gravity la gravité
3564 gravity Schwere
3565 effects effets
3565 effects Auswirku

3737 ipod ipod
3737 ipod ipod
3738 products des produits
3738 products Produkte
3739 support soutien
3739 support Unterstützung
3740 songs Chansons
3740 songs songs
3741 mpeg-4 mpeg-4
3741 mpeg-4 mpeg-4
3742 aac aac
3742 aac aac
3743 format format
3743 format format
3744 firmware firmware
3744 firmware firmware
3745 update mettre à jour
3745 update aktualisieren
3746 ipods ipods
3746 ipods iPods
3747 update mettre à jour
3747 update aktualisieren
3748 older plus âgée
3748 older älter
3749 major Majeur
3749 major Haupt
3750 scale échelle
3750 scale Rahmen
3751 major Majeur
3751 major Haupt
3752 key clé
3752 key Schlüssel
3753 major Majeur
3753 major Haupt
3754 pitches emplacements
3754 pitches Stellplätze
3755 scale échelle
3755 scale Rahmen
3756 based basé
3756 based basierend
3757 neapolitan napolitain
3757 neapolitan neapolitanisch
3758 key clé
3758 key Schlüssel
3759 major Majeur
3759 major Haupt
3760 key clé
3760 key Schlüssel
3761 neapolitan napolitain
3761 neapolitan neapolitanis

3935 lithographed lithographié
3935 lithographed lithographiert
3936 lithographed lithographié
3936 lithographed lithographiert
3937 poster poster
3937 poster poster
3938 advertisement publicité
3938 advertisement Werbung
3939 poster poster
3939 poster poster
3940 weeks semaines
3940 weeks Wochen
3941 january janvier
3941 january Januar
3942 gismonda Gismonda
3942 gismonda gismonda
3943 play jouer
3943 play abspielen
3944 attracted attiré
3944 attracted angezogen
3945 victorien victorien
3945 victorien viktorianisch
3946 sardou sardou
3946 sardou sardou
3947 posted posté
3947 posted Gesendet
3948 city ville
3948 city Stadt
3949 attention attention
3949 attention Beachtung
3950 satisfied satisfait
3950 satisfied zufrieden
3951 bernhardt bernhardt
3951 bernhardt bernhardt
3952 success Succès
3952 success Erfolg
3953 poster poster
3953 poster poster
3954 contract Contrat
3954 contract Vertrag
3955 began a commencé
3955 began begann
3956 six-year six ans
3956 six-year sechs Jahre
3957 much

4129 year an
4129 year Jahr
4130 character personnage
4130 character Charakter
4131 fox Renard
4131 fox Fuchs
4132 tv la télé
4132 tv Fernseher
4133 series séries
4133 series Serie
4134 tru tru
4134 tru Tru
4135 calling appel
4135 calling Berufung
4136 played joué
4136 played gespielt
4137 main main
4137 main Main
4138 medical médical
4138 medical medizinisch
4139 medical médical
4139 medical medizinisch
4140 student étudiant
4140 student Schüler
4141 tru tru
4141 tru Tru
4142 davies davies
4142 davies Davies
4143 student étudiant
4143 student Schüler
4144 tru tru
4144 tru Tru
4145 davies davies
4145 davies Davies
4146 landed a atterri
4146 landed gelandet
4147 dushku chênes
4147 dushku Eichen
4148 alongside aux côtés de
4148 alongside neben
4149 role rôle
4149 role Rolle
4150 pearl perle
4150 pearl Perle
4151 robert robert
4151 robert robert
4152 de de
4152 de von
4153 niro niro
4153 niro niro
4154 leonardo Leonardo
4154 leonardo Leonardo
4155 dicaprio dicaprio
4155 dicaprio dicaprio


4536 santiago Santiago
4536 santiago Santiago
4537 de de
4537 de von
4538 cuba Cuba
4538 cuba Kuba
4539 santiago Santiago
4539 santiago Santiago
4540 cuba Cuba
4540 cuba Kuba
4541 province province
4541 province Provinz
4542 economy économie
4542 economy Wirtschaft
4543 relies repose
4543 relies verlässt sich
4544 agriculture agriculture
4544 agriculture Landwirtschaft
4545 large large
4545 large groß
4546 plantations plantations
4546 plantations Plantagen
4547 large large
4547 large groß
4548 plantations plantations
4548 plantations Plantagen
4549 cacao cacao
4549 cacao Kakao
4550 growing croissance
4550 growing wachsend
4551 bananas bananes
4551 bananas Bananen
4552 coffee café
4552 coffee Kaffee
4553 dotting pointillage
4553 dotting punktieren
4554 the la
4554 the das
4555 landscape paysage
4555 landscape Landschaft
4556 coffee café
4556 coffee Kaffee
4557 dotting pointillage
4557 dotting punktieren
4558 landscape paysage
4558 landscape Landschaft
4559 organometallic organométalliqu

4721 product Produkt
4722 keeping en gardant
4722 keeping halten
4723 system système
4723 system System
4724 ibm ibm
4724 ibm ibm
4725 stopped arrêté
4725 stopped gestoppt
4726 error Erreur
4726 error Error
4727 error Erreur
4727 error Error
4728 codes codes
4728 codes Codes
4729 dbms dbms
4729 dbms dbms
4730 codes codes
4730 codes Codes
4731 secret secret
4731 secret Geheimnis
4732 inspiration inspiration
4732 inspiration Inspiration
4733 ellison ellison
4733 ellison ellison
4734 edgar edgar
4734 edgar edgar
4735 paper papier
4735 paper Papier-
4736 written écrit
4736 written geschrieben
4737 relational relationnel
4737 relational relational
4738 codd codex
4738 codd Codices
4739 relational relationnel
4739 relational relational
4740 database base de données
4740 database Datenbank
4741 relational relationnel
4741 relational relational
4742 database base de données
4742 database Datenbank
4743 management la gestion
4743 management Management
4744 systems systèmes
4744 systems Systeme


4906 projects Projekte
4907 american américain
4907 american amerikanisch
4908 lacrosse lacrosse
4908 lacrosse Lacrosse
4909 conference conférence
4909 conference Konferenz
4910 american américain
4910 american amerikanisch
4911 lacrosse lacrosse
4911 lacrosse Lacrosse
4912 lacrosse-only lacrosse seulement
4912 lacrosse-only nur Lacrosse
4913 conference conférence
4913 conference Konferenz
4914 alc Alk
4914 alc alk
4915 ncaa ncaa
4915 ncaa ncaa
4916 division division
4916 division Aufteilung
4917 women femmes
4917 women Frau
4918 conference conférence
4918 conference Konferenz
4919 college Université
4919 college Hochschule
4920 athletic athlétique
4920 athletic sportlich
4921 members membres
4921 members Mitglieder
4922 located situé
4922 located gelegen
4923 eastern est
4923 eastern östlich
4924 half moitié
4924 half Hälfte
4925 united uni
4925 united vereinigt
4926 states États
4926 states Zustände
4927 leadership direction
4927 leadership Führung
4928 borislav borislav
4928 borisla

5107 put mettre
5107 put stellen
5108 future avenir
5108 future Zukunft
5109 in dans
5109 in in
5110 doubt doute
5110 doubt Zweifel
5111 doubt doute
5111 doubt Zweifel
5112 sites sites
5112 sites Websites
5113 rejected rejeté
5113 rejected abgelehnt
5114 shortlisting présélection
5114 shortlisting Kurzliste
5115 shortlisting présélection
5115 shortlisting Kurzliste
5116 phase phase
5116 phase phase
5117 phase phase
5117 phase phase
5118 announced annoncé
5118 announced angekündigt
5119 included inclus
5119 included inbegriffen
5120 january janvier
5120 january Januar
5121 regional régional
5121 regional regional
5122 regional régional
5122 regional regional
5123 casino casino
5123 casino Kasino
5124 casino casino
5124 casino Kasino
5125 built construit
5125 built gebaut
5126 city ville
5126 city Stadt
5127 of de
5127 of von
5128 manchester Manchester
5128 manchester Manchester
5129 stadium stade
5129 stadium Stadion
5130 east est
5130 east Osten
5131 manchester Manchester
5131 manchest

5295 schools écoles
5295 schools Schulen
5296 styles styles
5296 styles Stile
5297 properties Propriétés
5297 properties Eigenschaften
5298 registered inscrit
5298 registered Eingetragen
5299 municipal municipal
5299 municipal Gemeinde
5300 inventory inventaire
5300 inventory Inventar
5301 inventory inventaire
5301 inventory Inventar
5302 heritage patrimoine
5302 heritage Erbe
5303 heritage patrimoine
5303 heritage Erbe
5304 properties Propriétés
5304 properties Eigenschaften
5305 properties Propriétés
5305 properties Eigenschaften
5306 moved déplacé
5306 moved gerührt
5307 europe europe
5307 europe Europa
5308 signing signer
5308 signing Unterzeichnung
5309 manchester Manchester
5309 manchester Manchester
5310 manchester Manchester
5310 manchester Manchester
5311 phoenix phénix
5311 phoenix Phönix
5312 phoenix phénix
5312 phoenix Phönix
5313 summer été
5313 summer Sommer-
5314 play jouer
5314 play abspielen
5315 tony tony
5315 tony Tony
5316 hand main
5316 hand hand
5317 left la gauch

5491 screen écran
5491 screen Bildschirm
5492 actors acteurs
5492 actors Schauspieler
5493 federation fédération
5493 federation Föderation
5494 sag-aftra Guilde découragea
5494 sag-aftra Fall-AFTRA
5495 television télévision
5495 television Fernsehen
5496 radio radio
5496 radio Radio
5497 artists artistes
5497 artists Künstler
5498 recognize reconnaître
5498 recognize erkenne
5499 outstanding exceptionnel
5499 outstanding hervorragend
5500 outstanding exceptionnel
5500 outstanding hervorragend
5501 performances performances
5501 performances Aufführungen
5502 primetime prime time
5502 primetime Primetime
5503 performances performances
5503 performances Aufführungen
5504 film film
5504 film Film
5505 primetime prime time
5505 primetime Primetime
5506 television télévision
5506 television Fernsehen
5507 television télévision
5507 television Fernsehen
5508 nominations nominations
5508 nominations Nominierungen
5509 committees comités
5509 committees Ausschüsse
5510 awards prix
5510 award

5679 vigna vignoble
5679 vigna Weinberg
5680 angularis pierre angulaire
5680 angularis Eckstein
5681 widely largement
5681 widely weit
5682 grown cultivé
5682 grown gewachsen
5683 east est
5683 east Osten
5684 asia Asie
5684 asia Asien
5685 himalayas Himalaya
5685 himalayas Himalaya
5686 small petit
5686 small klein
5687 bean haricot
5687 bean Bohne
5688 mm mm
5688 mm mm
5689 well bien
5689 well Gut
5690 preserved conservé
5690 preserved konserviert
5691 preserved conservé
5691 preserved konserviert
5692 partial partiel
5692 partial teilweise
5693 partial partiel
5693 partial teilweise
5694 skeleton squelette
5694 skeleton Skelett
5695 skeleton squelette
5695 skeleton Skelett
5696 feathers plumes
5696 feathers Gefieder
5697 includes inclut
5697 includes beinhaltet
5698 long longue
5698 long lange
5699 tail queue
5699 tail Schwanz
5700 composed composé
5700 composed zusammengesetzt
5701 central central
5701 central zentral
5702 rachis rachis
5702 rachis Rücken
5703 central central
5703 

5874 true vrai
5874 true wahr
5875 lies mentir
5875 lies Lügen
5876 appeared apparu
5876 appeared erschienen
5877 simple simple
5877 simple einfach
5878 plan plan
5878 plan planen
5879 music la musique
5879 music Musik-
5880 video vidéo
5880 video Video
5881 simple simple
5881 simple einfach
5882 kid enfant
5882 kid Kind
5883 plan plan
5883 plan planen
5884 music la musique
5884 music Musik-
5885 video vidéo
5885 video Video
5886 'm 'm
5886 'm 'm
5887 just juste
5887 just gerade
5888 band B: et
5888 band band
5889 love amour
5889 love Liebe
5890 interest intérêt
5890 interest Interesse
5891 love amour
5891 love Liebe
5892 nickelback nickelback
5892 nickelback Nickelback
5893 interest intérêt
5893 interest Interesse
5894 rockstar rock star
5894 rockstar rockstar
5895 video vidéo
5895 video Video
5896 variety variété
5896 variety Vielfalt
5897 announced annoncé
5897 announced angekündigt
5898 co-star co-star
5898 co-star Co-Star
5899 august août
5899 august august
5900 dushku chênes
5900

6075 field champ
6075 field Feld
6076 solely uniquement
6076 solely einzig und allein
6077 represented représentée
6077 represented repräsentiert
6078 opposed opposé
6078 opposed entgegengesetzt
6079 nation nation
6079 nation Nation
6080 usual habituel
6080 usual gewöhnlich
6081 format format
6081 format format
6082 team équipe
6082 team team
6083 usual habituel
6083 usual gewöhnlich
6084 format format
6084 format format
6085 formula formule
6085 formula Formel
6086 formula formule
6086 formula Formel
6087 racing courses
6087 racing Rennen
6088 formula formule
6088 formula Formel
6089 racing courses
6089 racing Rennen
6090 series séries
6090 series Serie
6091 racing courses
6091 racing Rennen
6092 series séries
6092 series Serie
6093 nation-based basé sur la nation
6093 nation-based national basiert
6094 concept concept
6094 concept Konzept
6095 a1gp a1gp
6095 a1gp a1gp
6096 maktoum maktoum
6096 maktoum Maktoum
6097 founded fondé
6097 founded Gegründet
6098 sheikh cheik
6098 sheikh Sch

6271 thymine Thymin
6272 transcription transcription
6272 transcription Transkription
6273 sequence séquence
6273 sequence Sequenz
6274 dna goutte
6274 dna Gicht
6275 rna ARN
6275 rna RNA
6276 polymerase polymérase
6276 polymerase Polymerase
6277 read lis
6277 read lesen
6278 rna ARN
6278 rna RNA
6279 polymerase polymérase
6279 polymerase Polymerase
6280 complementary complémentaire
6280 complementary komplementär
6281 produces produit
6281 produces produziert
6282 antiparallel antiparallèle
6282 antiparallel antiparallel
6283 a une
6283 a ein
6284 primary primaire
6284 primary primär
6285 transcript transcription
6285 transcript Abschrift
6286 rna ARN
6286 rna RNA
6287 strand plage
6287 strand strand
6288 called appelé
6288 called namens
6289 primary primaire
6289 primary primär
6290 transcript transcription
6290 transcript Abschrift
6291 primary primaire
6291 primary primär
6292 transcript transcription
6292 transcript Abschrift
6293 alchemical alchimique
6293 alchemical alchemistisc

6460 canary Kanarienvogel
6461 west Ouest
6461 west West-
6462 canary canari
6462 canary Kanarienvogel
6463 islands îles
6463 islands Inseln
6464 azores Açores
6464 azores Azoren
6465 islands îles
6465 islands Inseln
6466 generations générations
6466 generations Generationen
6467 acherontia Achérontia
6467 acherontia Achorontia
6468 acherontia Achérontia
6468 acherontia Achorontia
6469 atropos atropos
6469 atropos Atropos
6470 atropos atropos
6470 atropos Atropos
6471 continuous continu
6471 continuous kontinuierlich
6472 year an
6472 year Jahr
6473 continuous continu
6473 continuous kontinuierlich
6474 broods couvées
6474 broods Brut
6475 broods couvées
6475 broods Brut
6476 africa Afrique
6476 africa Afrika
6477 singly séparément
6477 singly einzeln
6478 eggs des œufs
6478 eggs Eier
6479 laid posé
6479 laid gelegt
6480 solanaceae solanaceae
6480 solanaceae Solanaceae
6481 leaves feuilles
6481 leaves Blätter
6482 physalis physalis
6482 physalis Physalis
6483 potato Patate
6483 potato 

In [727]:
from similarity.ngram import NGram

def cognate_across_languages_sim(target, sim_func, agg_func):
    targ = target.strip().lower()
    translated = translations.get(targ)
    if not translated:
        return 0
    trans_texts = set([trans_word.text for trans_word in translated])
    similarities = [sim_func(targ,trans_text) 
                    for trans_text in trans_texts]
    return agg_func(similarities)

In [730]:
len(word_freq_wiki)

1901124

In [728]:
bigram_dist = NGram(2)
trigram_dist = NGram(3)
df['cal_ngram_2_dist_min'] = df.target.apply(lambda target : agg_feat_num_average(target, cognate_across_languages_sim, \
                                        lambda source, dest : bigram_dist.distance(source, dest), np.min))
df['cal_ngram_2_dist_mean'] = df.target.apply(lambda target : agg_feat_num_average(target, cognate_across_languages_sim, \
                                        lambda source, dest : bigram_dist.distance(source, dest), np.mean))
df['cal_ngram_3_dist_min'] = df.target.apply(lambda target : agg_feat_num_average(target, cognate_across_languages_sim, \
                                        lambda source, dest : trigram_dist.distance(source, dest), np.min))
df['cal_ngram_3_dist_mean'] = df.target.apply(lambda target : agg_feat_num_average(target, cognate_across_languages_sim, \
                                        lambda source, dest : trigram_dist.distance(source, dest), np.mean))

In [726]:
[translation.text for translation in translations['radio']]

['radio', 'Radio', 'radio', 'Radio', 'radio', 'Radio', 'radio', 'Radio']

In [729]:
df['num_syllables'] = df.target.apply(lambda target : agg_feat_num_average(target, num_syllables))
df['num_vowels'] = df.target.apply(lambda target : agg_feat_num_average(target, num_vowels))
df['vowel_consonant_ratio'] = df.target.apply(lambda target : agg_feat_num_average(target, \
                                        lambda target : num_vowels(target) / (len(target) - num_vowels(target))))
df['num_pronounciations'] = df.target.apply(lambda target : agg_feat_num_average(target, num_pronounciations))
df.loc[:,['target','length','cal_ngram_2_dist_min','cal_ngram_2_dist_mean', 'cal_ngram_3_dist_min', 'cal_ngram_3_dist_mean']]

Unnamed: 0,target,length,cal_ngram_2_dist_min,cal_ngram_2_dist_mean,cal_ngram_3_dist_min,cal_ngram_3_dist_mean
0,passed,6.000000,0.250000,0.513889,0.222222,0.518519
1,land,4.000000,0.000000,0.500000,0.000000,0.500000
2,future,6.000000,0.714286,0.857143,0.690476,0.845238
3,future generations,8.500000,0.448052,0.536526,0.443723,0.537834
4,generations,11.000000,0.181818,0.215909,0.196970,0.230429
5,recognizes,10.000000,0.450000,0.625000,0.400000,0.591667
6,community,9.000000,0.250000,0.541667,0.266667,0.550000
7,traditional,11.000000,0.166667,0.166667,0.138889,0.138889
8,traditional connection to that country,6.800000,0.394762,0.604048,0.422063,0.518254
9,country,7.000000,0.857143,0.928571,0.904762,0.952381


# Frequency Features


In [51]:
from nltk.stem.wordnet import *
from collections import defaultdict

wordNetLemmatizer = WordNetLemmatizer()

word_freq_wiki = {}
with open("resources/word-freq-dumps/enwiki-20150602-words-frequency.txt", encoding="utf8") as file:
    for line in file:
        word, freq = line.partition(" ")[::2]
        word_freq_wiki[word.strip()] = int(freq)

word_freq_bnc = {}
with open("resources/word-freq-dumps/bnc_freq_all.al", encoding="utf8") as file:
    for line in file:
        freq, word, pos, num_files = line.split()
        word_freq_bnc[word.strip()] = (int(freq), pos, int(num_files))

word_freq_bnc_lemma = {}
with open("resources/word-freq-dumps/bnc_lemma.al", encoding="utf8") as file:
    for line in file:
        sort_order, frequency, word, word_class = line.split()
        word_freq_bnc_lemma[word.strip()] = (int(sort_order), word_class, int(frequency))
        
word_concreteness = {}
with open("resources/word-freq-dumps/concreteness_brysbaert_et_al.txt", encoding="utf8") as file:
    for line in file:
        word, bigram, conc_m, conc_sd, \
        unknown, total, percent_known, \
        subtlex, dom_pos = line.split('\t')
        word_concreteness[word.strip()] = float(conc_m)
        
word_age_of_aquisition = {}
with open("resources/word-freq-dumps/AoA_ratings_Kuperman_et_al_BRM.csv", encoding="utf8") as file:
    for line in file:
        word, occur_total, occur_num, freq_pm, rating_Mean, rating_SD, dunno = line.split()
        word_age_of_aquisition[word.strip()] = float(rating_Mean.replace(',', '.')) if rating_Mean != 'NA' else 0

word_pknown_nobs_prev_freqZipf = {}
with open("resources/word-freq-dumps/word_prevelance.csv", encoding="utf8") as file:
    for line in file:
        word, p_known, nobs, prevelance, freqZipf = line.split(";")
        word_pknown_nobs_prev_freqZipf[word.strip()] = (float(p_known.replace(',','.')), 
                                                        float(nobs.replace(',','.')), 
                                                        float(prevelance.replace(',','.')), 
                                                        float(freqZipf.replace(',','.')))

brown_cluster_word2cluster = {}
brown_cluster_cluster2words = defaultdict(list)
with open("resources/brown-clustering/paths/rcv1.clean-c6000-p1.paths", encoding="utf8") as file:
    for line in file:
        binary_cluster, word, _ = line.split()
        brown_cluster_word2cluster[word] = binary_cluster
        brown_cluster_cluster2words[binary_cluster].append(word)
        
def get_dict_count(target, freqs):
    return freqs.get(target.strip().lower(), 0)

def perc_known(target):
    stats = word_pknown_nobs_prev_freqZipf.get(target)
    return stats[0] if stats else 0

def nobs(target):
    stats = word_pknown_nobs_prev_freqZipf.get(target)
    return stats[1] if stats else 0

def prevelance(target):
    stats = word_pknown_nobs_prev_freqZipf.get(target)
    return stats[2] if stats else 0

def freqZipf(target):
    stats = word_pknown_nobs_prev_freqZipf.get(target)
    return stats[3] if stats else 0

def brown_clustering_cluster_size(target):
    cluster = brown_cluster_word2cluster.get(target)
    return len(brown_cluster_cluster2words[cluster]) if cluster else 0

def brown_clustering_cluster_depth_simple(target):
    cluster = brown_cluster_word2cluster.get(target)
    return int(cluster, 2) if cluster else 0

def brown_clustering_cluster_depth_bit(target):
    cluster = brown_cluster_word2cluster.get(target)
    if not cluster:
        return 0
    return np.sum([1 for bit in cluster if bit == '1'])

def brown_clustering_cluster_size_all(target):
    cluster = brown_cluster_word2cluster.get(target)
    if not cluster:
        return 0
    upper_clusters = [cluster[0:(len(cluster) - index)] + '0' * index \
         for index, bit in enumerate(reversed(cluster)) if bit == '1']
    cluster_counts = [len(brown_cluster_cluster2words.get(clu, [])) \
                         for clu in upper_clusters]
    return np.sum(cluster_counts)

In [None]:
import phrasefinder as pf

google_books_n_grams = {}
options = pf.SearchOptions()
options.topk = 10
n_grams = df['target'].tolist()

for index, n_gram in enumerate(n_grams):
    try:
        print(index, n_gram)
        result = pf.search(pf.Corpus.AMERICAN_ENGLISH, n_gram, options)
        vals = [(phrase.match_count, phrase.volume_count, phrase.first_year, phrase.last_year)
                    for phrase in result.phrases]
        mean_vals = [np.sum(elem) / len(elem) for elem in zip(*vals)]
        google_books_n_grams[n_gram] = mean_vals
        if result.status != pf.Status.Ok:
            print('Request was not successful: {}'.format(result.status))
    except Exception as error:
        pass


In [67]:
wordNetLemmatizer = WordNetLemmatizer()
print(wordNetLemmatizer.lemmatize("written", 'v'))

write


In [54]:
from nltk.stem.wordnet import *
wordNetLemmatizer = WordNetLemmatizer()

df['freq_wiki'] = df.target.apply(lambda target : agg_feat_num_average(target, get_dict_count, word_freq_wiki))
df['freq_bnc'] = df.target.apply(lambda target : agg_feat_num_average(target, \
                                lambda target : word_freq_bnc.get(target)[0] if word_freq_bnc.get(target) else 0))
df['freq_bnc_lemma'] = df.target.apply(lambda target : agg_feat_num_average(target, \
                lambda target : word_freq_bnc_lemma.get(wordNetLemmatizer.lemmatize(target))[2] \
                                        if word_freq_bnc_lemma.get(wordNetLemmatizer.lemmatize(target)) else 0))
df['perc_known'] = df.target.apply(lambda target : agg_feat_num_average(target, perc_known))
df['nobs'] = df.target.apply(lambda target : agg_feat_num_average(target, nobs))
df['prevelance'] = df.target.apply(lambda target : agg_feat_num_average(target, prevelance))
df['freqZipf'] = df.target.apply(lambda target : agg_feat_num_average(target, freqZipf))
df['brown_clustering_cluster_size'] = df.target.apply(lambda target : agg_feat_num_average(target, brown_clustering_cluster_size))
df['brown_clustering_cluster_size_all'] = df.target.apply(lambda target : agg_feat_num_average(target, brown_clustering_cluster_size_all))
df['brown_clustering_cluster_depth_simple'] = df.target.apply(lambda target : agg_feat_num_average(target, brown_clustering_cluster_depth_simple))
df['brown_clustering_cluster_depth_bit'] = df.target.apply(lambda target : agg_feat_num_average(target, brown_clustering_cluster_depth_bit))
df['concreteness'] = df.target.apply(lambda target : agg_feat_num_average(target, \
                                                                          lambda target : word_concreteness.get(target, 0)))
df['age_of_aquisition'] = df.target.apply(lambda target : agg_feat_num_max(target, \
                                                                          lambda target : word_age_of_aquisition.get(target, 0)))
#df['google_books_n_gram_freq'] = df.target.apply(lambda target : google_books_n_grams.get(target)[0] \
#                                                 if google_books_n_grams.get(target) else 0)
#df['google_books_n_gram_doc_freq'] = df.target.apply(lambda target : google_books_n_grams.get(target)[1] \
#                                                    if google_books_n_grams.get(target)  else 0)
#df['google_books_n_gram_first_year'] = df.target.apply(lambda target : google_books_n_grams.get(target)[2] \
#                                                      if google_books_n_grams.get(target)  else 0)
#df['google_books_n_gram_last_year'] = df.target.apply(lambda target : google_books_n_grams.get(target)[3] \
#                                                     if google_books_n_grams.get(target)  else 0)

In [None]:
df.loc[:,['target', 'google_books_n_gram_freq', 'freq_wiki', 'freq_bnc', 'freq_bnc_lemma']]

# Psycholinguistic Features based on MRC Database


In [461]:
from wordmodel import Word

words_mrc_database = {}
with open("resources/mrc-database/mrc2.dct", encoding="utf8") as file:
    for index, line in enumerate(file):
        line = line.strip()
        word, phon, dphon, stress = line[51:].split('|')
        w = Word(
                wid = index,
                nlet = int(line[0:2]),
                nphon = int(line[2:4]),
                nsyl = int(line[4]),
                kf_freq = int(line[5:10]),
                kf_ncats = int(line[10:12]),
                kf_nsamp = int(line[12:15]),
                tl_freq = int(line[15:21]),
                brown_freq = int(line[21:25]),
                fam = int(line[25:28]),
                conc = int(line[28:31]),
                imag = int(line[31:34]),
                meanc = int(line[34:37]),
                meanp = int(line[37:40]),
                aoa = int(line[40:43]),
                tq2 = line[43],
                wtype = line[44],
                pdwtype = line[45],
                alphasyl = line[46],
                status = line[47],
                var = line[48],
                cap = line[49],
                irreg = line[50],
                word=word,
                phon=phon,
                dphon=dphon,
                stress=stress)
        words_mrc_database[w.word.strip().lower()] = w

def mrc_database(target, func):
    word = words_mrc_database.get(target.strip().lower())
    return func(word) if word else 0

In [None]:
df['mrc_kf_freq'] = df.target.apply(lambda target : agg_feat_num_average(target, mrc_database, lambda word : word.kf_freq))
df['mrc_kf_ncats'] = df.target.apply(lambda target : agg_feat_num_average(target, mrc_database, lambda word : word.kf_ncats))
df['mrc_tl_freq'] = df.target.apply(lambda target : agg_feat_num_average(target, mrc_database, lambda word : word.tl_freq))
df['mrc_brown_freq'] = df.target.apply(lambda target : agg_feat_num_average(target, mrc_database, lambda word : word.brown_freq))
df['mrc_fam'] = df.target.apply(lambda target : agg_feat_num_average(target, mrc_database, lambda word : word.fam))
df['mrc_conc'] = df.target.apply(lambda target : agg_feat_num_average(target, mrc_database, lambda word : word.conc))
df['mrc_imag'] = df.target.apply(lambda target : agg_feat_num_average(target, mrc_database, lambda word : word.imag))
df['mrc_meanc'] = df.target.apply(lambda target : agg_feat_num_average(target, mrc_database, lambda word : word.meanc))
df['mrc_meanp'] = df.target.apply(lambda target : agg_feat_num_average(target, mrc_database, lambda word : word.meanp))
df['mrc_aoa'] = df.target.apply(lambda target : agg_feat_num_average(target, mrc_database, lambda word : word.aoa))

df.loc[:,['target', 'mrc_kf_freq', 'mrc_kf_ncats', 'mrc_brown_freq','mrc_tl_freq','mrc_fam','mrc_conc','mrc_imag','mrc_meanc','mrc_meanp', 'mrc_aoa']]

# Language Model Features

In [658]:
import nltk

print("... build")
brown = nltk.corpus.brown
corpus = [word.lower() for word in brown.words()]

# Train on 95% f the corpus and test on the rest
spl = int(95*len(corpus)/100)
print(spl)
train = corpus[:spl]
test = corpus[spl:]

# Remove rare words from the corpus
fdist = nltk.FreqDist(w for w in train)
vocabulary = set(map(lambda x: x[0], filter(lambda x: x[1] >= 5, fdist.items())))

train = map(lambda x: x if x in vocabulary else "*unknown*", train)
test = map(lambda x: x if x in vocabulary else "*unknown*", test)

print("... train")
from nltk.
from nltk.model import NgramModel
from nltk.probability import LidstoneProbDist

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2) 
lm = NgramModel(5, train, estimator=estimator)

print("len(corpus) = %s, len(vocabulary) = %s, len(train) = %s, len(test) = %s" % ( len(corpus), len(vocabulary), len(train), len(test) ))
print("perplexity(test) =", lm.perplexity(test))

... build
1103132
... train


ModuleNotFoundError: No module named 'nltk.model'

In [660]:
from nltk.corpus import reuters
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
 
first_sentence = reuters.sents()[0]
print(first_sentence) # [u'ASIAN', u'EXPORTERS', u'FEAR', u'DAMAGE', u'FROM' ...
 
# Get the bigrams
print(list(bigrams(first_sentence))) # [(u'ASIAN', u'EXPORTERS'), (u'EXPORTERS', u'FEAR'), (u'FEAR', u'DAMAGE'), (u'DAMAGE', u'FROM'), ...
 
# Get the padded bigrams
print(list(bigrams(first_sentence, pad_left=True, pad_right=True))) # [(None, u'ASIAN'), (u'ASIAN', u'EXPORTERS'), (u'EXPORTERS', u'FEAR'), (u'FEAR', u'DAMAGE'), (u'DAMAGE', u'FROM'),
 
# Get the trigrams
print(list(trigrams(first_sentence))) # [(u'ASIAN', u'EXPORTERS', u'FEAR'), (u'EXPORTERS', u'FEAR', u'DAMAGE'), (u'FEAR', u'DAMAGE', u'FROM'), ...
 
# Get the padded trigrams
print(list(trigrams(first_sentence, pad_left=True, pad_right=True))) # [(None, None, u'ASIAN'), (None, u'ASIAN', u'EXPORTERS'), (u'ASIAN', u'EXPORTERS', u'FEAR'), (u'EXPORTERS', u'FEAR', u'DAMAGE'), (u'FEAR', u'DAMAGE', u'FROM') ...
 

['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.', 'And', 'Japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'Asia', "'", 's', 'exporting', 'nations', 'that', 'the', 'row', 'could', 'inflict', 'far', '-', 'reaching', 'economic', 'damage', ',', 'businessmen', 'and', 'officials', 'said', '.']
[('ASIAN', 'EXPORTERS'), ('EXPORTERS', 'FEAR'), ('FEAR', 'DAMAGE'), ('DAMAGE', 'FROM'), ('FROM', 'U'), ('U', '.'), ('.', 'S'), ('S', '.-'), ('.-', 'JAPAN'), ('JAPAN', 'RIFT'), ('RIFT', 'Mounting'), ('Mounting', 'trade'), ('trade', 'friction'), ('friction', 'between'), ('between', 'the'), ('the', 'U'), ('U', '.'), ('.', 'S'), ('S', '.'), ('.', 'And'), ('And', 'Japan'), ('Japan', 'has'), ('has', 'raised'), ('raised', 'fears'), ('fears', 'among'), ('among', 'many'), ('many', 'of'), ('of', 'Asia'), ('Asia', "'"), ("'", 's'), ('s', 'exporting'), ('exporting', 'nations'), ('nations', 'that'), 

# WordNet Features
Here we implement all the relevant features based on WordNet and SentiWordNet. For example, the number of synsets the target word is contained in or the average length of the lemmas of all the synsets the target word is contained in. Note that all features that are computed in the following exploit neither the POS-Tag of the target word nor Word Sense Disambiguation by e.g. UKB-Algorithm.

In [49]:
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.wsd import lesk
from nltk.stem.wordnet import *

wordNetLemmatizer = WordNetLemmatizer()

def wn_synset_freq(target):
    return len(wn.synsets(target))

def wn_synset_avg_lemma_freq(target):
    return np.nan_to_num(np.mean([len(synset.lemmas()) 
            for synset in wn.synsets(target)]))

def wn_synset_avg_lemma_len(target):
    return np.nan_to_num(np.nanmean([len(lemma.name()) 
            for synset in wn.synsets(target) 
            for lemma in synset.lemmas()]))

def wn_synset_avg_hypernyms(target):
    return np.nan_to_num(np.nanmean([len(synset.hypernyms()) 
            for synset in wn.synsets(target)]))

def wn_synset_avg_hyponyms(target):
    return np.nan_to_num(np.mean([len(synset.hyponyms()) 
            for synset in wn.synsets(target)]))

def wn_synset_sum_hypernyms(target):
    return np.sum(([len(synset.hypernyms()) 
            for synset in wn.synsets(target)]))

def wn_synset_avg_definition_len(target):
    return np.nan_to_num(np.mean([len(str(synset.definition())) 
            for synset in wn.synsets(target)]))

def wn_synset_avg_hyptree_depth(target):
    return np.nan_to_num(np.mean([synset.max_depth() 
            for synset in wn.synsets(target)]))

def wn_synset_num_distinct_pos(target):
    return len(set([synset.pos() for synset in wn.synsets(target)]))

def wn_synset_avg_num_relations(target):
    return np.nan_to_num(np.mean([np.sum([len(synset.hypernyms()), len(synset.hyponyms()), 
             len(synset.instance_hypernyms()), len(synset.instance_hyponyms()),
             len(synset.member_holonyms()), len(synset.substance_holonyms()),
             len(synset.part_holonyms()), len(synset.member_meronyms()),
             len(synset.substance_meronyms()), len(synset.part_meronyms())]) 
             for synset in wn.synsets(target)]))

def wn_synset_avg_freq_pos(target, pos):
    return len(wn.synsets(target, pos = pos))

def wn_synset_sense_entropy_uniform(target):
    num_senses = len(wn.synsets(target))
    return -np.sum([((1 / num_senses) * np.log2(1 / num_senses)) 
                     for index in range(0, num_senses)])

def wn_synset_sense_entropy_pos_uniform(target):
    num_senses = len(wn.synsets(target))
    pos_distribution = [len(wn.synsets(target, pos = wn.NOUN)),
                        len(wn.synsets(target, pos = wn.VERB)),
                        len(wn.synsets(target, pos = wn.ADJ)),
                        len(wn.synsets(target, pos = wn.ADV))]
    return -np.sum([(np.nan_to_num((count / num_senses) * np.log2(count / num_senses))) 
            for count in pos_distribution]) if num_senses != 0 else 0

def wn_synsets_sense_entropy_pos_central(target, pos):
    num_senses_pos = len(wn.synsets(target, pos = pos))
    return -np.sum([((1 / num_senses_pos) * np.log2(1 / num_senses_pos))
                     for index in range(0, num_senses_pos)])

def wn_synsets_avg_lemma_freq(target, freqs_func, freqs):
    synsets = wn.synsets(target)
    if not synsets:
        return 0
    return np.mean([np.nan_to_num(freqs_func(lemma.name(), freqs)) for synset in synsets
                    for lemma in synset.lemmas()])

def penn_to_wn(tag):
    if not tag:
        return None
    if tag.startswith('N'):
        return 'n'
    if tag.startswith('V'):
        return 'v'
    if tag.startswith('J'):
        return 'a'
    if tag.startswith('R'):
        return 'r'
    return None

def wn_synsets_freq_ratio_to_max_agg_min(target, freqs_func, freqs):
    lemmas = [lemma.name().split('_') for synset in wn.synsets(target) 
                  for lemma in synset.lemmas()]
    if not lemmas:
        return 1
    freqis = [np.min([freqs_func(lemma, freqs) for lemma in lemmata]) 
              for lemmata in lemmas]
    target_freq = freqs_func(target, freqs)
    max_freq = np.max(freqis)
    return target_freq / max_freq

def wn_synsets_freq_ratio_to_max_agg_mean(target, freqs_func, freqs):
    lemmas = [lemma.name().split('_') for synset in wn.synsets(target) 
                  for lemma in synset.lemmas()]
    if not lemmas:
        return 1
    freqis = [np.mean([freqs_func(lemma, freqs) for lemma in lemmata]) 
              for lemmata in lemmas]
    target_freq = freqs_func(target, freqs)
    max_freq = np.max(freqis)
    return target_freq / max_freq

def wn_synsets_freq_ratio_to_max_agg_median(target, freqs_func, freqs):
    lemmas = [lemma.name().split('_') for synset in wn.synsets(target) 
                  for lemma in synset.lemmas()]
    if not lemmas:
        return 1
    freqis = [np.median([freqs_func(lemma, freqs) for lemma in lemmata]) 
              for lemmata in lemmas]
    target_freq = freqs_func(target, freqs)
    max_freq = np.max(freqis)
    return target_freq / max_freq
    
def swn_avg_objective_score(target):
    return np.nan_to_num(np.mean([senti_synset.obj_score() 
                for senti_synset in swn.senti_synsets(target)]))

def pos_tag(sentence, target):
    tokens = nltk.word_tokenize(sentence)
    wordPOSPairs = [token for token in nltk.pos_tag(tokens) if token[0] == target]
    return wordPOSPairs[0][1] if len(wordPOSPairs) > 0 else None

# TODO consider using stanford lemmatizer and compute word similarity metric
# to orignal target
def wordnet_lemma_len(target):
    return len(wordNetLemmatizer.lemmatize(target))

def wn_synset_lesk_wsd_ratio_hi_freq(target, sentence, pos, freqs_func, freqs):
    wsd_synset = lesk(sentence.split(), target, pos)
    if not wsd_synset:
        return 0
    lemmas = [lemma.name().split('_') for lemma in wsd_synset.lemmas()]
    if not lemmas:
        return 0
    freqis = [np.min([freqs_func(lemma, freqs) for lemma in lemmata])
              for lemmata in lemmas]
    target_freq = freqs_func(target, freqs)
    return np.sum([1 for freq in freqis if freq > target_freq]) / len(freqis)

def wn_synset_lesk_wsd_ratio_hi_freq_sum(target, sentence, pos, freqs_func, freqs):
    wsd_synset = lesk(sentence.split(), target, pos)
    if not wsd_synset:
        return 0
    lemmas = [lemma.name().split('_') for lemma in wsd_synset.lemmas()]
    if not lemmas:
        return 0
    freqis = [np.min([freqs_func(lemma, freqs) for lemma in lemmata])
              for lemmata in lemmas]
    target_freq = freqs_func(target, freqs)
    return np.sum([freq for freq in freqis if freq > target_freq]) / np.sum(freqis)

def wn_synset_lesk_wsd_ratio_hi_nopos_freq(target, sentence, freqs_func, freqs):
    wsd_synset = lesk(sentence.split(), target)
    if not wsd_synset:
        return 0
    lemmas = [lemma.name().split('_') for lemma in wsd_synset.lemmas()]
    if not lemmas:
        return 0
    freqis = [np.min([freqs_func(lemma, freqs) for lemma in lemmata])
              for lemmata in lemmas]
    target_freq = freqs_func(target, freqs)
    return np.sum([1 for freq in freqis if freq > target_freq]) / len(freqis)

def wn_synset_lesk_wsd_ratio_low_freq(target, sentence, pos, freqs_func, freqs):
    wsd_synset = lesk(sentence.split(), target, pos)
    if not wsd_synset:
        return 0
    lemmas = [lemma.name().split('_') for lemma in wsd_synset.lemmas()]
    if not lemmas:
        return 0
    freqis = [np.min([freqs_func(lemma, freqs) for lemma in lemmata])
              for lemmata in lemmas]
    target_freq = freqs_func(target, freqs)
    return np.sum([1 for freq in freqis if freq < target_freq]) / len(freqis)

def wn_synset_lesk_wsd_ratio_low_freq_sum(target, sentence, pos, freqs_func, freqs):
    wsd_synset = lesk(sentence.split(), target, pos)
    if not wsd_synset:
        return 0
    lemmas = [lemma.name().split('_') for lemma in wsd_synset.lemmas()]
    if not lemmas:
        return 0
    freqis = [np.min([freqs_func(lemma, freqs) for lemma in lemmata])
              for lemmata in lemmas]
    target_freq = freqs_func(target, freqs)
    return np.sum([freq for freq in freqis if freq < target_freq]) / np.sum(freqis)

def wn_synset_lesk_wsd_ratio_low_nopos_freq(target, sentence, freqs_func, freqs):
    wsd_synset = lesk(sentence.split(), target)
    if not wsd_synset:
        return 0
    lemmas = [lemma.name().split('_') for lemma in wsd_synset.lemmas()]
    if not lemmas:
        return 0
    freqis = [np.min([freqs_func(lemma, freqs) for lemma in lemmata])
              for lemmata in lemmas]
    target_freq = freqs_func(target, freqs)
    return np.sum([1 for freq in freqis if freq < target_freq]) / len(freqis)

def wn_synset_lesk_wsd_ratio_to_freq_sum(target, sentence, pos, freqs_func, freqs):
    wsd_synset = lesk(sentence.split(), target, pos)
    if not wsd_synset:
        return 0
    lemmas = [lemma.name().split('_') for lemma in wsd_synset.lemmas()]
    if not lemmas:
        return 0
    freqis = [np.min([freqs_func(lemma, freqs) for lemma in lemmata])
              for lemmata in lemmas]
    target_freq = freqs_func(target, freqs)
    return target_freq / np.sum(freqis)

#TODO implement with other wsd algorithm and lemma count
#Implement wsd algorithm based on word embeddings and also use pywsd
def wn_synset_lesk_wsd__norm_sense_rank(target, sentence, pos, freqs_func, freqs, wsd_func):
    wsd_synset = wsd_func(sentence.split(), target, pos)
    senses = wn.synsets(target)
    if not wsd_synset:
        return 0
    wsd_synset = lesk(sentence.split(), target, pos)
    sense_freqs = sorted([(sense, np.sum([lemma.count() for lemma in sense.lemmas()])) 
                   for sense in senses], key = lambda tpl : tpl[1], reverse=True)
    sense_index = [sense for sense, cnt in sense_freqs].index(wsd_synset)
    return sense_index / len(senses)

In [21]:
import h5py
data = h5py.File('resources/word-embeddings/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5')
for key in data.keys():
    print(key)
group = data['CNN_high_0']
print(group)

CNN
CNN_high_0
CNN_proj
RNN_0
RNN_1
char_embed
<HDF5 group "/CNN_high_0" (4 members)>


In [55]:
from nltk.wsd import lesk
from pywsd.lesk import adapted_lesk

df['wn_synset_freq'] = df.target.apply(lambda target : agg_feat_num_average(target, wn_synset_freq))
df['wn_synset_avg_lemma_freq'] = df.target.apply(lambda target : agg_feat_num_average(target, wn_synset_avg_lemma_freq))
df['wn_synset_avg_lemma_len'] = df.target.apply(lambda target : agg_feat_num_average(target, wn_synset_avg_lemma_len))

df['wn_synset_diff_len_avg_lemma_len'] = df.wn_synset_avg_lemma_len - df.length
df['wn_synset_avg_hypernyms'] = df.target.apply(lambda target : agg_feat_num_average(target, wn_synset_avg_hypernyms))
df['wn_synset_sum_hypernyms'] = df.target.apply(lambda target : agg_feat_num_average(target, wn_synset_sum_hypernyms))
df['wn_synset_avg_hyponyms'] = df.target.apply(lambda target : agg_feat_num_average(target, wn_synset_avg_hyponyms))

df['wn_synset_avg_definition_len'] = df.target.apply(lambda target : 
                                                     agg_feat_num_average(target, wn_synset_avg_definition_len))
df['wn_synset_avg_hyptree_depth'] = df.target.apply(lambda target :
                                                     agg_feat_num_average(target, wn_synset_avg_hyptree_depth))
df['wn_synset_num_distinct_pos'] = df.target.apply(lambda target : 
                                                     agg_feat_num_average(target, wn_synset_num_distinct_pos))
df['wn_synset_avg_num_relations'] = df.target.apply(lambda target : 
                                                     agg_feat_num_average(target, wn_synset_avg_num_relations))

df['wn_synset_avg_freq_pos_noun'] = df.target.apply(lambda target : 
                                                    agg_feat_num_average(target, wn_synset_avg_freq_pos, wn.NOUN))
df['wn_synset_avg_freq_pos_verb'] = df.target.apply(lambda target : 
                                                    agg_feat_num_average(target, wn_synset_avg_freq_pos, wn.VERB))
df['wn_synset_avg_freq_pos_adj'] = df.target.apply(lambda target : 
                                                   agg_feat_num_average(target, wn_synset_avg_freq_pos, wn.ADJ))
df['wn_synset_avg_freq_pos_adv'] = df.target.apply(lambda target : 
                                                   agg_feat_num_average(target, wn_synset_avg_freq_pos, wn.ADV))

df['wn_synset_avg_freq_pos_noun_norm'] = np.nan_to_num(df.wn_synset_avg_freq_pos_noun / df.wn_synset_freq)
df['wn_synset_avg_freq_pos_verb_norm'] = np.nan_to_num(df.wn_synset_avg_freq_pos_verb / df.wn_synset_freq)
df['wn_synset_avg_freq_pos_adj_norm'] = np.nan_to_num(df.wn_synset_avg_freq_pos_adj / df.wn_synset_freq)
df['wn_synset_avg_freq_pos_adv_norm'] = np.nan_to_num(df.wn_synset_avg_freq_pos_adv / df.wn_synset_freq)

df['pos_tag'] = df[['sentence', 'target']].apply(lambda vals : pos_tag(*vals), axis = 1)
df['wn_synset_sense_entropy_uniform'] = df.target.apply(lambda target : 
                                                        agg_feat_num_average(target, wn_synset_sense_entropy_uniform))
df['wn_synset_sense_entropy_pos_uniform'] = df.target.apply(lambda target :
                                                        agg_feat_num_average(target, wn_synset_sense_entropy_pos_uniform))
df['wn_synsets_sense_entropy_pos_central'] = df[['target', 'pos_tag']].apply(
    lambda vals : wn_synsets_sense_entropy_pos_central(vals[0], penn_to_wn(vals[1])), axis = 1)

df['swn_avg_objective_score'] = df.target.apply(lambda target : agg_feat_num_average(target, swn_avg_objective_score))

df['wordnet_lemma_len'] = df.target.apply(lambda target : agg_feat_num_average(target, wordnet_lemma_len))
df['diff_len_wordnet_lemma_len'] = df.length - df.wordnet_lemma_len
df['reduction_lemma_len'] = 1 - df.wordnet_lemma_len / df.length

df['wn_synsets_freq_ratio_to_max_agg_min'] = df.target.apply(lambda target : \
                                                agg_feat_num_average(target, wn_synsets_freq_ratio_to_max_agg_min, \
                                                                     get_dict_count, word_freq_wiki))
df['wn_synsets_freq_ratio_to_max_agg_mean'] = df.target.apply(lambda target : \
                                                agg_feat_num_average(target, wn_synsets_freq_ratio_to_max_agg_mean, \
                                                                     get_dict_count, word_freq_wiki))
df['wn_synsets_freq_ratio_to_max_agg_median'] = df.target.apply(lambda target : \
                                                agg_feat_num_average(target, wn_synsets_freq_ratio_to_max_agg_median, \
                                                                     get_dict_count, word_freq_wiki))
df['wn_synsets_avg_lemma_freq'] = df.target.apply(lambda target : \
                                                agg_feat_num_average(target, wn_synsets_avg_lemma_freq, \
                                                                     get_dict_count, word_freq_wiki))
df['wn_synsets_freq_ratio_to_avg'] = df.freq_wiki / df.wn_synsets_avg_lemma_freq
df['wn_synset_lesk_wsd_ratio_hi_freq'] = df[['target','sentence', 'pos_tag']].apply(lambda vals : \
            agg_feat_num_average(vals[0], wn_synset_lesk_wsd_ratio_hi_freq, vals[1], vals[2], \
                                 get_dict_count, word_freq_wiki), axis = 1)
df['wn_synset_lesk_wsd_ratio_low_freq'] = df[['target','sentence', 'pos_tag']].apply(lambda vals : \
            agg_feat_num_average(vals[0], wn_synset_lesk_wsd_ratio_low_freq, vals[1], vals[2], \
                                 get_dict_count, word_freq_wiki), axis = 1)
df['wn_synset_lesk_wsd_ratio_hi_nopos_freq'] = df[['target','sentence']].apply(lambda vals : \
            agg_feat_num_average(vals[0], wn_synset_lesk_wsd_ratio_hi_nopos_freq, vals[1], \
                                 get_dict_count, word_freq_wiki), axis = 1)
df['wn_synset_lesk_wsd_ratio_low_nopos_freq'] = df[['target','sentence']].apply(lambda vals : \
            agg_feat_num_average(vals[0], wn_synset_lesk_wsd_ratio_low_nopos_freq, vals[1], \
                                 get_dict_count, word_freq_wiki), axis = 1)
df['wn_synset_lesk_wsd_ratio_hi_freq_sum'] = df[['target','sentence', 'pos_tag']].apply(lambda vals : \
            agg_feat_num_average(vals[0], wn_synset_lesk_wsd_ratio_hi_freq_sum, vals[1], vals[2], \
                                 get_dict_count, word_freq_wiki), axis = 1)
df['wn_synset_lesk_wsd_ratio_low_freq_sum'] = df[['target','sentence', 'pos_tag']].apply(lambda vals : \
            agg_feat_num_average(vals[0], wn_synset_lesk_wsd_ratio_low_freq_sum, vals[1], vals[2], \
                                 get_dict_count, word_freq_wiki), axis = 1)
df['wn_synset_lesk_wsd_ratio_to_freq_sum'] = df[['target','sentence', 'pos_tag']].apply(lambda vals : \
            agg_feat_num_average(vals[0], wn_synset_lesk_wsd_ratio_to_freq_sum, vals[1], penn_to_wn(vals[2]), \
                                 get_dict_count, word_freq_wiki), axis = 1)
df['wn_synset_lesk_wsd__norm_sense_rank'] = df[['target','sentence', 'pos_tag']].apply(lambda vals : \
            agg_feat_num_average(vals[0], wn_synset_lesk_wsd__norm_sense_rank, vals[1], penn_to_wn(vals[2]), \
                                 get_dict_count, word_freq_wiki, lesk), axis = 1)

  out=out, **kwargs)


In [567]:
str(chr(90))

'Z'

Sense Rank example where covered in terms of copying piece of music is penalized compared to cover in terms of overlay

In [115]:
df.loc[df['target'] == 'covered',['target', 'sentence', 'wn_synset_lesk_wsd__norm_sense_rank']]

Unnamed: 0,target,sentence,wn_synset_lesk_wsd__norm_sense_rank
701,covered,"La India covered the song on her album , Sobre el Fuego as her third single from the album .",0.962963
4433,covered,"The skin is covered in chromatophores , which enable the squid to change color to suit its surroundings , making it practically invisible .",0.296296


# PorterStemmer, StanfordNLP and Dependency Tree Features
Here we implement features based on the PorterStemmer library from nltk.

In [293]:
from nltk.stem.porter import *
from nltk.stem.wordnet import *
from nltk.tag.stanford import StanfordNERTagger
from nltk.parse.stanford import StanfordDependencyParser
from nltk.tokenize import word_tokenize
import os
from functools import lru_cache
from collections import Counter

java_path = "C:/Program Files (x86)/Java/jdk1.8.0_144/bin/java.exe"
os.environ['JAVAHOME'] = java_path
path_to_jar = 'resources/stanford-dependency-parser/stanford-parser.jar'
path_to_models_jar = 'resources/stanford-dependency-parser/stanford-parser-3.9.1-models.jar'

porterStemmer = PorterStemmer()
wordNetLemmatizer = WordNetLemmatizer()
nerTagger = StanfordNERTagger('resources/stanford-ner-tagger/classifiers/english.all.3class.distsim.crf.ser.gz',
               'resources/stanford-ner-tagger/stanford-ner.jar',
               encoding='utf-8')
dependencyParser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

def porter_stem_len(target):
    return len(str(porterStemmer.stem(target)))

def porter_stemmer_num_steps(target):
    stem = target.lower()
    applied_steps = 0
    if porterStemmer.mode == porterStemmer.NLTK_EXTENSIONS and target in porterStemmer.pool:
            return applied_steps
    if porterStemmer.mode != porterStemmer.ORIGINAL_ALGORITHM and len(target) <= 2:
            return applied_steps
    step_funcs = [porterStemmer._step1a, porterStemmer._step1b, porterStemmer._step1c,
                  porterStemmer._step2, porterStemmer._step3, porterStemmer._step3,
                  porterStemmer._step4, porterStemmer._step5a, porterStemmer._step5b]
    for step_func in step_funcs:
        stem_step = step_func(stem)
        if stem_step != stem:
            stem = stem_step
            applied_steps += 1
    return applied_steps

def is_named_entity(sentence, target):
    tokenized_sent = word_tokenize(sentence)
    tagged_sent = nerTagger.tag(tokenized_sent)
    for token, tag in tagged_sent:
        if token == target and tag != 'O':
            return 1
    return 0

def named_entity_type(sentence, target):
    tokenized_sent = word_tokenize(sentence)
    tagged_sent = nerTagger.tag(tokenized_sent)
    return [tag for token, tag in tagged_sent if token == target][0]

@lru_cache(maxsize=None)
def targets_with_index(start, end, context):
    curr_pos = 0
    targets = []
    for index, token in enumerate(word_tokenize(context), 1):
        targets.append((token, index, curr_pos, (curr_pos + len(token))))
        curr_pos += len(token) + 1
    return [(target[0], target[1]) for target in targets \
            if target[2] >= start and target[3] <= end]

@lru_cache(maxsize=None)
def dependency_parse(sentence):
    dependency_parser = dependencyParser.raw_parse(sentence)
    dependencies = []
    parsetree = list(dependency_parser)[0]
    for index, node in parsetree.nodes.items():
        for relation, dependant in parsetree.nodes[index]['deps'].items():
            triple = ((node['word'], index), relation, \
                      (parsetree.nodes[dependant[0]]['word'], dependant[0]))
            if relation != 'root': dependencies.append(triple)
    return dependencies

@lru_cache(maxsize=None)
def dependency_parse_with_root(sentence):
    dependency_parser = dependencyParser.raw_parse(sentence)
    dependencies = []
    parsetree = list(dependency_parser)[0]
    for index, node in parsetree.nodes.items():
        for relation, dependant in parsetree.nodes[index]['deps'].items():
            triple = ((node['word'], index), relation, \
                      (parsetree.nodes[dependant[0]]['word'], dependant[0]))
            dependencies.append(triple)
    return dependencies

def dep_dist_to_head(target, start, end, context):
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    return np.nan_to_num(np.mean([np.abs(triple[0][1] - triple[2][1])-1 
                                for triple in triples if triple[2] in targets]))

def dep_dist_to_root(target, start, end, context):
    targets = targets_with_index(start, end, context)
    triples = dependency_parse_with_root(context)
    root_node = list(filter(lambda triple : triple[1] == 'root' , triples))[0]
    dist = np.nan_to_num(np.mean([np.abs(root_node[2][1] - triple[2][1])-1 
                                for triple in triples if triple[2] in targets]))
    return dist if dist != -1 else 0

def dep_relation_to_head(target, start, end, context):
    targets = targets_with_index(start, end, context)
    triples = dependency_parse_with_root(context)
    return [triple[1] for triple in triples if triple[2] in targets]

def dep_num_dependents(target, start, end, context):
    targets = targets_with_index(start, end, context)
    triples = dependency_parse_with_root(context)
    return len([triple[1] for triple in triples if triple[0] in targets])

def dep_max_num_dependents(context):
    triples = dependency_parse_with_root(context)
    return Counter([triple[0][0] for triple in triples]).most_common(1)[0][1]

In [295]:
df_small = df.loc[:30,df.columns]
df_small['dep_dist_to_head'] = df_small[['target', 'start', 'end', 'sentence']].apply(lambda vals : 
                                                                        dep_dist_to_head(*vals), axis=1)
df_small['dep_dist_to_root'] = df_small[['target', 'start', 'end', 'sentence']].apply(lambda vals : 
                                                                        dep_dist_to_root(*vals), axis=1)
df_small['dep_dist_to_root_norm'] = df_small[['dep_dist_to_root', 'sentence']].apply(lambda vals : \
                                                    float(vals[0]) / (len(word_tokenize(vals[1]))-1), axis=1)
df_small['dep_relation_to_head'] = df_small[['target', 'start', 'end', 'sentence']].apply(lambda vals : \
                                                            dep_relation_to_head(*vals), axis = 1)
df_small['dep_num_dependents'] = df_small[['target', 'start', 'end', 'sentence']].apply(lambda vals : \
                                                                    dep_num_dependents(*vals), axis = 1)
df_small['dep_max_num_dependents'] = df_small.sentence.apply(lambda sentence : dep_max_num_dependents(sentence))
df_small['dep_num_dependents_norm'] = df_small.dep_num_dependents / df_small.dep_max_num_dependents
df_small.loc[:, ['target', 'start', 'end', 'sentence', 'dep_dist_to_head', 'dep_dist_to_root', 'dep_dist_to_root_norm', 'dep_max_num_dependents', 'dep_num_dependents', 'dep_num_dependents_norm']]

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,target,start,end,sentence,dep_dist_to_head,dep_dist_to_root,dep_dist_to_root_norm,dep_max_num_dependents,dep_num_dependents,dep_num_dependents_norm
0,passed,28,34,"Normally , the land will be passed down to future generations in a way that recognizes the community 's traditional connection to that country .",0.0,0.0,0.0,6,6,1.0
1,land,15,19,"Normally , the land will be passed down to future generations in a way that recognizes the community 's traditional connection to that country .",2.0,2.0,0.083333,6,1,0.166667
2,future,43,49,"Normally , the land will be passed down to future generations in a way that recognizes the community 's traditional connection to that country .",0.0,2.0,0.083333,6,0,0.0
3,future generations,43,61,"Normally , the land will be passed down to future generations in a way that recognizes the community 's traditional connection to that country .",1.5,2.5,0.104167,6,2,0.333333
4,generations,50,61,"Normally , the land will be passed down to future generations in a way that recognizes the community 's traditional connection to that country .",3.0,3.0,0.125,6,2,0.333333
5,recognizes,76,86,"Normally , the land will be passed down to future generations in a way that recognizes the community 's traditional connection to that country .",1.0,8.0,0.333333,6,2,0.333333
6,community,91,100,"Normally , the land will be passed down to future generations in a way that recognizes the community 's traditional connection to that country .",2.0,10.0,0.416667,6,2,0.333333
7,traditional,104,115,"Normally , the land will be passed down to future generations in a way that recognizes the community 's traditional connection to that country .",0.0,12.0,0.5,6,0,0.0
8,traditional connection to that country,104,142,"Normally , the land will be passed down to future generations in a way that recognizes the community 's traditional connection to that country .",1.4,14.0,0.583333,6,5,0.833333
9,country,135,142,"Normally , the land will be passed down to future generations in a way that recognizes the community 's traditional connection to that country .",2.0,16.0,0.666667,6,2,0.333333


In [183]:
# Porter stemmer stem length, number of applied steps,
# difference of stem length to target and reduction ratio
df['porter_stem_len'] = df.target.apply(lambda target : agg_feat_num_average(target, porter_stem_len))
df['porter_stemmer_num_steps'] = df.target.apply(lambda target : agg_feat_num_average(target, porter_stemmer_num_steps))
df['diff_len_stem_len'] = df.length - df.porter_stem_len
df['reduction_stem_len'] = 1 - df.porter_stem_len / df.length

In [184]:
df.loc[:, ['target', 'length', 'num_syllables', 'num_vowels', 'porter_stemmer_num_steps', 'diff_len_stem_len', 'reduction_stem_len']]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0,target,length,num_syllables,num_vowels,porter_stemmer_num_steps,diff_len_stem_len,reduction_stem_len
0,passed,6.000000,,,1.000000,2.000000,0.333333
1,land,4.000000,,,0.000000,0.000000,0.000000
2,future,6.000000,,,1.000000,1.000000,0.166667
3,future generations,8.500000,,,2.000000,3.500000,0.411765
4,generations,11.000000,,,3.000000,6.000000,0.545455
5,recognizes,10.000000,,,2.000000,4.000000,0.400000
6,community,9.000000,,,2.000000,3.000000,0.333333
7,traditional,11.000000,,,2.000000,5.000000,0.454545
8,traditional connection to that country,6.800000,,,0.800000,1.600000,0.235294
9,country,7.000000,,,1.000000,0.000000,0.000000


# Dictionary Features


In [643]:
import textatistic

academic_words = {}
with open("resources/dictionaries/academic_word_list.txt", encoding="utf8") as file:
    for line in file:
        word, rank = line.split()
        academic_words[word.strip()] = rank

prefixes = {}
with open("resources/dictionaries/prefixes.txt", encoding="utf8") as file:
    for line in file:
        prefix, definition, examples = line.split('\t')
        prefixes[prefix.replace('-', '').strip()] = definition

subtlex_us = {}
with open("resources/dictionaries/SUBTLEXus.txt", encoding="utf8") as file:
    for line in file:
        word, freq, cd_count, freq_low, cd_low, subtl_wf, lg10_wf, Subtlcd, lg10_cd = line.split('\t')
        subtlex_us[word.strip().lower()] = (int(freq), int(cd_count))
        
subtlex_uk = pd.read_csv("resources/dictionaries/SUBTLEXuk.txt", sep = "\t")
subtlex_uk_dict = dict(zip(subtlex_uk['Spelling'], subtlex_uk['CD_count']))
        
suffixes = {}
with open("resources/dictionaries/suffixes.txt", encoding="utf8") as file:
    for line in file:
        suffix, definition, examples = line.split('\t')
        suffixes[suffix.replace('-', '').strip()] = definition

with open("resources/dictionaries/biology_glossary.csv", encoding="utf8") as file:
    content = [line.strip().lower() for line in file.readlines()]
    gloss_biology = set(content)

with open("resources/dictionaries/geography_glossary.csv", encoding="utf8") as file:
    content = [line.strip().lower() for line in file.readlines()]
    gloss_geography = set(content)
    
with open("resources/dictionaries/physics_glossary.csv", encoding="utf8") as file:
    content = [line.strip().lower() for line in file.readlines()]
    gloss_physics = set(content)
    
with open("resources/dictionaries/stopwords_en.txt", encoding="utf8") as file:
    content = [line.strip().lower() for line in file.readlines()]
    stop_words = set(content)
    
with open("resources/dictionaries/most_freq_used_3000_words.txt", encoding="utf8") as file:
    content = [line.strip().lower() for line in file.readlines()]
    most_freq_used_3000_words = set(content)
    
with open("resources/dictionaries/most_freq_used_5000_words.txt", encoding="utf8") as file:
    content = [line.split()[1].strip().lower() for line in file.readlines()]
    most_freq_used_5000_words = set(content)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
from whoosh.index import create_in
from whoosh.fields import *
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
ix = create_in("indexdir", schema)
writer = ix.writer()
writer.add_document(title=u"First document", path=u"/a", \
                    content=u"This is the first document we've added!")
writer.add_document(title=u"Second document", path=u"/b",\
                     content=u"The second one is even more interesting!")
writer.commit()
from whoosh.qparser import QueryParser
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("first")
    results = searcher.search(query)
    results[0]
        
{"title": u"First document", "path": u"/a"}

In [None]:
df['dict_dale_chall'] = df.target.apply(lambda target : agg_feat_num_average(target, textatistic.notdalechall_count))
df['dict_570_academic_words'] = df.target.apply(lambda target : agg_feat_num_max(target, \
                                                lambda target : int(target in academic_words)))
df['common_prefix'] = df.target.apply(lambda target : int(np.sum([1 for prefix in prefixes if target.startswith(prefix)]) > 0))
df['common_suffix'] = df.target.apply(lambda target : int(np.sum([1 for suffix in suffixes if target.endswith(suffix)]) > 0))
df['gloss_biology'] = df.target.apply(lambda target : int(target in gloss_biology))
df['gloss_geography'] = df.target.apply(lambda target : int(target in gloss_geography))
df['stop_word'] = df.target.apply(lambda target : int(target in stop_words))
df['most_freq_used_3000_words'] = df.target.apply(lambda target : agg_feat_num_average(target, \
                                                lambda target : int(target in most_freq_used_3000_words)))
df['most_freq_used_5000_words'] = df.target.apply(lambda target : agg_feat_num_average(target, \
                                                lambda target : int(target in most_freq_used_5000_words)))
df['subtlex_cd_us'] = df.target.apply(lambda target : agg_feat_num_average(target, \
                lambda target : subtlex_us[target.strip().lower()][1] if subtlex_us.get(target.strip().lower()) else 0))
df['subtlex_cd_uk'] = df.target.apply(lambda target : agg_feat_num_average(target, \
                                lambda target : subtlex_uk_dict.get(target, 0)))

In [654]:
df.loc[df['hyphenated']>0,['target', 'common_prefix', 'common_suffix', 'subtlex_cd_us', 'hyphenated']]

Unnamed: 0,target,common_prefix,common_suffix,subtlex_cd_us,hyphenated
33,Wave Hill Walk-Off,0,0,533.333333,1
35,Walk-Off,0,0,0.000000,1
72,best-selling,0,0,0.000000,1
240,E-flat major,0,1,880.000000,1
243,E-flat,0,0,0.000000,1
297,tree-lined,0,0,0.000000,1
299,tree-lined avenues,0,0,13.500000,1
337,well-maintained,0,0,0.000000,1
355,al-ʿayn,0,0,0.000000,1
401,mezzo-sopranos,0,0,0.000000,1


# Word Embedding Features


In [None]:
import gensim

model = gensim.models.Word2Vec.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin', binary=True)  

# Context-Aware Features
Here we compute not only the context extraction/definition in the first place but also the corresponding context features afterwards. Also we need to implement proper strategies to cope with the target occuring multiple times in the sentence. To avoid mistakes, we should use the actual start and end tags from the dataset.

### a. Context-Token Aggregation
First we define how feature values of multiple context-tokens should be aggreagated.

In [181]:
from nltk.tokenize import word_tokenize

def agg_ctx_feat_num_average(tokens, func_feature, *args):
    return np.mean([func_feature(token, *args) for token in tokens])

def agg_ctx_feat_num_median(tokens, func_feature, *args):
    return np.median([func_feature(token, *args) for token in tokens])

def agg_ctx_feat_num_max(tokens, func_feature, *args):
    return np.max([func_feature(token, *args) for token in tokens])

def agg_ctx_feat_num_min(tokens, func_feature, *args):
    return np.min([func_feature(token, *args) for token in tokens])

def agg_ctx_feat_num_sum(tokens, func_feature, *args):
    return np.sum([func_feature(token, *args) for token in tokens])

### b. Context Definition
Here we compute different kinds of context definitions. For example, as a baseline we extract all tokens from the sentence except the target. A second approach is to use a n preceeding or n succeding tokens, or a combined window apporach were we extract n tokens preceeding and succeding of the target. A more sophisticated apporach involves dependency parsing of the sentence and applying different extraction heuristics. Finally we also implement a context extraction approach exploting FrameNet semantic parsing.

In [None]:
from collections import Counter

def mult_target(sentence, target):
    counter = Counter(word_tokenize(sentence))
    targets = word_tokenize(target)
    return np.sum([counter[target] for target in targets]) / len(targets)

df['mult_target'] = df[['sentence', 'target']].apply(lambda vals : mult_target(*vals), axis = 1)
df[df.mult_target == 4].head()

In [731]:
from nltk.tokenize import word_tokenize
from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.stanford import StanfordNeuralDependencyParser
import os

java_path = "C:/Program Files (x86)/Java/jdk1.8.0_144/bin/java.exe"
os.environ['JAVAHOME'] = java_path
path_to_jar = 'resources/stanford-dependency-parser/stanford-parser.jar'
path_to_models_jar = 'resources/stanford-dependency-parser/stanford-parser-3.9.1-models.jar'

dependencyParser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

def post_process_ctx(context):
    return [token for token in context if token.isalnum()]

def preprocess_target(target):
    return target.strip()

def target_index_char_based(start, end, ctx_tokens):
    size = np.sum([len(token) for token in ctx_tokens]) + len(ctx_tokens)
    target_pos = (start + end) / 2
    target_pos_rel = target_pos / size
    return int(target_pos_rel * len(post_process_ctx(ctx_tokens)))

def targets_with_index(start, end, context):
    curr_pos = 0
    targets = []
    for index, token in enumerate(word_tokenize(context), 1):
        targets.append((token, index, curr_pos, (curr_pos + len(token))))
        curr_pos += len(token) + 1
    print(targets)
    return [(target[0], target[1]) for target in targets \
            if target[2] >= start and target[3] <= end]

def dependency_parse(sentence):
    dependency_parser = dependencyParser.raw_parse(sentence)
    dependencies = []
    parsetree = list(dependency_parser)[0]
    print(parsetree)
    for index, node in parsetree.nodes.items():
        for relation, dependant in parsetree.nodes[index]['deps'].items():
            triple = ((node['word'], index), relation, \
                      (parsetree.nodes[dependant[0]]['word'], dependant[0]))
            if relation != 'root': dependencies.append(triple)
    return dependencies

def ctx_extraction_all(context, target):
    ctx_tokens = word_tokenize(context)
    if target in ctx_tokens:
        ctx_tokens.remove(target)
    return ctx_tokens

def ctx_extraction_all_filtered(context, target):
    ctx_tokens = word_tokenize(context)
    post_ctx_tokens = post_process_ctx(ctx_tokens)
    if target in ctx_tokens:
        ctx_tokens.remove(target)
    return post_process_ctx

def ctx_extraction_window_pre_n(context, target, start, end, n = 3):
    target = preprocess_target(target)
    ctx_tokens = word_tokenize(context[:start])
    post_ctx_tokens = post_process_ctx(ctx_tokens)
    return post_ctx_tokens[-n:]

def ctx_extraction_window_suc_n(context, target, start, end, n = 3):
    target = preprocess_target(target)
    ctx_tokens = word_tokenize(context[end:])
    post_ctx_tokens = post_process_ctx(ctx_tokens)
    return post_ctx_tokens[:n]

def ctx_extraction_window_pre_suc_n(context, target, start, end, n = 3):
    ctx_tokens_pre = ctx_extraction_window_pre_n(context, target, start, end, n)
    ctx_tokens_suc = ctx_extraction_window_suc_n(context, target, start, end, n)
    ctx_tokens_pre.extend(ctx_tokens_suc)
    return ctx_tokens_pre

def ctx_extraction_dep_in(context, target, start, end):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    return list(set([triple[0][0] for triple in triples if triple[2] in targets]))

def ctx_extraction_dep_out(context, target, start, end):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    return list(set([triple[2][0] for triple in triples if triple[0] in targets]))

def ctx_extraction_dep_in_out(context, target, start, end):
    ctx_tokens_in = ctx_extraction_dep_in(context, target, start, end)
    ctx_tokens_out = ctx_extraction_dep_out(context, target, start, end)
    ctx_tokens_in.extend(ctx_tokens_out)
    return list(set(ctx_tokens_in))

def ctx_extraction_dep_recu_in_n_steps(context, target, start, end, n = 2):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    result_tokens = []
    curr_target = targets
    for step in range(0, n):
        step_result = [triple[0] for triple in triples 
                       if triple[2] in curr_target]
        curr_target = list(set(step_result))
        result_tokens.extend(step_result)
    return list(set([token[0] for token in result_tokens]))

def ctx_extraction_dep_recu_out_n_steps(context, target, start, end, n = 2):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    result_tokens = []
    curr_target = targets
    for step in range(0, n):
        step_result = [triple[2] for triple in triples 
                       if triple[0] in curr_target]
        curr_target = list(set(step_result))
        result_tokens.extend(step_result)
    return list(set([token[0] for token in result_tokens]))

def ctx_extraction_dep_recu_in_out_n_steps(context, target, start, end, n = 2):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    result_tokens = []
    curr_target = targets
    for step in range(0, n):
        step_result = [triple[2] for triple in triples 
                       if triple[0] in curr_target]
        step_result_out = [triple[0] for triple in triples 
                       if triple[2] in curr_target]
        step_result.extend(step_result_out)
        curr_target = list(set(step_result))
        result_tokens.extend(step_result)
    return list(set([token[0] for token in result_tokens]))

def ctx_extraction_dep_recu_in_cover(context, target, start, end, cover = 0.1):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    ctx_tokens = word_tokenize(context)
    ctx_tokens_post = post_process_ctx(ctx_tokens)
    result_tokens = []
    curr_target = targets
    curr_cover = 0
    while curr_cover < cover:
        step_result = [triple[0] for triple in triples 
                       if triple[2] in curr_target]
        if set(step_result) == set(curr_target):
                break
        curr_target = list(set(step_result))
        result_tokens.extend(step_result)
        curr_cover = len(result_tokens) / len(ctx_tokens_post)
    return list(set([token[0] for token in result_tokens]))

def ctx_extraction_dep_recu_out_cover(context, target, start, end, cover = 0.1):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    ctx_tokens = word_tokenize(context)
    ctx_tokens_post = post_process_ctx(ctx_tokens)
    result_tokens = []
    curr_target = targets
    curr_cover = 0
    while curr_cover < cover:
        step_result = [triple[2] for triple in triples 
                       if triple[0] in curr_target]
        if set(step_result) == set(curr_target):
                break
        curr_target = list(set(step_result))
        result_tokens.extend(step_result)
        curr_cover = len(result_tokens) / len(ctx_tokens_post)
    return list(set([token[0] for token in result_tokens]))

def ctx_extraction_dep_recu_in_out_cover(context, target, start, end, cover = 0.1):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    ctx_tokens = word_tokenize(context)
    ctx_tokens_post = post_process_ctx(ctx_tokens)
    result_tokens = []
    curr_target = targets
    curr_cover = 0
    while curr_cover < cover:
        step_result = [triple[2] for triple in triples 
                       if triple[0] in curr_target]
        step_result_out = [triple[0] for triple in triples 
                       if triple[2] in curr_target]
        step_result.extend(step_result_out)
        if set(step_result) == set(curr_target):
                break
        curr_target = list(set(step_result))
        result_tokens.extend(step_result)
        curr_cover = len(result_tokens) / len(ctx_tokens_post)
    return list(set([token[0] for token in result_tokens]))

In [732]:
dependency_parse('This is a simple sentence')

defaultdict(<function DependencyGraph.__init__.<locals>.<lambda> at 0x0000024CA072F378>,
            {0: {'address': 0,
                 'ctag': 'TOP',
                 'deps': defaultdict(<class 'list'>, {'root': [5]}),
                 'feats': None,
                 'head': None,
                 'lemma': None,
                 'rel': None,
                 'tag': 'TOP',
                 'word': None},
             1: {'address': 1,
                 'ctag': 'DT',
                 'deps': defaultdict(<class 'list'>, {}),
                 'feats': '_',
                 'head': 5,
                 'lemma': '_',
                 'rel': 'nsubj',
                 'tag': 'DT',
                 'word': 'This'},
             2: {'address': 2,
                 'ctag': 'VBZ',
                 'deps': defaultdict(<class 'list'>, {}),
                 'feats': '_',
                 'head': 5,
                 'lemma': '_',
                 'rel': 'cop',
                 'tag': 'VBZ',
           

[(('sentence', 5), 'nsubj', ('This', 1)),
 (('sentence', 5), 'cop', ('is', 2)),
 (('sentence', 5), 'det', ('a', 3)),
 (('sentence', 5), 'amod', ('simple', 4))]

In [None]:
sentence = "Normally, the land will be passed down by future generations in a way " + \
             "that recognizes the community's traditional connection to that country "
target = 'passed'

print('ctx_etraction_all:')
print(ctx_extraction_all_filtered(sentence, target))

print('ctx_extraction_window_pre_n:')
print(ctx_extraction_window_pre_n(sentence, "Normally"))
print(ctx_extraction_window_pre_n(sentence, "the"))
print(ctx_extraction_window_pre_n(sentence, "land"))
print(ctx_extraction_window_pre_n(sentence, target, n = 5))

print('ctx_extraction_window_suc_n:')
print(ctx_extraction_window_suc_n(sentence, "country"))
print(ctx_extraction_window_suc_n(sentence, "to"))
print(ctx_extraction_window_suc_n(sentence, "connection"))
print(ctx_extraction_window_suc_n(sentence, "community", n = 5))

print('ctx_extraction_window_pre_suc_n:')
print(ctx_extraction_window_pre_suc_n(sentence, "passed"))
print(ctx_extraction_window_pre_suc_n(sentence, "the"))
print(ctx_extraction_window_pre_suc_n(sentence, "to"))

print('ctx_extraction_dep_in:')
print(ctx_extraction_dep_in(sentence, "land"))

print('ctx_extraction_dep_out:')
print(ctx_extraction_dep_out(sentence, target))
print(ctx_extraction_dep_out(sentence, "land"))

print('ctx_extraction_dep_in_out:')
print(ctx_extraction_dep_in_out(sentence, "land"))

print('ctx_extraction_dep_recu_in_n_steps:')
print(ctx_extraction_dep_recu_in_n_steps(sentence, "the", n = 3))

print('ctx_extraction_dep_recu_out_n_steps:')
print(ctx_extraction_dep_recu_out_n_steps(sentence, "the"))

print('ctx_extraction_dep_recu_in_out_n_steps:')
print(ctx_extraction_dep_recu_in_out_n_steps(sentence, "the"))

print('ctx_extraction_dep_recu_in_cover:')
print(ctx_extraction_dep_recu_in_cover(sentence, "the", cover=0.1))

print('ctx_extraction_dep_recu_out_cover:')
print(ctx_extraction_dep_recu_out_cover(sentence, "the", cover=0.1))

print('ctx_extraction_dep_recu_in_out_cover:')
print(ctx_extraction_dep_recu_in_out_cover(sentence, "the", cover=0.1))

### c. Context Extraction

After we defined all the context extraction approaches, we can apply them on the actual dataset. To do so, we first extract all the distinct sentences from the actual training set and create a new dataframe containing only the sentence ids, the sentence, the target and all the computed contexts. This also makes it easier to integrate context extraction functions implemented in other languages. Afterwards we can compute the context features and join them back with the target features dataframe.

In [None]:
import numpy as np

df_context = df.loc[0:1, ['id', 'sentence', 'target', 'start', 'end']]

df_context['ctx_extraction_window_pre_n'] = df_context.apply(lambda columns : 
                                        ctx_extraction_window_pre_n(columns['sentence'], columns['target'], \
                                                                   columns['start'], columns['end']), axis = 1)

df_context['ctx_extraction_window_suc_n'] = df_context.apply(lambda columns : 
                                        ctx_extraction_window_suc_n(columns['sentence'], columns['target'], \
                                                                   columns['start'], columns['end']), axis = 1)

df_context['ctx_extraction_window_pre_suc_n'] = df_context.apply(lambda columns : 
                                        ctx_extraction_window_pre_suc_n(columns['sentence'], columns['target'], \
                                                                   columns['start'], columns['end']), axis = 1)

df_context['ctx_extraction_dep_in'] = df_context.apply(lambda columns : 
                                        ctx_extraction_dep_in(columns['sentence'], columns['target'], \
                                                                   columns['start'], columns['end']), axis = 1)

df_context['ctx_extraction_dep_out'] = df_context.apply(lambda columns : 
                                        ctx_extraction_dep_out(columns['sentence'], columns['target'], \
                                                                   columns['start'], columns['end']), axis = 1)
# 1. Compute dep_in_out using defined function
df_context['ctx_extraction_dep_in_out'] = df_context.apply(lambda columns : 
                                        ctx_extraction_dep_in_out(columns['sentence'], columns['target'], \
                                                                   columns['start'], columns['end']), axis = 1)

# 2. Compute dep_in_out by combining precomputed results
df_context['ctx_extraction_dep_in_out_dir'] = df_context[['ctx_extraction_dep_in', \
                                                      'ctx_extraction_dep_out']].apply(lambda vals : vals[0]+vals[1], axis=1)


df_context['ctx_extraction_dep_recu_in_n_steps']  = df_context.apply(lambda columns : 
                                        ctx_extraction_dep_recu_in_n_steps(columns['sentence'], columns['target'], \
                                        columns['start'], columns['end'], n=2), axis = 1)

df_context['ctx_extraction_dep_recu_out_n_steps']  = df_context.apply(lambda columns : 
                                        ctx_extraction_dep_recu_out_n_steps(columns['sentence'], columns['target'], \
                                        columns['start'], columns['end'], n=2), axis = 1)


df_context['ctx_extraction_dep_recu_in_out_n_steps']  = df_context.apply(lambda columns : 
                                        ctx_extraction_dep_recu_in_out_n_steps(columns['sentence'], columns['target'], \
                                        columns['start'], columns['end'], n=2), axis = 1)


df_context['ctx_extraction_dep_recu_in_cover']  = df_context.apply(lambda columns : 
                                        ctx_extraction_dep_recu_in_cover(columns['sentence'], columns['target'], \
                                        columns['start'], columns['end'], cover=0.2), axis = 1)


df_context['ctx_extraction_dep_recu_out_cover']  = df_context.apply(lambda columns : 
                                        ctx_extraction_dep_recu_out_cover(columns['sentence'], columns['target'], \
                                        columns['start'], columns['end'], cover=0.2), axis = 1)


df_context['ctx_extraction_dep_recu_in_out_cover']  = df_context.apply(lambda columns : 
                                        ctx_extraction_dep_recu_in_out_cover(columns['sentence'], columns['target'], \
                                        columns['start'], columns['end'], cover=0.2), axis = 1)

df_context

### d. Context Features
After defining all the context definitions and extracting the different kinds of contexts from the sentence, we compute features on the context words. Therefore we first define which of the precomputed contexts to use.

#### (1) Readability Measures
Here we implement some of the most popular and well-known historical readability measures. Most of them need multiple sentences to compute them properly, however, we will apply them on the extracted context.

In [108]:
def readability_flesch_kincaid(ctx_len, ctx_sum_syllables):
    return 206.835 - (1.015 * ctx_len) - (84.6 * (ctx_sum_syllables / ctx_len))


In [None]:
df_context['context'] = df_context['ctx_extraction_window_pre_suc_n']

df_context['ctx_num_tokens'] = df_context.context.apply(lambda context : len(context))
df_context['ctx_avg_length'] = df_context.context.apply(lambda context : agg_ctx_feat_num_average(context, len))
df_context['ctx_sum_syllables'] = df_context.context.apply(lambda context : agg_ctx_feat_num_sum(context, num_syllables))
df_context['ctx_avg_word_freq_wiki'] = df_context.context.apply(lambda context : \
                                                    agg_feat_num_average(context, get_dict_count, word_freq_wiki))
df_context.head()

In [121]:
from textatistic import Textatistic

df_context['readability_flesch_kincaid'] = df_context[['num_ctx_tokens', 'sum_ctx_syllables']] \
                            .apply(lambda vals : readability_flesch_kincaid(vals[0], vals[1]), axis = 1)
df_context['rb_dalechall_score'] = df_context.context.apply(lambda context : \
                                                Textatistic(' '.join(context) + '.').dalechall_score)
df_context['rb_flesch_score'] = df_context.context.apply(lambda context : \
                                                Textatistic(' '.join(context) + '.').flesch_score)
df_context['rb_fleschkincaid_score'] = df_context.context.apply(lambda context : \
                                                Textatistic(' '.join(context) + '.').fleschkincaid_score)
df_context['rb_gunningfog_score'] = df_context.context.apply(lambda context : \
                                                Textatistic(' '.join(context) + '.').gunningfog_score)
df_context['rb_polysyblword_count'] = df_context.context.apply(lambda context : \
                                                Textatistic(' '.join(context) + '.').polysyblword_count)
df_context['rb_smog_score'] = df_context.context.apply(lambda context : \
                                                Textatistic(' '.join(context) + '.').smog_score)
df_context['rb_sybl_count'] = df_context.context.apply(lambda context : \
                                                Textatistic(' '.join(context) + '.').sybl_count)

df_context[['target', 'context', 'rb_dalechall_score', 'rb_flesch_score', 'rb_fleschkincaid_score', \
            'rb_gunningfog_score', 'rb_polysyblword_count', 'rb_smog_score', 'rb_sybl_count']].head()

Unnamed: 0,target,context,rb_dalechall_score,rb_flesch_score,rb_fleschkincaid_score,rb_gunningfog_score,rb_polysyblword_count,rb_smog_score,rb_sybl_count
0,passed,"[land, will, be, down, to, future]",6.565767,102.045,0.516667,2.4,0,3.1291,7
1,land,"[Normally, the, will, be, passed]",7.0425,100.24,0.52,2.0,0,3.1291,6


# Feature Importance
Here we compute individual feature importance based on different metrics. For example, we implement and compute the F-Score, providing an idea of the discrimination power the feature has.

In [None]:
def feat_importance_f_score(dataframe, feat_name, label_name):
    df = dataframe.copy()
    mean_feat = np.mean(df.loc[:, [feat_name]])[0]
    means = df.loc[: , [feat_name, label_name]].groupby(label_name).mean().reset_index()
    mean_negativ = means.loc[means[label_name] == 0, [feat_name]][feat_name][0]
    mean_positiv = means.loc[means[label_name] == 1, [feat_name]][feat_name][1]
    # Compute the sum of deviations of the class mean from the overall mean
    class_mean_devs = (mean_positiv - mean_feat)**2 + (mean_negativ - mean_feat)**2
    # Compute neagtive instance based values
    neg_inst = df.loc[df[label_name] == 0, [feat_name]]
    std_dev_neg = (np.sum((neg_inst - mean_negativ)**2) / (len(neg_inst) - 1))[feat_name]
    #Compute positive instance based values
    pos_inst = df.loc[df[label_name] == 1, [feat_name]]
    std_dev_pos = (np.sum((pos_inst - mean_positiv)**2) / (len(pos_inst) - 1))[feat_name]
    return class_mean_devs / (std_dev_neg + std_dev_pos)

def compute_all_feat_importance_metrics(dataframe, label_name):
    pass
    

df_feat = df.drop(['id', 'sentence', 'target', 'nat', 'non_nat', 
                   'nat_marked', 'non_nat_marked', 'prob'], axis = 1)
print(df_feat.mean())
print(df_feat.groupby('binary').mean())

In [None]:
'id', 'sentence', "start", "end", "target", 
              "nat", "non_nat", "nat_marked", "non_nat_marked", "binary", "prob"]