## imports

In [1]:
from rai_app import suggests
import string
from functools import lru_cache
import pandas as pd

## functions

In [25]:


def lev_dist(a, b):
    '''
    Calculate the Levenshtein (edit) distance between two input strings
    a and b, i.e. the minimum number of single-character insertions,
    deletions and replacements needed to turn one string into the other.

    The computation is iterative (two rows of the dynamic-programming
    table), so it does not depend on the interpreter's recursion limit and
    works for long strings as well.

    params:
        a (String) : The first string you want to compare
        b (String) : The second string you want to compare

    returns:
        int : The distance between string a and b (0 for identical strings).

    example:
        a = 'stamp'
        b = 'stomp'
        lev_dist(a,b)
        >> 1
    '''
    # The distance is symmetric; iterating over the longer string in the
    # outer loop keeps the stored rows as short as possible.
    if len(a) < len(b):
        a, b = b, a

    # previous[j] holds the distance between the already processed prefix of
    # a and b[:j]. For the empty prefix of a this is simply j insertions.
    previous = list(range(len(b) + 1))

    for i, char_a in enumerate(a, start=1):
        # Turning a[:i] into the empty string takes i deletions.
        current = [i]
        for j, char_b in enumerate(b, start=1):
            if char_a == char_b:
                # no change required
                current.append(previous[j - 1])
            else:
                current.append(1 + min(
                    current[j - 1],    # insert character
                    previous[j],       # delete character
                    previous[j - 1],   # replace character
                ))
        previous = current

    return previous[-1]

#source: https://towardsdatascience.com/text-similarity-w-levenshtein-distance-in-python-2f7478986e75

In [24]:
def lev_dist_list(term, root='baerbock', threshold=5):
    '''
    Keep a suggestion only if at least one of its words is close to a
    root word in terms of Levenshtein distance.

    params:
        term (String) : The suggestion to check; it is split on single
                        spaces into words
        root (String) : The word every part of the term is compared to
                        (default: 'baerbock')
        threshold (int) : A word counts as close when its distance to root
                          is strictly smaller than this value (default: 5)

    returns:
        The unchanged term if any of its words is close to root,
        otherwise None.

    example:
        lev_dist_list('baerbock alter')
        >> 'baerbock alter'
        lev_dist_list('xbox')
        >> None
    '''
    # Generator instead of a list: any() stops at the first close word.
    if any(lev_dist(root, word) < threshold for word in term.split(' ')):
        return term
    return None

In [32]:
# Quick sanity check of lev_dist on a small example.
lev_dist('baum','klawum')

3

In [46]:
def lev_dist_row(row, threshold=3):
    '''
    Decide whether the target of an edge row is still related to its root.

    Every word of row.root is compared with every word of row.target; the
    row is kept as soon as one pair has a Levenshtein distance strictly
    below the threshold.

    params:
        row : A row-like object with the string attributes `root` and
              `target` (e.g. a DataFrame row passed by
              df.apply(lev_dist_row, axis=1))
        threshold (int) : A word pair counts as a match when its distance
                          is strictly smaller than this value (default: 3)

    returns:
        row.target if any word pair matches, otherwise the marker string
        'delete'.
    '''
    target_words = row.target.split()
    # Generator instead of a list: any() stops at the first matching pair.
    is_related = any(
        lev_dist(root_word, target_word) < threshold
        for root_word in row.root.split()
        for target_word in target_words
    )
    return row.target if is_related else 'delete'

In [51]:
#df_test
#data = {'root':['christian eriksen','christian eriksen'],'target':['christian erikesn baum test','trizion seifokmon der allerechte test']}
#df_fake = pd.DataFrame(data)
#df_fake
#df_test = pd.concat([df_test,df_fake])
#df_test
#df_test.apply(lev_dist_row, axis=1)

0            christian eriksen aktuell
1              christian eriksen alter
2               christian eriksen arzt
3     christian eriksen aktuelle teams
4              christian eriksen agent
5             christian eriksen attack
6    christian eriksen and simon kjaer
7          christian eriksen and covid
8     christian eriksen and harry kane
9         christian eriksen and family
0          christian erikesn baum test
1                               delete
dtype: object

## testing

In [2]:
# Fetch the suggestions for the query 'baerbock' with source='google'.
gd = suggests.get_suggests('baerbock', source='google')

2022-04-29 12:04:18,300 | 15354 | INFO | logger | google | baerbock 
2022-04-29 12:04:19,106 | 15354 | INFO | logger | google | baerbock 
2022-04-29 12:04:19,992 | 15354 | INFO | logger | google | baerbock 


In [3]:
gd['suggests']

['baerbock außenministerin',
 'baerbock abgenommen',
 'baerbock assange',
 'baerbock amt',
 'baerbock auto',
 'baerbock außenpolitik',
 'baerbock ard',
 'baerbock an der front',
 'baerbock ansprache',
 'baerbock afghanistan',
 'baerbock beliebtheit',
 'baerbock buch',
 'baerbock bademantel',
 'baerbock bilder',
 'baerbock bei putin',
 'baerbock besuch ukraine',
 'baerbock biografie',
 'baerbock besuch russland',
 'baerbock bild',
 'baerbock beliebt',
 'baerbock china',
 'baerbock cv',
 'baerbock cnn',
 'baerbock cairo',
 'baerbock cartoon',
 'baerbock chemnitz',
 'baerbock clinton',
 'baerbock cdu',
 'baerbock che guevara',
 'baerbock comic']

In [6]:
# NOTE(review): `g` is not assigned anywhere in the visible notebook. The output
# matches gd['suggests'], so this cell presumably relied on a leftover kernel
# variable and will fail after "Restart & Run All" - confirm and use `gd`.
g['suggests']

['baerbock außenministerin',
 'baerbock abgenommen',
 'baerbock assange',
 'baerbock amt',
 'baerbock auto',
 'baerbock außenpolitik',
 'baerbock ard',
 'baerbock an der front',
 'baerbock ansprache',
 'baerbock afghanistan',
 'baerbock beliebtheit',
 'baerbock buch',
 'baerbock bademantel',
 'baerbock bilder',
 'baerbock bei putin',
 'baerbock besuch ukraine',
 'baerbock biografie',
 'baerbock besuch russland',
 'baerbock bild',
 'baerbock beliebt',
 'baerbock china',
 'baerbock cv',
 'baerbock cnn',
 'baerbock cairo',
 'baerbock cartoon',
 'baerbock chemnitz',
 'baerbock clinton',
 'baerbock cdu',
 'baerbock che guevara',
 'baerbock comic']

In [4]:
# Fetch the suggestions for the query 'baerbock' with source='bing'.
bd = suggests.get_suggests('baerbock', source='bing')

2022-04-29 12:04:45,256 | 15354 | INFO | logger | bing | baerbock 
2022-04-29 12:04:46,599 | 15354 | INFO | logger | bing | baerbock 
2022-04-29 12:04:48,071 | 15354 | INFO | logger | bing | baerbock 


In [5]:
bd['suggests']

['baerbock annalena',
 'baerbock alter',
 'baerbock aktuell',
 'baerbock ausbildung',
 'baerbock annalena ehemann',
 'baerbock annalena privat',
 'baerbock afghanistan',
 'baerbock anne will',
 'baerbock buch',
 'baerbock bilder',
 'baerbock bosnien',
 'baerbock bikini',
 'baerbock beine',
 'baerbock blinken',
 'baerbock balkan',
 'baerbock beruf',
 'baerbock china',
 'baerbock cv',
 'baerbock cnn',
 'baerbock corona bonus',
 'baerbock corona',
 'baerbock cartoon',
 'baerbock chancen',
 'baerbock china politik']

In [6]:
# NOTE(review): the only visible assignment of `b` is the plain list further
# down ("trimming with levenshtein distance"), which cannot be indexed with
# ['suggests']. This cell presumably used an earlier result object left in the
# kernel - confirm and reference that variable (e.g. `bd`) explicitly.
b['suggests']

['baerbock annalena',
 'baerbock alter',
 'baerbock aktuell',
 'baerbock ausbildung',
 'baerbock annalena ehemann',
 'baerbock annalena privat',
 'baerbock afghanistan',
 'baerbock anne will',
 'baerbock buch',
 'baerbock bilder',
 'baerbock bikini',
 'baerbock bei putin',
 'baerbock beine',
 'baerbock blinken',
 'baerbock balkan',
 'baerbock beruf',
 'baerbock china',
 'baerbock cv',
 'baerbock corona bonus',
 'baerbock corona',
 'baerbock cartoon',
 'baerbock chancen',
 'baerbock comic',
 'baerbock corona hilfe',
 'baerbock droht russland',
 'baerbock dumme zitate',
 'baerbock dienstwagen',
 'baerbock droht',
 'baerbock droht putin',
 'baerbock doktorarbeit',
 'baerbock dissertation',
 'baerbock donbass',
 'baerbock englisch',
 'baerbock ehemann',
 'baerbock english',
 'baerbock eltern',
 'baerbock englisch video',
 'baerbock email',
 'baerbock englisch youtube',
 'baerbock eiffelturm',
 'baerbock familie',
 'baerbock facebook',
 'baerbock feet',
 'baerbock fotos',
 'baerbock fauxpas

In [5]:
# Crawl a suggestion tree for 'baerbock' with source='bing', limited to max_depth=1.
tree = suggests.get_suggests_tree('baerbock', source='bing', max_depth=1)

2022-07-08 16:09:30,965 | 2156 | INFO | rai_app.suggests.logger | bing | baerbock 
2022-07-08 16:09:31,695 | 2156 | INFO | rai_app.suggests.logger | bing | baerbock 
2022-07-08 16:09:32,508 | 2156 | INFO | rai_app.suggests.logger | bing | baerbock 
2022-07-08 16:09:32,957 | 2156 | INFO | rai_app.suggests.logger | bing | baerbock annalena 
2022-07-08 16:09:33,719 | 2156 | INFO | rai_app.suggests.logger | bing | baerbock annalena 
2022-07-08 16:09:34,434 | 2156 | INFO | rai_app.suggests.logger | bing | baerbock annalena 
2022-07-08 16:09:35,115 | 2156 | INFO | rai_app.suggests.logger | bing | baerbock außenministerin 
2022-07-08 16:09:35,824 | 2156 | INFO | rai_app.suggests.logger | bing | baerbock außenministerin 
2022-07-08 16:09:36,318 | 2156 | INFO | rai_app.suggests.logger | bing | baerbock außenministerin 
2022-07-08 16:09:36,891 | 2156 | INFO | rai_app.suggests.logger | bing | baerbock alter 
2022-07-08 16:09:37,525 | 2156 | INFO | rai_app.suggests.logger | bing | baerbock alter 


In [6]:
# Tabulate the crawled tree; judging by the output, each row is one crawled
# query ('qry') with its list of suggestions.
df2 = pd.DataFrame(tree)
df2

Unnamed: 0,qry,datetime,source,data,suggests,self_loops,tags,depth,root,crawl_id
0,baerbock,2022-07-08 14:09:30.395067,bing,"[<ul class=""sa_drw"" id=""sa_ul"" role=""listbox"" ...","[baerbock annalena, baerbock außenministerin, ...",[],[],0,baerbock,
1,baerbock annalena,2022-07-08 14:09:32.620720,bing,"[<ul class=""sa_drw"" id=""sa_ul"" role=""listbox"" ...","[baerbock annalena aktuell, baerbock annalena ...",[],[],1,baerbock,
2,baerbock außenministerin,2022-07-08 14:09:34.549337,bing,"[<ul class=""sa_drw"" id=""sa_ul"" role=""listbox"" ...","[baerbock außenministerin satire, baerbock auß...",[],[],1,baerbock,
3,baerbock alter,2022-07-08 14:09:36.464923,bing,"[<ul class=""sa_drw"" id=""sa_ul"" role=""listbox"" ...","[annalena baerbock alter, annalena baerbock ki...",[],[],1,baerbock,
4,baerbock aktuell,2022-07-08 14:09:38.409418,bing,"[<ul class=""sa_drw"" id=""sa_ul"" role=""listbox"" ...","[annalena baerbock aktuell, baerbock beliebthe...",[],[],1,baerbock,
5,baerbock annalena lebenslauf,2022-07-08 14:09:39.823340,bing,"[<ul class=""sa_drw"" id=""sa_ul"" role=""listbox"" ...","[annalena baerbock lebenslauf und ausbildung, ...",[],[],1,baerbock,
6,baerbock assange,2022-07-08 14:09:41.834980,bing,"[<ul class=""sa_drw"" id=""sa_ul"" role=""listbox"" ...","[annalena baerbock assange, bedroht, berufungs...",[],[],1,baerbock,
7,baerbock annalena privat,2022-07-08 14:09:43.907234,bing,"[<ul class=""sa_drw"" id=""sa_ul"" role=""listbox"" ...","[annalena baerbock privat auto, baerbock annal...",[],[],1,baerbock,
8,baerbock ausbildung,2022-07-08 14:09:45.459834,bing,"[<ul class=""sa_drw"" id=""sa_ul"" role=""listbox"" ...","[baerbock ausbildung abschluss, annalena baerb...",[],[],1,baerbock,
9,baerbock bilder,2022-07-08 14:09:47.435869,bing,"[<ul class=""sa_drw"" id=""sa_ul"" role=""listbox"" ...","[annalena baerbock bilder, bilder von annalena...",[],[],1,baerbock,


In [3]:
# NOTE(review): the output below shows source 'google', while the visible cell
# that builds `tree` requests source='bing'. This cell was apparently run
# against an earlier `tree` (out-of-order execution) - confirm before relying on it.
df = pd.DataFrame(tree)
df

Unnamed: 0,qry,datetime,source,data,suggests,self_loops,tags,depth,root,crawl_id
0,baerbock,2022-07-08 14:05:08.180283,google,"[baerbock a, [[baerbock a<b>ssange</b>, 0, [51...","[baerbock assange, baerbock außenministerin, b...",[],"{'q': '_aID29LNBMkNdvT1HGcufNq-kFE', 't': {'bp...",0,baerbock,
1,baerbock assange,2022-07-08 14:05:09.710532,google,"[baerbock assange a, [[baerbock assange a<b>us...","[baerbock assange auslieferung, annalena baerb...",[],"{'i': 'baerbock assange a', 'q': '2-Pdr-yU7hi8...",1,baerbock,
2,baerbock außenministerin,2022-07-08 14:05:11.553655,google,"[baerbock außenministerin a, [[baerbock außenm...","[baerbock außenministerin alter, baerbock als ...",[],"{'i': 'baerbock außenministerin a', 'q': 'pGBQ...",1,baerbock,
3,baerbock aachen,2022-07-08 14:05:13.002763,google,"[baerbock aachen a, [[a<b>nnalena </b>baerbock...","[annalena baerbock aachen, baerbock aachen bra...",[],"{'i': 'baerbock aachen a', 'q': '6F2buxiUTvVqA...",1,baerbock,
4,baerbock abgenommen,2022-07-08 14:05:14.831184,google,"[baerbock abgenommen a, [[a<b>nnalena </b>baer...","[annalena baerbock abgenommen, baerbock abgeno...",[],"{'i': 'baerbock abgenommen a', 'q': '2LGtFEE7x...",1,baerbock,
5,baerbock aktuell,2022-07-08 14:05:16.302184,google,"[baerbock aktuell a, [[baerbock<b> heute </b>a...","[baerbock heute abend, baerbock heute ard, bae...",[],"{'i': 'baerbock aktuell a', 'q': 'OTFzBCaMS3qr...",1,baerbock,
6,baerbock außenminister,2022-07-08 14:05:18.122192,google,"[baerbock außenminister a, [[baerbock a<b>ls <...","[baerbock als außenminister, annalena baerbock...",[],"{'i': 'baerbock außenminister a', 'q': 'rDJMvh...",1,baerbock,
7,baerbock afrika,2022-07-08 14:05:20.060159,google,"[baerbock afrika a, [[a<b>nnalena </b>baerbock...","[annalena baerbock afrika, baerbock afrika bes...",[],"{'i': 'baerbock afrika a', 'q': '97KMdaHBO064X...",1,baerbock,
8,baerbock ausgeladen,2022-07-08 14:05:21.853188,google,"[baerbock ausgeladen a, [[baerbock ausgeladen ...","[baerbock ausgeladen berlin, baerbock ausgelad...",[],"{'q': 'QUDd5j8fzLvxiDbEov_XFKmiJPQ', 't': {'bp...",1,baerbock,
9,baerbock auto,2022-07-08 14:05:23.480739,google,"[baerbock auto a, [[a<b>nnalena </b>baerbock a...","[annalena baerbock auto, annalena baerbock aut...",[],"{'i': 'baerbock auto a', 'q': '85q6gOinQecr-Yc...",1,baerbock,


In [7]:
# export as csv: semicolon-separated, written to 'test_tree.csv' relative to
# the current working directory
df = pd.DataFrame(tree)
df.to_csv('test_tree.csv', sep=';')

In [36]:
# NOTE(review): the line that loads df1 is commented out, so this cell only
# works while df1 is still present in the kernel; it fails on a fresh
# "Restart & Run All".
#df1 = pd.read_pickle(r'rai_app/export_file/export_test.pickle')
# Small sample (first 10 rows) used to try out lev_dist_row.
df_test = df1.head(10)
df_test

Unnamed: 0,root,edge,source,target,rank,depth,search_engine,datetime,grandparent,parent,source_add,target_add
0,christian eriksen,"(christian eriksen, christian eriksen aktuell)",christian eriksen,christian eriksen aktuell,1,0,google,2022-07-08 14:15:59.326792,,,christian eriksen,aktuell
1,christian eriksen,"(christian eriksen, christian eriksen alter)",christian eriksen,christian eriksen alter,2,0,google,2022-07-08 14:15:59.326792,,,christian eriksen,alter
2,christian eriksen,"(christian eriksen, christian eriksen arzt)",christian eriksen,christian eriksen arzt,3,0,google,2022-07-08 14:15:59.326792,,,christian eriksen,arzt
3,christian eriksen,"(christian eriksen, christian eriksen aktuelle...",christian eriksen,christian eriksen aktuelle teams,4,0,google,2022-07-08 14:15:59.326792,,,christian eriksen,aktuelle teams
4,christian eriksen,"(christian eriksen, christian eriksen agent)",christian eriksen,christian eriksen agent,5,0,google,2022-07-08 14:15:59.326792,,,christian eriksen,agent
5,christian eriksen,"(christian eriksen, christian eriksen attack)",christian eriksen,christian eriksen attack,6,0,google,2022-07-08 14:15:59.326792,,,christian eriksen,attack
6,christian eriksen,"(christian eriksen, christian eriksen and simo...",christian eriksen,christian eriksen and simon kjaer,7,0,google,2022-07-08 14:15:59.326792,,,christian eriksen,and simon kjaer
7,christian eriksen,"(christian eriksen, christian eriksen and covid)",christian eriksen,christian eriksen and covid,8,0,google,2022-07-08 14:15:59.326792,,,christian eriksen,and covid
8,christian eriksen,"(christian eriksen, christian eriksen and harr...",christian eriksen,christian eriksen and harry kane,9,0,google,2022-07-08 14:15:59.326792,,,christian eriksen,and harry kane
9,christian eriksen,"(christian eriksen, christian eriksen and family)",christian eriksen,christian eriksen and family,10,0,google,2022-07-08 14:15:59.326792,,,christian eriksen,and family


## trimming with levenshtein distance

In [9]:
b = ['baerbock annalena',
 'baerbock alter',
 'baerbock aktuell',
 'baerbock ausbildung',
 'baerbock annalena ehemann',
 'baerbock annalena privat',
 'baerbock afghanistan',
 'baerbock anne will',
 'baerbock buch',
 'baerbock bilder',
 'baerbock bikini',
 'baerbock bei putin',
 'baerbock beine',
 'baerbock blinken',
 'baerbock balkan',
 'baerbock beruf',
 'baerbock china',
 'baerbock cv',
 'baerbock corona bonus',
 'baerbock corona',
 'baerbock cartoon',
 'baerbock chancen',
 'baerbock comic',
 'baerbock corona hilfe',
 'baerbock droht russland',
 'baerbock dumme zitate',
 'baerbock dienstwagen',
 'baerbock droht',
 'baerbock droht putin',
 'baerbock doktorarbeit',
 'baerbock dissertation',
 'baerbock donbass',
 'baerbock englisch',
 'baerbock ehemann',
 'baerbock english',
 'baerbock eltern',
 'baerbock englisch video',
 'baerbock email',
 'baerbock englisch youtube',
 'baerbock eiffelturm',
 'baerbock familie',
 'baerbock facebook',
 'baerbock feet',
 'baerbock fotos',
 'baerbock fauxpas',
 'baerbock frisur',
 'baerbock pressefreiheit',
 'baerbock fehler',
 'baerbock gruene',
 'baerbock greenpeace',
 'baerbock gehalt',
 'baerbock g7',
 'baerbock germany',
 'baerbock gestern',
 'baerbock gewicht',
 'baerbock gegen putin',
 'baerbock habeck',
 'baerbock helm',
 'baerbock heute',
 'baerbock haustiere',
 'baerbock homepage',
 'baerbock hunde',
 'baerbock herkunft',
 'baerbock high heels',
 'baerbock israel',
 'baerbock in moskau',
 'baerbock in polen',
 'baerbock in russland',
 'baerbock in paris',
 'baerbock interview',
 'baerbock impfpflicht',
 'baerbock in israel',
 'baerbock jung',
 'baerbock jetzt',
 'baerbock junge dame',
 'baerbock jeans',
 'baerbock juristin',
 'baerbock jugendfoto',
 'baerbock jordanien',
 'baerbock junge welt',
 'baerbock kobold',
 'baerbock kinder',
 'baerbock kritik',
 'baerbock kosovo',
 'baerbock kontakt',
 'baerbock kleider',
 'baerbock kleidung',
 'baerbock kiew',
 'baerbock lebenslauf',
 'baerbock lawrow',
 'baerbock live',
 'baerbock lustig',
 'baerbock lindner',
 'baerbock laschet',
 'baerbock lanz',
 'baerbock laschet scholz',
 'baerbock mann',
 'baerbock moskau',
 'baerbock marokko',
 'baerbock merz',
 'baerbock morgan',
 'baerbock mode',
 'baerbock meme',
 'baerbock mantel moskau',
 'baerbock n wort',
 'baerbock nicht neutral',
 'baerbock nord stream 2',
 'baerbock n-wort',
 'baerbock n word',
 'baerbock n',
 'baerbock nato',
 'baerbock news heute',
 'baerbock ostkokaine',
 'baerbock oder habeck',
 'baerbock olympia',
 'baerbock outfit',
 'baerbock oderbruch',
 'baerbock oberweite',
 'baerbock oberstaufen',
 'baerbock ohrringe',
 'baerbock paris',
 'baerbock plagiat',
 'baerbock polen',
 'baerbock putin',
 'baerbock privat',
 'baerbock pressefreiheit',
 'baerbock peinlich',
 'baerbock pressekonferenz',
 'baerbock quellen',
 'baerbock iq',
 'baerbock eqs',
 'baerbock greenpeace',
 'baerbock gehalt',
 'baerbock annalena',
 'baerbock china',
 'baerbock gestern',
 'baerbock russland',
 'baerbock rede',
 'baerbock rede und',
 'baerbock rede englisch',
 'baerbock rede uno',
 'baerbock rede heute',
 'baerbock rede russland',
 'baerbock rede bundestag',
 'baerbock swift',
 'baerbock serbien',
 'baerbock studium',
 'baerbock skandale',
 'baerbock strache',
 'baerbock sprachfehler',
 'baerbock song',
 'baerbock spricht englisch',
 'baerbock twitter',
 'baerbock trampolin',
 'baerbock termine',
 'baerbock taz',
 'baerbock trampolinspringen',
 'baerbock tagesschau',
 'baerbock trampolin video',
 'baerbock triell',
 'baerbock un rede',
 'baerbock ukraine',
 'baerbock usa',
 'baerbock uno',
 'baerbock und',
 'baerbock und habeck',
 'baerbock ukraine-krise',
 'baerbock umfragewerte',
 'baerbock vita',
 'baerbock versprecher',
 'baerbock video',
 'baerbock versagt',
 'baerbock verspricht sich',
 'baerbock vor der uno',
 'baerbock vor un',
 'baerbock vereidigung',
 'baerbock wikipedia',
 'baerbock witwenrente',
 'baerbock witze',
 'baerbock warschau',
 'baerbock welt',
 'baerbock witwenrente abschaffen',
 'baerbock wohnort',
 'baerbock wahlkreis',
 'xbox',
 'xfinity',
 'xpo',
 'xrp',
 'x22',
 'xcel',
 'xom',
 'xmas',
 'baerbock youtube',
 'baerbock yad vashem',
 'baerbock young global leaders',
 'baerbock young leader',
 'annalena baerbock youtube',
 'baerbock wang yi',
 'annalena baerbock englisch youtube',
 'youtube baerbock englisch',
 'baerbock zitate',
 'baerbock zum russischen angriff',
 'baerbock zug',
 'baerbock zu ukraine',
 'baerbock zdf',
 'baerbock zur witwenrente',
 'baerbock zeit',
 'baerbock zustimmung']

In [12]:
len(b)

208

In [19]:
# Run the filter over every suggestion in b; entries that fail the check
# come back as None.
list(map(lev_dist_list, b))

['baerbock annalena',
 'baerbock alter',
 'baerbock aktuell',
 'baerbock ausbildung',
 'baerbock annalena ehemann',
 'baerbock annalena privat',
 'baerbock afghanistan',
 'baerbock anne will',
 'baerbock buch',
 'baerbock bilder',
 'baerbock bikini',
 'baerbock bei putin',
 'baerbock beine',
 'baerbock blinken',
 'baerbock balkan',
 'baerbock beruf',
 'baerbock china',
 'baerbock cv',
 'baerbock corona bonus',
 'baerbock corona',
 'baerbock cartoon',
 'baerbock chancen',
 'baerbock comic',
 'baerbock corona hilfe',
 'baerbock droht russland',
 'baerbock dumme zitate',
 'baerbock dienstwagen',
 'baerbock droht',
 'baerbock droht putin',
 'baerbock doktorarbeit',
 'baerbock dissertation',
 'baerbock donbass',
 'baerbock englisch',
 'baerbock ehemann',
 'baerbock english',
 'baerbock eltern',
 'baerbock englisch video',
 'baerbock email',
 'baerbock englisch youtube',
 'baerbock eiffelturm',
 'baerbock familie',
 'baerbock facebook',
 'baerbock feet',
 'baerbock fotos',
 'baerbock fauxpas

In [15]:
b

['baerbock annalena',
 'baerbock alter',
 'baerbock aktuell',
 'baerbock ausbildung',
 'baerbock annalena ehemann',
 'baerbock annalena privat',
 'baerbock afghanistan',
 'baerbock anne will',
 'baerbock buch',
 'baerbock bilder',
 'baerbock bikini',
 'baerbock bei putin',
 'baerbock beine',
 'baerbock blinken',
 'baerbock balkan',
 'baerbock beruf',
 'baerbock china',
 'baerbock cv',
 'baerbock corona bonus',
 'baerbock corona',
 'baerbock cartoon',
 'baerbock chancen',
 'baerbock comic',
 'baerbock corona hilfe',
 'baerbock droht russland',
 'baerbock dumme zitate',
 'baerbock dienstwagen',
 'baerbock droht',
 'baerbock droht putin',
 'baerbock doktorarbeit',
 'baerbock dissertation',
 'baerbock donbass',
 'baerbock englisch',
 'baerbock ehemann',
 'baerbock english',
 'baerbock eltern',
 'baerbock englisch video',
 'baerbock email',
 'baerbock englisch youtube',
 'baerbock eiffelturm',
 'baerbock familie',
 'baerbock facebook',
 'baerbock feet',
 'baerbock fotos',
 'baerbock fauxpas

# main testing

In [11]:
def load_data(file):
    '''
    Read the import spreadsheet and derive the query term for every row.

    params:
        file : Path (or any other input accepted by pd.read_excel) of the
               Excel file; it must contain the columns 'Vorname' and 'Name'

    returns:
        DataFrame with the spreadsheet content plus the column 'querrys',
        holding "<Vorname> <Name>" for every row.
    '''
    people = pd.read_excel(file)
    # query term = first name and surname joined by a single space
    full_names = people['Vorname'] + ' ' + people['Name']
    return people.assign(querrys=full_names)

In [12]:
def get_user_input():
    '''
    Interactively ask for the crawl settings and validate the answers.

    Each question is repeated until a usable answer is given, so a typo
    does not abort the run with a ValueError or start a crawl with an
    unknown search engine.

    returns:
        dict with the keys
            'source' (String) : 'google' or 'bing' (surrounding whitespace
                                is stripped, upper case is lowered)
            'max_depth' (int) : the requested maximum search depth (>= 0)
    '''
    valid_sources = ('google', 'bing')

    while True:
        input_source = input('Which search engine? Available selection: "google" or "bing"\n Input: ').strip().lower()
        if input_source in valid_sources:
            break
        print(f'Unknown search engine {input_source!r}. Please enter "google" or "bing".')

    while True:
        raw_depth = input('To what maximum depth should the search be carried out? Note: It is recommended to start with 1 \n Input: ')
        try:
            input_max_depth = int(raw_depth)
        except ValueError:
            print(f'{raw_depth!r} is not a whole number. Please try again.')
            continue
        # A depth is a count of levels, so negative values are rejected.
        if input_max_depth >= 0:
            break
        print('The maximum depth must not be negative. Please try again.')

    return {'source': input_source, 'max_depth': input_max_depth}

In [16]:
def main(import_path='rai_app/import_file/import.xlsx'):
    '''
    Crawl the suggestion tree for every query of the import file and return
    the edges whose target is still related to its root.

    Steps:
        1. load the queries with load_data
        2. ask for search engine and depth with get_user_input
        3. per query: crawl the tree, convert it to an edge list and enrich it
        4. drop every edge for which lev_dist_row returns 'delete'

    params:
        import_path (String) : Excel file with the columns 'Vorname' and
                               'Name' (default: the project import file)

    returns:
        DataFrame with the remaining edges of all queries. The per-query
        indices are kept, so index values can repeat. An empty DataFrame is
        returned when no query produced any edges.
    '''
    file = load_data(import_path)
    user_input = get_user_input()

    # Collect the per-query frames and concatenate once at the end;
    # concatenating inside the loop copies all previous rows on every pass.
    edge_frames = []

    # Crawl queries and save output
    for querry in file['querrys']:
        # Generating a suggestions tree
        tree = suggests.get_suggests_tree(querry.lower(), source=user_input['source'], max_depth=user_input['max_depth'])

        # Reduce to new information obtained in suggestions
        edges = suggests.to_edgelist(tree)
        edges = suggests.add_parent_nodes(edges)
        try:
            edges = edges.apply(suggests.add_metanodes, axis=1)
        except AttributeError as err:
            # NOTE(review): presumably raised when a query yields no usable
            # edge list - confirm. The query is still skipped as before, but
            # no longer silently.
            print(f'Skipping "{querry}": {err}')
            continue
        edge_frames.append(edges)

    if not edge_frames:
        return pd.DataFrame()

    df = pd.concat(edge_frames)
    if df.empty:
        return df

    # lev_dist_row returns the marker 'delete' for unrelated targets. Filter
    # with a positional mask so the repeated index values cannot interfere.
    keep = df.apply(lev_dist_row, axis=1) != 'delete'
    return df.loc[keep.to_numpy()]


In [18]:
# Run the full pipeline; main() prompts for search engine and depth via
# get_user_input(). Note that this rebinds `df`, which earlier cells used
# for the tree table.
df = main()

2022-07-08 17:50:38,026 | 2156 | INFO | rai_app.suggests.logger | google | christian eriksen 
2022-07-08 17:50:38,550 | 2156 | INFO | rai_app.suggests.logger | google | christian eriksen 
2022-07-08 17:50:39,114 | 2156 | INFO | rai_app.suggests.logger | google | christian eriksen 
2022-07-08 17:50:39,512 | 2156 | INFO | rai_app.suggests.logger | google | alec baldwin 
2022-07-08 17:50:40,304 | 2156 | INFO | rai_app.suggests.logger | google | alec baldwin 
2022-07-08 17:50:40,871 | 2156 | INFO | rai_app.suggests.logger | google | alec baldwin 
2022-07-08 17:50:41,466 | 2156 | INFO | rai_app.suggests.logger | google | gil ofarim 
2022-07-08 17:50:42,234 | 2156 | INFO | rai_app.suggests.logger | google | gil ofarim 
2022-07-08 17:50:42,724 | 2156 | INFO | rai_app.suggests.logger | google | gil ofarim 
2022-07-08 17:50:43,370 | 2156 | INFO | rai_app.suggests.logger | google | armin laschet 
2022-07-08 17:50:44,011 | 2156 | INFO | rai_app.suggests.logger | google | armin laschet 
2022-07-08

In [21]:
df

Unnamed: 0,root,edge,source,target,rank,depth,search_engine,datetime,grandparent,parent,source_add,target_add
0,christian eriksen,"(christian eriksen, christian eriksen aktuell)",christian eriksen,christian eriksen aktuell,1,0,google,2022-07-08 15:50:37.480972,,,christian eriksen,aktuell
1,christian eriksen,"(christian eriksen, christian eriksen alter)",christian eriksen,christian eriksen alter,2,0,google,2022-07-08 15:50:37.480972,,,christian eriksen,alter
2,christian eriksen,"(christian eriksen, christian eriksen arzt)",christian eriksen,christian eriksen arzt,3,0,google,2022-07-08 15:50:37.480972,,,christian eriksen,arzt
3,christian eriksen,"(christian eriksen, christian eriksen aktuelle...",christian eriksen,christian eriksen aktuelle teams,4,0,google,2022-07-08 15:50:37.480972,,,christian eriksen,aktuelle teams
4,christian eriksen,"(christian eriksen, christian eriksen agent)",christian eriksen,christian eriksen agent,5,0,google,2022-07-08 15:50:37.480972,,,christian eriksen,agent
...,...,...,...,...,...,...,...,...,...,...,...,...
15,florian weiss,"(florian weiss, florian wess botox)",florian weiss,florian wess botox,16,0,google,2022-07-08 15:52:07.490813,,,florian weiss,wess botox
16,florian weiss,"(florian weiss, florian weiss bridge)",florian weiss,florian weiss bridge,17,0,google,2022-07-08 15:52:07.490813,,,florian weiss,bridge
17,florian weiss,"(florian weiss, florian weiss besserwisser)",florian weiss,florian weiss besserwisser,18,0,google,2022-07-08 15:52:07.490813,,,florian weiss,besserwisser
18,florian weiss,"(florian weiss, florian weiss betonbohren)",florian weiss,florian weiss betonbohren,19,0,google,2022-07-08 15:52:07.490813,,,florian weiss,betonbohren


In [53]:
# NOTE(review): relies on df1 existing in the kernel and on its 'target' column
# having already been rewritten by the (now commented-out) line below;
# otherwise no row carries the marker 'delete' and nothing is filtered.
#df1['target'] = df1.apply(lev_dist_row, axis=1)
df1.loc[df1.target!='delete']

Unnamed: 0,root,edge,source,target,rank,depth,search_engine,datetime,grandparent,parent,source_add,target_add
0,christian eriksen,"(christian eriksen, christian eriksen aktuell)",christian eriksen,christian eriksen aktuell,1,0,google,2022-07-08 14:15:59.326792,,,christian eriksen,aktuell
1,christian eriksen,"(christian eriksen, christian eriksen alter)",christian eriksen,christian eriksen alter,2,0,google,2022-07-08 14:15:59.326792,,,christian eriksen,alter
2,christian eriksen,"(christian eriksen, christian eriksen arzt)",christian eriksen,christian eriksen arzt,3,0,google,2022-07-08 14:15:59.326792,,,christian eriksen,arzt
3,christian eriksen,"(christian eriksen, christian eriksen aktuelle...",christian eriksen,christian eriksen aktuelle teams,4,0,google,2022-07-08 14:15:59.326792,,,christian eriksen,aktuelle teams
4,christian eriksen,"(christian eriksen, christian eriksen agent)",christian eriksen,christian eriksen agent,5,0,google,2022-07-08 14:15:59.326792,,,christian eriksen,agent
...,...,...,...,...,...,...,...,...,...,...,...,...
402,florian weiss,"(florian weiss badbergen, florian weiss badber...",florian weiss badbergen,florian weiss badbergen corona,9,1,google,2022-07-08 14:55:25.201143,,florian weiss,badbergen,corona
403,florian weiss,"(florian weiss badbergen, florian weiss badber...",florian weiss badbergen,florian weiss badbergen coaching,10,1,google,2022-07-08 14:55:25.201143,,florian weiss,badbergen,coaching
404,florian weiss,"(florian weiss badbergen, florian weiss badber...",florian weiss badbergen,florian weiss badbergen chef,11,1,google,2022-07-08 14:55:25.201143,,florian weiss,badbergen,chef
405,florian weiss,"(florian weiss badbergen, florian weiss badber...",florian weiss badbergen,florian weiss badbergen corona regeln,12,1,google,2022-07-08 14:55:25.201143,,florian weiss,badbergen,corona regeln
