In [1]:
#Plotly
import dash
import dash_table
import dash_core_components as dcc
import dash_html_components as html
import dash_bootstrap_components as dbc
import dash_daq as daq
from dash.dependencies import Input, Output

import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot, enable_mpl_offline

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

import seaborn as sns
import numpy as np
import pandas as pd


import unidecode
import betacode
import betacode.conv

import string
import re

import gensim.models.word2vec as w2v
import os

#import cltk
#from cltk.corpus.greek.beta_to_unicode import Replacer
#from cltk.corpus.utils.formatter import tonos_oxia_converter

Start with the dataset I orginally provided you with, and scale the features, and write the scaled_features version to csv

In [2]:
df = pd.read_csv('/users/nicklist/Desktop/words2_10_copy.csv')

def converter_no_accents(i):
    unaccented_string = unidecode.unidecode(i)
    word = betacode.conv.uni_to_beta(unaccented_string)
    return word

def converter_accents(i):
    word = betacode.conv.uni_to_beta(i)
    return word

df['beta_no_accents'] = df['word'].apply(converter_no_accents)
df['beta_accents'] = df['word'].apply(converter_accents)

X = df.sort_values('beta_no_accents')
X.head(20)


Unnamed: 0,word,x,y,beta_no_accents,beta_accents
16103,’’,2.529501,0.84152,'',''
18184,’’’,1.030001,1.001979,''','''
15845,’ἀλλ’,-0.031513,1.235881,'all','a)ll'
17516,’εν,0.570085,-0.872183,'en,'en
10427,*,0.105325,-0.45526,*,*
15589,**,0.870172,1.488369,**,**
16416,Ἀαρὼν,-0.203836,-0.066657,Aaron,*)aarw\n
12274,Ααρων,0.682403,-4.700191,Aaron,*aarwn
16442,Ἀαρῶνος,-3.650787,0.520087,Aaronos,*)aarw=nos
5603,Ἄβαι,0.611824,-1.146564,Abai,*)/abai


In [3]:
#scale x y colomns 
scaled_features = X.copy()
col_names = ['x', 'y']
features = scaled_features[col_names]
scaler = StandardScaler().fit(features.values)
features = scaler.transform(features.values)

X = scaled_features[col_names] = features

In [None]:
scaled_features.to_csv('/users/nicklist/Desktop/scaled_features.csv')

Load the scaled csv version, and extract the Greek word you want to highlight in the scatterplot. I provide the Greek word below ("καρδία"), but this variable will need to be integrated into the dcc.Dropdown.

The filter provides you with the Greek word name and its xy coordinates. You can then use these for the annotation.

The arrow annotation script is taken from https://plotly.com/python/text-and-annotations/ under the section "Styling and Coloring Annotations"

In [4]:
# Compute DBSCAN
db = DBSCAN(eps=0.02, min_samples=3).fit(X) # or X
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
# numpy.set_printoptions(threshold=sys.maxsize)
# print(db.labels_)

Estimated number of clusters: 1318
Estimated number of noise points: 6204


In [5]:
Y = pd.read_csv('/Users/nicklist/Desktop/scaled_features.csv')
greek2vec = w2v.Word2Vec.load(os.path.join("trained2_10", "greek2vec.w2v"))
all_words_x = scaled_features['x']
all_words_y = scaled_features['y']
all_words = scaled_features['word']
tags = scaled_features['beta_no_accents']
accent_tags = scaled_features['beta_accents']
df1 = pd.read_csv('/users/nicklist/Desktop/scaled_features.csv', usecols=['beta_no_accents', 'beta_accents'])
beta_cols = df1.to_dict('records')

####colours for DBSCAN labels####
colourscale = [[0,'rgb(102, 153, 255)'],
               [1,'rgb(102, 153, 255)']]

colourscale1=[[0,"rgb(211,211,211)"],#grey
           [0.2, "rgb(34,139,34)"],#green
            [0.4, "rgb(186,85,211)"],#medium orchid (purple)
            [0.6, "rgb(255,215,0)"],#gold
            [0.8, "rgb(255,69,0)"],#orange red
            [1, "rgb(139,69,19)"]]#saddle brown

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

app = dash.Dash(__name__, external_stylesheets=external_stylesheets)    

markdown_text = '''List, N. G. *A Vector Space Model for Koine Greek*. M.A. Thesis. University of Otago, 2021.'''

####creates placeholder scatterplot when app is first launched####
initial_fig = fig=go.Figure(
                go.Scatter(
                    x=all_words_x,
                    y=all_words_y,
                    mode = 'markers',
                    hovertext = all_words, 
                    marker= dict(size=4, color=db.labels_, colorscale= colourscale)
                )
            )

initial_fig.update_layout(title= 'Koine Greek DBSCAN', height=800, #width=1200, 
                          template = 'simple_white')
    
b = dcc.Graph(
    figure = initial_fig,
    id='initial-graph'
)

####generates an html table from dataframe####
def generate_table(dataframe, max_rows=10):
    return html.Table([
        html.Thead(
            html.Tr([html.Th(col) for col in dataframe.columns])
        ),
        html.Tbody([
            html.Tr([
                html.Td(dataframe.iloc[i][col]) for col in dataframe.columns
            ]) for i in range(min(len(dataframe), max_rows))
        ])
    ])

####computes the cosine similarity for placeholder table####
def cosine_sim(greek):
    greek2vec = w2v.Word2Vec.load(os.path.join("trained2_10", "greek2vec.w2v"))
    result = greek2vec.wv.most_similar(greek)
    df4 = pd.DataFrame(result)
    df4.columns = ['Word', 'Cosine Value']    
    return [generate_table(df4)]
initial_table = cosine_sim('Ααρων')

####Dash app page layout####
app.layout = html.Div(
    children=[
        html.H1(id='title', children='A Vector Space Model for Koine Greek', className = 'tweleve columns'),
        html.Div([
            dcc.Markdown(children=markdown_text, className = 'twelve columns')
                 ], className = 'row'),
        #html.Img(src= "http://www.perseus.tufts.edu/img/keyCaps.gif"),
        #html.Div([
        #dbc.Button("Words", id="btn_words", n_clicks=0, color="info", outline=True, className="mr-1"),
        #dbc.Button("Points", id="btn_points", n_clicks=0, color="info", outline=True, className="mr-1"),
        html.Div([
            html.Div([
                dcc.Dropdown(
                    id='drop_beta',
                    options=[],
                    value= '',
                    placeholder = "Type Beta Code here...",
                    style={'width': '50%'},
                    persistence_type = 'session'
                ),
            ], className = 'nine columns'),
        ], className = 'row'),
        html.Br(),
        html.Div([
            html.Div([
                daq.ToggleSwitch(
                    id='switch_accents',
                    color='grey',
                    label='Beta Accents',
                    labelPosition='right',
                    value = False, #<== set to without accents
                    persistence_type = 'session',
                ),
            ], className = 'two columns'),
        ], className = 'row'),
        html.Br(),
        html.Div([   
            html.Div([
                dcc.Dropdown(
                    id='dropdown_value',
                    options=[{'label': i,'value': i} for i in all_words],
                    value= 'Ααρων',
                    multi = False, ##select multiple options in dropdown
                    searchable = True,
                    placeholder = "Or select a lexeme...",
                    style={'width': '50%'},
                    persistence_type = 'session'
                ),
            ], className = 'nine columns'),
        ], className = 'row'),
        html.Br(),
        html.Div([    
            html.Div([
                daq.ToggleSwitch(
                    id='switch_points_words',
                    color='grey',
                    label='View Words',
                    labelPosition='right',
                    value = False,
                    persistence_type = 'session'
                ),
            ], className = 'two columns'),
        ], className = 'row'),
        html.Br(),
        html.Div([
            html.Div(id='graph', children=b, className = 'eight columns'),
            html.Div([
                html.H4(id='cosine_label', children='Ααρων'),
                html.Div(id='cosine_output', children= initial_table)
            ], className = 'four columns'),
        ], className = 'row')
])

####update the dropdown contents on drop_beta based on boolean value of switch_value######
@app.callback(
    Output("drop_beta", "options"),
    [Input('switch_accents', 'value')]
)
def update_drop_beta(switch_value):
    if switch_value==True: #==with accents
        updated_options = [{'label': i["beta_accents"],'value': i["beta_accents"]} for i in beta_cols]
    elif switch_value==False: #==without accents
        updated_options = [{'label': i["beta_no_accents"],'value': i["beta_no_accents"]} for i in beta_cols]
    return updated_options

####update the dropdown_value value, taking the Greek word in the same row of the dataframe, based on
####the boolean value of switch_value######
@app.callback(
    Output("dropdown_value", "value"),
    [Input("drop_beta", "value"),
    Input('switch_accents', 'value')]
)
def update_tag(input_word, switch_value):
    if switch_value == False: #==without accents
        N = pd.read_csv('/Users/nicklist/Desktop/scaled_features.csv')
        greek = N.loc[N['beta_no_accents']==input_word]['word'].iloc[0]
    elif switch_value == True: #==with accents
        Y = pd.read_csv('/Users/nicklist/Desktop/scaled_features.csv')
        greek = Y.loc[Y['beta_accents']==input_word]['word'].iloc[0]
    return greek

####generate the cosine similarity table######    
@app.callback(
    Output("cosine_output", "children"),
    [Input("dropdown_value", "value")]
)
def cosine_sim(greek):
    result = greek2vec.wv.most_similar(greek)
    df2 = pd.DataFrame(result)
    df2.columns = ['Word', 'Cosine Value']       
    return [generate_table(df2)]

####update the title of the table######
@app.callback(
    Output("cosine_label", "children"),
    [Input("dropdown_value", "value")]
)
def update_label(greek):
    return greek

####Generate the Scatterplot, based on the dropdown_value, and the boolean value of switch_points_words###
@app.callback(Output('graph', 'children'),
              [Input('switch_points_words', 'value'),
              Input('dropdown_value', 'value')])
def update_switch_and_word(switch_points_words, selected_word):
    #changed_id = [p['prop_id'] for p in dash.callback_context.triggered][0]
    if switch_points_words == True:
        all_words_x = scaled_features['x']
        all_words_y = scaled_features['y']
        all_words = scaled_features['word'] 
        listform = []
        for w in selected_word:
            if w not in listform:
                listform.append(selected_word)
        Y = pd.read_csv('/Users/nicklist/Desktop/scaled_features.csv')
        Y['word'] = Y.word.str.extract('({0})'.format('|'.join(listform)), flags=re.IGNORECASE)
        word = Y[~pd.isna(Y.word)]
        keyword_x = word.iloc[-1]['x']
        keyword_y = word.iloc[-1]['y']
        keyword = word.iloc[-1]['word']
        
        tog_fig=go.Figure(
                go.Scatter(
                    x=all_words_x,
                    y=all_words_y,
                    mode = 'markers+text',
                    text = all_words,
                    textposition = "bottom right",
                    marker= dict(size=10, color=db.labels_, colorscale= colourscale)  
                ),
            )
        tog_fig.add_annotation(
                x= keyword_x,
                y= keyword_y,
                xref="x",
                yref="y",
                text= selected_word,
                showarrow=True,
                font=dict(
                    family="Courier New, monospace",
                    size=16,
                    color="#ffffff"
                    ),
                align="center",
                arrowhead=2,
                arrowsize=1,
                arrowwidth=2,
                arrowcolor="#636363",
                ax=20,
                ay=-30,
                bordercolor="#c7c7c7",
                borderwidth=2,
                borderpad=4,
                bgcolor="#ff7f0e",
                opacity=0.8
                )
        tog_fig.update_layout(title= 'Koine Greek DBSCAN', height=800, width=1200, 
                              template = 'simple_white')
        g = dcc.Graph(
            figure = tog_fig,
            id='graph1'
        )  
    elif switch_points_words == False:
        all_words_x = scaled_features['x']
        all_words_y = scaled_features['y']
        all_words = scaled_features['word'] 
        
        listform = []
        for w in selected_word:
            if w not in listform:
                listform.append(selected_word)
        Y = pd.read_csv('/Users/nicklist/Desktop/scaled_features.csv')
        Y['word'] = Y.word.str.extract('({0})'.format('|'.join(listform)), flags=re.IGNORECASE)
        word = Y[~pd.isna(Y.word)]
        keyword_x = word.iloc[-1]['x']
        keyword_y = word.iloc[-1]['y']
        keyword = word.iloc[-1]['word']
        
        fig=go.Figure(
                go.Scatter(
                    x=all_words_x,
                    y=all_words_y,
                    mode = 'markers',
                    hovertext = all_words,
                    marker= dict(size=4, color=db.labels_, colorscale= colourscale)  
                ),
            )
        fig.add_annotation(
                x= keyword_x,
                y= keyword_y,
                xref="x",
                yref="y",
                text= selected_word,
                showarrow=True,
                font=dict(
                    family="Courier New, monospace",
                    size=16,
                    color="#ffffff"
                    ),
                align="center",
                arrowhead=2,
                arrowsize=1,
                arrowwidth=2,
                arrowcolor="#636363",
                ax=20,
                ay=-30,
                bordercolor="#c7c7c7",
                borderwidth=2,
                borderpad=4,
                bgcolor="#ff7f0e",
                opacity=0.8
                )
        fig.update_layout(title= 'Koine Greek DBSCAN', height=800, #width=1200, 
                          template = 'simple_white')
        g = dcc.Graph(
            figure = fig,
            id='graph2'
        )
    return g


In [6]:
if __name__ == '__main__':
    app.run_server(port=4050)

Dash is running on http://127.0.0.1:4050/

 in production, use a production WSGI server like gunicorn instead.

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:4050/ (Press CTRL+C to quit)
127.0.0.1 - - [15/Oct/2020 14:32:29] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [15/Oct/2020 14:32:31] "[37mGET /_dash-dependencies HTTP/1.1[0m" 200 -
127.0.0.1 - - [15/Oct/2020 14:32:31] "[37mGET /_dash-layout HTTP/1.1[0m" 200 -
127.0.0.1 - - [15/Oct/2020 14:32:31] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


Exception on /_dash-update-component [POST]
Traceback (most recent call last):
  File "/Users/nicklist/venv/lib/python3.7/site-packages/flask/app.py", line 2447, in wsgi_app
    response = self.full_dispatch_request()
  File "/Users/nicklist/venv/lib/python3.7/site-packages/flask/app.py", line 1952, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/Users/nicklist/venv/lib/python3.7/site-packages/flask/app.py", line 1821, in handle_user_exception
    reraise(exc_type, exc_value, tb)
  File "/Users/nicklist/venv/lib/python3.7/site-packages/flask/_compat.py", line 39, in reraise
    raise value
  File "/Users/nicklist/venv/lib/python3.7/site-packages/flask/app.py", line 1950, in full_dispatch_request
    rv = self.dispatch_request()
  File "/Users/nicklist/venv/lib/python3.7/site-packages/flask/app.py", line 1936, in dispatch_request
    return self.view_functions[rule.endpoint](**req.view_args)
  File "/Users/nicklist/venv/lib/python3.7/site-packages/dash/dash.py",

127.0.0.1 - - [15/Oct/2020 14:32:31] "[35m[1mPOST /_dash-update-component HTTP/1.1[0m" 500 -
127.0.0.1 - - [15/Oct/2020 14:32:33] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [15/Oct/2020 14:32:49] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [15/Oct/2020 14:32:49] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [15/Oct/2020 14:32:49] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [15/Oct/2020 14:32:50] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


## Select Specific Words

In [None]:
# example of words from James

xml_files = glob('/Users/admin/Desktop/James.xml')
replacer = Replacer()
corpus = []
for xml in xml_files:
    with open(xml, 'r') as x:
        tree = parse(x)
        root = tree.getroot()
        for sentence in root.iter('sentence'):
            for word in sentence.iter('word'):
                for lemma in word.iter('lemma'):
                    entry = lemma.get('entry')
                    if entry is None:
                        entry = replacer.beta_code(word.get('form'))
                        if entry not in corpus:
                            corpus.append(entry)
                    elif tonos_oxia_converter(entry) not in new_list:
                        if entry not in corpus:
                            corpus.append(entry)
    x.close()

with open('/Users/admin/Desktop/James_output.txt', 'w') as f:
    f.write('\n'.join(str(c) for c in corpus))
f.close()

print(len(corpus))
#print(corpus)

In [None]:
#convert modern Greeek tonos to Ancient Greek oxia
import string

with open("/Users/nicklist/Desktop/LN_family.txt", 'r') as fin:
    contents = fin.read()
     
with open("/Users/nicklist/Desktop/new_LN_family.txt", 'w') as fout:
    newcontents = contents.translate(str.maketrans("άέήίόύώ", "άέήίόύώ"))
    fout.write(newcontents)


In [None]:
import re
f = open('/Users/nicklist/Desktop/new_LN_family.txt', 'r')
data1 = re.findall(r"\S+", f.read())
print(data1)

In [None]:
Y = pd.read_csv('/users/nicklist/Desktop/words2_10_copy.csv')


def converter_no_accents(i):
    unaccented_string = unidecode.unidecode(i)
    word = betacode.conv.uni_to_beta(unaccented_string)
    return word

def converter_accents(i):
    word = betacode.conv.uni_to_beta(i)
    return word

Y['beta_no_accents'] = Y['word'].apply(converter_no_accents)
Y['beta_accents'] = Y['word'].apply(converter_accents)

#df3 = df3.sort_values('beta_no_accents')
#df3.head(20)


In [None]:
import re
Y = pd.read_csv('/users/nicklist/Desktop/words2_10_copy.csv')
Y['beta_no_accents'] = Y['word'].apply(converter_no_accents)
Y['beta_accents'] = Y['word'].apply(converter_accents)

f = open('/Users/nicklist/Desktop/new_LN_family.txt', 'r')
data1 = re.findall(r"\S+", f.read())

result = Y.loc[Y['word'].isin(data1)]

print(result)

In [None]:
#scale x y colomns 
scaled_features2 = result.copy()
col_names = ['x', 'y']
features = scaled_features1[col_names]
scaler = StandardScaler().fit(features.values)
features = scaler.transform(features.values)

In [None]:
Y = scaled_features2[col_names] = features
scaled_features2 = pd.DataFrame(scaled_features1)
print(scaled_features2)

In [None]:
scaled_features2.to_csv('/users/nicklist/Desktop/scaled_features2.csv')

In [None]:
# Compute DBSCAN
db = DBSCAN(eps=0.25, min_samples=3).fit(Y)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
# numpy.set_printoptions(threshold=sys.maxsize)
# print(db.labels_)

In [None]:
Y = pd.read_csv('/Users/nicklist/Desktop/scaled_features2.csv')
greek2vec = w2v.Word2Vec.load(os.path.join("trained2_10", "greek2vec.w2v"))

all_words_x = scaled_features2['x']
all_words_y = scaled_features2['y']
all_words = scaled_features2['word']

tags = scaled_features2['beta_no_accents']
accent_tags = scaled_features2['beta_accents']
df1 = pd.read_csv('/users/nicklist/Desktop/scaled_features2.csv', usecols=['beta_no_accents', 'beta_accents'])
beta_cols = df1.to_dict('records')

####colours for DBSCAN labels####
colourscale=[[0,"rgb(211,211,211)"],#grey
           [0.2, "rgb(34,139,34)"],#green
            [0.4, "rgb(186,85,211)"],#medium orchid (purple)
            [0.6, "rgb(255,215,0)"],#gold
            [0.8, "rgb(255,69,0)"],#orange red
            [1, "rgb(139,69,19)"]]#saddle brown

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

app = dash.Dash(__name__, external_stylesheets=external_stylesheets)    

markdown_text = '''List, N. G. *A Vector Space Model for Koine Greek*. M.A. Thesis. University of Otago, 2021.'''

####creates placeholder scatterplot when app is first launched####
initial_fig = fig=go.Figure(
                go.Scatter(
                    x=all_words_x,
                    y=all_words_y,
                    mode = 'markers',
                    hovertext = all_words, 
                    marker= dict(size=4, color=db.labels_, colorscale= colourscale)
                )
            )

initial_fig.update_layout(title= 'Koine Greek DBSCAN', height=800, #width=1200, 
                          template = 'simple_white')
    
b = dcc.Graph(
    figure = initial_fig,
    id='initial-graph'
)

####generates an html table from dataframe####
def generate_table(dataframe, max_rows=10):
    return html.Table([
        html.Thead(
            html.Tr([html.Th(col) for col in dataframe.columns])
        ),
        html.Tbody([
            html.Tr([
                html.Td(dataframe.iloc[i][col]) for col in dataframe.columns
            ]) for i in range(min(len(dataframe), max_rows))
        ])
    ])

####computes the cosine similarity for placeholder table####
def cosine_sim(greek):
    greek2vec = w2v.Word2Vec.load(os.path.join("trained2_10", "greek2vec.w2v"))
    result = greek2vec.wv.most_similar(greek)
    df4 = pd.DataFrame(result)
    df4.columns = ['Word', 'Cosine Value']    
    return [generate_table(df4)]
initial_table = cosine_sim('Ααρων')

####Dash app page layout####
app.layout = html.Div(
    children=[
        html.H1(id='title', children='A Vector Space Model for Koine Greek', className = 'tweleve columns'),
        html.Div([
            dcc.Markdown(children=markdown_text, className = 'twelve columns')
                 ], className = 'row'),
        #html.Img(src= "http://www.perseus.tufts.edu/img/keyCaps.gif"),
        #html.Div([
        #dbc.Button("Words", id="btn_words", n_clicks=0, color="info", outline=True, className="mr-1"),
        #dbc.Button("Points", id="btn_points", n_clicks=0, color="info", outline=True, className="mr-1"),
        html.Div([
            html.Div([
                dcc.Dropdown(
                    id='drop_beta',
                    options=[],
                    value= '',
                    placeholder = "Type Beta Code here...",
                    style={'width': '50%'},
                    persistence_type = 'session'
                ),
            ], className = 'nine columns'),
        ], className = 'row'),
        html.Br(),
        html.Div([
            html.Div([
                daq.ToggleSwitch(
                    id='switch_accents',
                    color='grey',
                    label='Beta Accents',
                    labelPosition='right',
                    value = False, #<== set to without accents
                    persistence_type = 'session',
                ),
            ], className = 'two columns'),
        ], className = 'row'),
        html.Br(),
        html.Div([   
            html.Div([
                dcc.Dropdown(
                    id='dropdown_value',
                    options=[{'label': i,'value': i} for i in all_words],
                    value= 'Cosine',
                    multi = False, ##select multiple options in dropdown
                    searchable = True,
                    placeholder = "Or select a lexeme...",
                    style={'width': '50%'},
                    persistence_type = 'session'
                ),
            ], className = 'nine columns'),
        ], className = 'row'),
        html.Br(),
        html.Div([    
            html.Div([
                daq.ToggleSwitch(
                    id='switch_points_words',
                    color='grey',
                    label='View Words',
                    labelPosition='right',
                    value = False,
                    persistence_type = 'session'
                ),
            ], className = 'two columns'),
        ], className = 'row'),
        html.Br(),
        html.Div([
            html.Div(id='graph', children=b, className = 'eight columns'),
            html.Div([
                html.H4(id='cosine_label', children='Cosine'),
                html.Div(id='cosine_output', children= initial_table)
            ], className = 'four columns'),
        ], className = 'row')
])

####update the dropdown contents on drop_beta based on boolean value of switch_value######
@app.callback(
    Output("drop_beta", "options"),
    [Input('switch_accents', 'value')]
)
def update_drop_beta(switch_value):
    if switch_value==True: #==with accents
        updated_options = [{'label': i["beta_accents"],'value': i["beta_accents"]} for i in beta_cols]
    elif switch_value==False: #==without accents
        updated_options = [{'label': i["beta_no_accents"],'value': i["beta_no_accents"]} for i in beta_cols]
    return updated_options

####update the dropdown_value value, taking the Greek word in the same row of the dataframe, based on
####the boolean value of switch_value######
@app.callback(
    Output("dropdown_value", "value"),
    [Input("drop_beta", "value"),
    Input('switch_accents', 'value')]
)
def update_tag(input_word, switch_value):
    if switch_value == False: #==without accents
        N = pd.read_csv('/Users/nicklist/Desktop/scaled_features2.csv')
        greek = N.loc[N['beta_no_accents']==input_word]['word'].iloc[0]
    elif switch_value == True: #==with accents
        Y = pd.read_csv('/Users/nicklist/Desktop/scaled_features2.csv')
        greek = Y.loc[Y['beta_accents']==input_word]['word'].iloc[0]
    return greek

####generate the cosine similarity table######    
@app.callback(
    Output("cosine_output", "children"),
    [Input("dropdown_value", "value")]
)
def cosine_sim(greek):
    result = greek2vec.wv.most_similar(greek)
    df2 = pd.DataFrame(result)
    df2.columns = ['Word', 'Cosine Value']       
    return [generate_table(df2)]

####update the title of the table######
@app.callback(
    Output("cosine_label", "children"),
    [Input("dropdown_value", "value")]
)
def update_label(greek):
    return greek

####Generate the Scatterplot, based on the dropdown_value, and the boolean value of switch_points_words###
@app.callback(Output('graph', 'children'),
              [Input('switch_points_words', 'value'),
              Input('dropdown_value', 'value')])
def update_switch_and_word(switch_points_words, selected_word):
    #changed_id = [p['prop_id'] for p in dash.callback_context.triggered][0]
    if switch_points_words == True:
        all_words_x = scaled_features2['x']
        all_words_y = scaled_features2['y']
        all_words = scaled_features2['word'] 
        listform = []
        for w in selected_word:
            if w not in listform:
                listform.append(selected_word)
        Y = pd.read_csv('/Users/nicklist/Desktop/scaled_features2.csv')
        Y['word'] = Y.word.str.extract('({0})'.format('|'.join(listform)), flags=re.IGNORECASE)
        word = Y[~pd.isna(Y.word)]
        keyword_x = word.iloc[-1]['x']
        keyword_y = word.iloc[-1]['y']
        keyword = word.iloc[-1]['word']
        
        tog_fig=go.Figure(
                go.Scatter(
                    x=all_words_x,
                    y=all_words_y,
                    mode = 'markers+text',
                    text = all_words,
                    textposition = "bottom right",
                    marker= dict(size=10, color=db.labels_, colorscale= colourscale)  
                ),
            )
        tog_fig.add_annotation(
                x= keyword_x,
                y= keyword_y,
                xref="x",
                yref="y",
                text= selected_word,
                showarrow=True,
                font=dict(
                    family="Courier New, monospace",
                    size=16,
                    color="#ffffff"
                    ),
                align="center",
                arrowhead=2,
                arrowsize=1,
                arrowwidth=2,
                arrowcolor="#636363",
                ax=20,
                ay=-30,
                bordercolor="#c7c7c7",
                borderwidth=2,
                borderpad=4,
                bgcolor="#ff7f0e",
                opacity=0.8
                )
        tog_fig.update_layout(title= 'Koine Greek DBSCAN', height=800, #width=1200, 
                              template = 'simple_white')
        g = dcc.Graph(
            figure = tog_fig,
            id='graph1'
        )  
    elif switch_points_words == False:
        all_words_x = scaled_features2['x']
        all_words_y = scaled_features2['y']
        all_words = scaled_features2['word'] 
        
        listform = []
        for w in selected_word:
            if w not in listform:
                listform.append(selected_word)
        Y = pd.read_csv('/Users/nicklist/Desktop/scaled_features2.csv')
        Y['word'] = Y.word.str.extract('({0})'.format('|'.join(listform)), flags=re.IGNORECASE)
        word = Y[~pd.isna(Y.word)]
        keyword_x = word.iloc[-1]['x']
        keyword_y = word.iloc[-1]['y']
        keyword = word.iloc[-1]['word']
        
        fig=go.Figure(
                go.Scatter(
                    x=all_words_x,
                    y=all_words_y,
                    mode = 'markers',
                    hovertext = all_words,
                    marker= dict(size=4, color=db.labels_, colorscale= colourscale)  
                ),
            )
        fig.add_annotation(
                x= keyword_x,
                y= keyword_y,
                xref="x",
                yref="y",
                text= selected_word,
                showarrow=True,
                font=dict(
                    family="Courier New, monospace",
                    size=16,
                    color="#ffffff"
                    ),
                align="center",
                arrowhead=2,
                arrowsize=1,
                arrowwidth=2,
                arrowcolor="#636363",
                ax=20,
                ay=-30,
                bordercolor="#c7c7c7",
                borderwidth=2,
                borderpad=4,
                bgcolor="#ff7f0e",
                opacity=0.8
                )
        fig.update_layout(title= 'Koine Greek DBSCAN', height=800, #width=1200, 
                          template = 'simple_white')
        g = dcc.Graph(
            figure = fig,
            id='graph2'
        )
    return g


In [None]:
if __name__ == '__main__':
    app.run_server(port=4050)

In [None]:
#def cosine_sim(greek):
    #import cltk
    #from cltk.corpus.utils.formatter import tonos_oxia_converter
    

greek = 'Ααρων'
greek2vec = w2v.Word2Vec.load(os.path.join("trained2_10", "greek2vec.w2v"))
result = greek2vec.wv.most_similar(greek)
#greek_word = tonos_oxia_converter(greek, reverse=True)
df = pd.DataFrame(result)
df.columns = ['Word', 'Cosine Value']
df2 = df.drop(['Cosine Value'], axis=1)
list1 = []
for w in df2:
        if w not in list1:
            list1.append(w)
print(list1)


greek = 'Ααρων'
#cosine_sim(word)

In [None]:
greek = 'γένος'
result = greek2vec.wv.most_similar(greek)
df7 = pd.DataFrame(result)
df7.columns = ['A', 'B']
df7.drop(['B'], axis=1)
print(df7)