In [None]:
import urielplus.urielplus as uriel
#Creates the URIELPlus object; every cell below calls its methods through `u`
u = uriel.URIELPlus()

In [None]:
#CONFIGURATIONS
#Caching defaults to False to save memory. Setting it to True saves updates to the databases or imputation results to files
u.set_cache(True)

#Aggregation defaults to 'U', the union of typological feature data across the sources in URIEL+. Setting it to 'A' averages the typological feature data across sources instead
u.set_aggregation('A')

#fill_with_base_lang controls whether, when aggregating typological feature data, a language's missing data is filled in from its parent language
#Ex: English is the parent language of Hong Kong English, so English data fills in Hong Kong English's missing typological data
#This configuration defaults to True, as that provides the best feature coverage and imputation quality metrics; the call below turns it off
u.set_fill_with_base_lang(False)

#Prints the dictionary of parent languages and dialects used to fill in missing typological data from base languages during aggregation
print(u.get_dialects())

#The distance metric defaults to "angular"; the call below switches it to "cosine". The chosen metric determines how language vectors are compared in language distance calculations
u.set_distance_metric("cosine")

In [None]:
#DATABASES
#Converts language codes from ISO 639-3 codes to Glottocodes
#NOTE(review): later cells pass Glottocode-style arguments such as "stan1293", so this call presumably has to run before them - confirm
u.set_glottocodes()

#Integrates all databases
u.integrate_databases()

#Integrates only the databases named in the arguments
#NOTE(review): GRAMBANK is listed here, is presumably covered by integrate_databases() above, and is integrated again below; confirm whether the three integration calls are meant as alternatives rather than to be run together
u.integrate_custom_databases("GRAMBANK", "BDPROTO", "UPDATED_SAPHON")

#Integrates one specific database
u.integrate_grambank()

In [None]:
#IMPUTATION
#Computes the union or average of feature data across sources in URIEL+
u.aggregate()

#Imputes typological data with the imputation strategy named by `strategy`
u.imputation_interface(strategy="knn")

#Imputes typological data with one specific imputation strategy, called directly
#NOTE(review): this runs a second imputation right after the knn one above; presumably only one of the two is needed in practice - confirm
u.softimpute_imputation()

#NOTE: All imputation methods print the imputation quality metrics

In [None]:
#VECTORS
#Language codes whose vectors are requested
vector_languages = ("stan1293", "stan1290", "nucl1643")

#Fetches the vectors of the given feature type for every listed language, then prints them
featural_vectors = u.get_vector("featural", *vector_languages)
print(featural_vectors)

In [None]:
#DISTANCES
#Language pair compared by every distance call in this cell
distance_pair = ("stan1293", "stan1290")

#Prints every language available for distance calculations of one distance type
syntactic_languages = u.get_available_distance_languages("syntactic")
print(syntactic_languages)

#Prints the distance of the given feature type between all the given languages.
#2 languages give a single number
#3 or more languages give a square matrix of distances
inventory_distance = u.new_distance("inventory", *distance_pair)
print(inventory_distance)

#Prints one specific distance (here the genetic one) between all the given languages
genetic_distance = u.new_genetic_distance(*distance_pair)
print(genetic_distance)

#Prints the distance between all the given languages using only the listed features
selected_features = ["S_SVO", "Indo-European", "P_NASALS"]
print(u.new_custom_distance(selected_features, *distance_pair))

#A single source can be requested when every listed feature is in that source
grambank_features = ["S_ARTICLE_AFTER_NOUN", "M_ASSOC_PLURAL_MARK"]
print(u.new_custom_distance(grambank_features, *distance_pair, source="GRAMBANK"))

In [None]:
#METRICS
#Language pair scored by every confidence call in this cell
scored_pair = ("stan1293", "stan1290")

#Prints the number of languages at the given resource level that have data of the given feature type
high_resource_coverage = u.feature_coverage("high-resource", "syntactic")
print(high_resource_coverage)

#Prints feature coverage for languages of all resource levels and all feature types
overall_coverage = u.all_feature_coverage()
print(overall_coverage)

#Prints the confidence score of a typological distance between the given languages
syntactic_confidence = u.featural_confidence_score(*scored_pair, "syntactic")
print(syntactic_confidence)

#Prints the confidence score of a non-typological distance between the given languages
geographic_confidence = u.non_featural_confidence_score(*scored_pair, "geographic")
print(geographic_confidence)

#Prints the confidence score of the given feature type distance between the given languages
phonological_confidence = u.confidence_score(*scored_pair, "phonological")
print(phonological_confidence)