In [2]:
import os, sys
import pandas as pd
from sklearn.manifold import MDS
from sklearn.cluster import KMeans

# Make the parent of the working directory importable.
# NOTE(review): presumably the notebook lives one level below the repository root
# so that the local techminer2 checkout is picked up — confirm.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
# Guard the append so re-running this cell does not pile up duplicate entries.
if parentdir not in sys.path:
    sys.path.append(parentdir)

from os.path import isfile, join
from techminer2.thesaurus import load_file_as_dict


from techminer2 import *

# Data folder handed to find_abbreviations and the other analysis calls that take
# `directory`.
# NOTE(review): absolute, container-specific path — adjust when running elsewhere.
directory = "/workspaces/techminer2/data/"

def find_abbreviations(
    directory="./",
):
    """
    Find abbreviations and reorder the thesaurus to reflect the search.

    Reads ``keywords.txt`` from ``directory``, takes the text between the first
    pair of parentheses of every thesaurus key as a candidate abbreviation, and
    looks for thesaurus terms that contain that candidate as a literal substring.
    Each candidate with at least one match is printed together with the matching
    terms. A reordered copy of the thesaurus is then written to ``keywords.txt_``
    in the same folder: keys involved in a match come first, followed by the
    remaining keys, each group sorted alphabetically. The original file is not
    modified.

    Args:
        directory: Folder that contains ``keywords.txt``.

    Raises:
        FileNotFoundError: If ``keywords.txt`` does not exist in ``directory``.

    """

    def extract_abbreviation(x):
        """Return the text inside the first "(...)" of ``x``, or None if absent."""
        start = x.find("(")
        if start == -1:
            return None
        end = x.find(")", start + 1)
        if end == -1:
            # Unbalanced parenthesis: slicing up to find(")") == -1 would silently
            # drop the last character of the key.
            return None
        abbreviation = x[start + 1 : end].strip()
        # An empty abbreviation would match every term in the search below.
        return abbreviation or None

    # ----< Load and reverse the thesaurus >------------------------------------------------------
    thesaurus_file = join(directory, "keywords.txt")
    if not isfile(thesaurus_file):
        raise FileNotFoundError("The file {} does not exist.".format(thesaurus_file))
    th = load_file_as_dict(thesaurus_file)
    # term -> key; if a term is listed under several keys the last one wins.
    reversed_th = {value: key for key, values in th.items() for value in values}

    # ----< search for abbreviations >-------------------------------------------------------------
    df = pd.DataFrame(
        {
            "text": list(reversed_th.keys()),
            "key": list(reversed_th.values()),
        }
    )
    df["abbreviation"] = df["key"].map(extract_abbreviation)

    # ----< filter by each abbreviation >----------------------------------------------------------
    abbreviations = df["abbreviation"].dropna().drop_duplicates()
    results = {}
    for abbreviation in abbreviations.to_list():
        # regex=False: abbreviations are literal text, so characters such as "+",
        # "." or "[" must not be interpreted as (possibly invalid) regular
        # expressions. na=False keeps the mask boolean for non-string terms.
        matches = df["text"].str.contains(abbreviation, regex=False, na=False)
        keywords = df[matches]
        if len(keywords) > 0:
            results[abbreviation] = keywords

            print(abbreviation)
            for text in keywords["text"].to_list():
                print("    ", text)

    # ----< remove found keywords >-----------------------------------------------------------------
    keys = {key for keywords in results.values() for key in keywords["key"].to_list()}
    # Move the matched keys out of `th` so each key is written exactly once.
    findings = {key: th.pop(key) for key in sorted(keys)}

    with open(thesaurus_file + "_", "w", encoding="utf-8") as file:
        # Matched keys first, then everything else; both groups alphabetically.
        for group in (findings, th):
            for key in sorted(group.keys()):
                file.write(key + "\n")
                for item in group[key]:
                    file.write("    " + item + "\n")


# Run the search on the configured data folder: matches are printed as cell output
# and the reordered thesaurus is written to "keywords.txt_" next to the original.
find_abbreviations(directory)

bop
     base of pyramid (bop)
     bop consumers
     bop contexts
     bop entrepreneurs
     bop entrepreneurship
     bop inhabitants
language
     computer language python
     english (language)
     english language
     natural language processing
micro-operating mechanism
     mom(micro-operating mechanism)
smes
     fintech smes
     indian fintech smes
     small and medium-sized enterprises (smes)
     soe smes
sri
     socially responsible investing (sri)
     sri lanka
soes
     state-owned enterprises (soes)
utaut
     unified theory of acceptance and use technology (utaut)
     utaut


In [None]:
# NOTE(review): clean_keywords and thematic_map_communities are not defined in this
# notebook; presumably they come from `from techminer2 import *` — confirm.
# Both calls are pointed at the data folder held in `directory`.
clean_keywords(directory)
thematic_map_communities('author_keywords', min_occ=10, directory=directory)

In [None]:
# NOTE(review): most_global_cited_documents is not defined in this notebook;
# presumably it comes from `from techminer2 import *` — confirm.
most_global_cited_documents(directory=directory)

In [None]:
import pandas as pd

# Build a sorted, de-duplicated, lower-cased list of the ";"-separated entries
# stored in the `nlp_phrases` column and save it to "nlp.txt".
nlp = (
    pd.read_csv("/workspaces/techminer2/data/documents.csv")["nlp_phrases"]
    .str.split(";")
    .explode()
    .str.strip()
    .str.lower()
    .dropna()
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
)
nlp.to_csv("nlp.txt", index=False)


In [None]:
# Peek at the first lines of the raw CSV (IPython shell escape; needs a `head`
# command on the host).
!head /workspaces/techminer2/data/raw-documents.csv

In [None]:
# NOTE(review): `documents` is not assigned in any cell visible here — this only
# works if another cell (or a star import) left it in the kernel; confirm it
# survives Restart & Run All.
documents.raw_author_keywords

In [None]:
from techminer import *
# NOTE(review): this cell switches to the `techminer` package and a different data
# folder ("techminer-api"), and rebinds the `directory` name used by the earlier
# cells — confirm this is intentional.
directory = "/workspaces/techminer-api/data/"
collaboration_indicators("countries", directory=directory).head()

In [None]:
import pandas as pd

# Toy long-format table: one row per (row label `a`, column label `b`, value `v`).
m = pd.DataFrame(
    [
        (1, "A", 1),
        (1, "B", 2),
        (1, "D", 3),
        (2, "A", 4),
        (2, "B", 5),
        (2, "C", 6),
    ],
    columns=["a", "b", "v"],
)
# Reshape to wide format; (a, b) pairs that are absent from `m` show up as NaN.
m.pivot(index="a", columns="b", values="v")

In [None]:
import pandas as pd

# Only the header row is needed to list the column names, so skip parsing the data
# rows (nrows=0) instead of loading the whole CSV.
# NOTE(review): the relative path resolves against the kernel's working directory,
# unlike the absolute paths used in other cells — confirm which file is meant.
sorted(pd.read_csv("documents.csv", nrows=0).columns)