### Mine useful terms from different clusters
### Datasets are downloadable from  https://mars.cyverse.org/data_dumps/GEOME.txt.zip

#### Replace GEOME in the URL with another collection name (OPENCONTEXT, SMITHSONIAN, SESAR) to get that collection's data set.

#### Download, unzip, and put the .txt file in the appropriate folder according to the jupyter file


#### Author: Hong Cui



### Obtain clusters and their informative terms 

In [None]:
import sys
import re
import string
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import fasttext # pip install fasttext-0.9.2-cp310-cp310-win_amd64.whl
import pickle
from joblib import Parallel, delayed
from sklearn.cluster import Birch
import fastcluster
from scipy.cluster.hierarchy import fcluster
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize


from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last
# one; later cells put several bare expressions (e.g. value_counts results)
# in a single cell and rely on this to display them all.
InteractiveShell.ast_node_interactivity = "all"

# Raise the pandas row-display limit so long frequency tables are not truncated.
pd.options.display.max_rows = 4000

import multiprocessing

# Number of joblib workers: leave one core free, but never go below one.
# max() is required here -- min(1, n - 1) would cap the pool at a single
# worker and evaluate to 0 on a one-core machine, which joblib rejects.
cores = max(1, multiprocessing.cpu_count() - 1)

In [None]:
# srcs = {
#         "GEOME": ['record_colloquialName', 'record_scientificName','record_kingdom', 'record_phylum',
#                   'record_subPhylum', 'record_order', 'record_infraOrder', 'record_superOrder',
#                   'record_subOrder', 'record_class', 'record_subClass', 'record_infraClass', 'record_tribe', 
#                   'record_subTribe', 'record_superFamily', 'record_family', 'record_subFamily', 'record_genus',
#                   'record_subGenus', 'record_specificEpithet', 'record_infraspecificEpithet', 'record_taxonRemarks', 
#                   'record_taxonRank', 'record_sex', 'record_preservative', 'record_fixative', 'record_relaxant', 
#                   'parent_fieldNotes','parent_locality', 'parent_verbatimLocality', 'parent_habitat', 
#                   'parent_microHabitat', 'record_morphospeciesDescription', 'record_lifeStage'], 
#         "OPENCONTEXT":['Temporal Coverage_label', 
#              'Has taxonomic identifier_label','Has anatomical identification_label',
#              'Consists of_label',  'Has type_label', 'item category', 'context label'], 
#         "SMITHSONIAN":[ 'scientificName', 'higherClassification', 'kingdom', 'phylum', 'class', 'order', 
#                        'family', 'genus', 'subgenus', 'specificEpithet', 'infraspecificEpithet', 'sex',
#                        'lifeStage', 'habitat', 'preparations', 'waterBody', 'occurrenceRemarks','locality'], 
#         "SESAR":['description_supplementMetadata_geologicalAge','description_collectionMethod', 
#                  'description_material',  'description_sampleType', 
#                  'description_supplementMetadata_classificationComment', 'description_description', 
#                  'description_supplementMetadata_purpose', 'description_collectionMethodDescr',
#                  'description_supplementMetadata_primaryLocationType', 
#                  'description_supplementMetadata_geologicalUnit', 'description_supplementMetadata_locality',
#                  'description_supplementMetadata_localityDescription', 'description_supplementMetadata_fieldName']
#        }

# srcs = {
#         "GEOME": ['record_scientificName', 'parent_locality'], 
#         "OPENCONTEXT":['Has taxonomic identifier_label','Has anatomical identification_label',
#              'Consists of_label',  'Has type_label', 'item category'], 
#         "SMITHSONIAN":[ 'scientificName', 'higherClassification', 'waterBody', 'locality'], 
#         "SESAR":['description_material',  'description_sampleType', 
#                  'description_supplementMetadata_purpose', 
#                  'description_supplementMetadata_primaryLocationType', 
#                  'description_supplementMetadata_geologicalUnit', 'description_supplementMetadata_locality',
#                  'description_supplementMetadata_fieldName']
#        }

# Active configuration: for each collection, the columns whose text getData
# concatenates into one description per record. Commented-out entries are
# alternative selections kept for reference.
srcs = {
        "GEOME": ['parent_habitat', 'parent_microHabitat'], 
#         "OPENCONTEXT": ['Consists of_label']     
#"OPENCONTEXT":['Consists of_label',  'Has type_label'] 
         "SMITHSONIAN":[ 'habitat'], 
#         "SESAR":['description_material',  'description_sampleType', 
#                  'description_supplementMetadata_purpose', 
#                  'description_supplementMetadata_primaryLocationType', 
#                  'description_supplementMetadata_fieldName']
       }

# srcs = {
#         "GEOME": ['record_scientificName', 'parent_locality'], 
#         "OPENCONTEXT":['Has taxonomic identifier_label','Has anatomical identification_label',
#              'Consists of_label',  'Has type_label', 'item category'], 
#         "SMITHSONIAN":[ 'scientificName', 'higherClassification', 'waterBody', 'locality'], 
#         "SESAR":['description_material',  'description_sampleType', 
#                  'description_supplementMetadata_purpose', 
#                  'description_supplementMetadata_primaryLocationType', 
#                  'description_supplementMetadata_fieldName']
#       }

# Name of the record-identifier column in each collection; getData looks up
# every processed collection here.
srcids = {
        "GEOME": 'record_bcid', 
        #"OPENCONTEXT": 'citation uri', 
        "SMITHSONIAN":'occurrenceID', 
        #"SESAR":'igsn'
       }

#srcnames = ["GEOME", "OPENCONTEXT", "SMITHSONIAN", "SESAR"]

# Collections processed in this run; each is passed to getData.
srcnames = ["GEOME", "SMITHSONIAN"]
       

### frequency counts of the entries in one collection and one field

In [None]:
def saveCounts2CSV (c, cname, threshold):
    """Write the entries of a frequency table that reach a minimum count to CSV.

    Args:
        c: Series of counts indexed by entry, e.g. the result of
            ``Series.value_counts()``.
        cname: Header for the entry column; also used in the output file name.
        threshold: Minimum count (inclusive) an entry needs to be written.

    The result is saved as ``counts <cname>.csv`` in the working directory,
    with columns ``cname`` and ``count`` and rows in the order they have in
    ``c``.
    """
    # Select with a boolean mask instead of growing a frame row by row:
    # DataFrame.append no longer exists in pandas 2.x and was quadratic anyway.
    kept = c[c >= threshold]
    th = pd.DataFrame(
        {cname: kept.index, "count": kept.to_numpy()},
        columns=[cname, "count"],
    )
    th.to_csv("counts "+ cname+".csv")

In [None]:
# Load the OPENCONTEXT dump ('#'-separated). keep_default_na=False keeps empty
# fields as empty strings instead of converting them to NaN.
src="OPENCONTEXT"
data = pd.read_csv('data/'+src+'.txt',sep='#', keep_default_na=False, encoding='utf-8')


In [None]:
# Frequency tables for four OPENCONTEXT fields. Each table is displayed and
# the entries reaching the given count are saved with saveCounts2CSV.
a = data['context label'].value_counts()
# This table may exceed the 4000-row display limit, so it cannot be shown in
# full by default; print its entries one by one instead.
for r in a.index:
    print(str(a[r]) +" :"+r)  #count: phrases
   
saveCounts2CSV(a, "OC context", 10)
    

b = data['Consists of_label'].value_counts()
b
saveCounts2CSV(b, "OC consists", 10)

c = data['Has type_label'].value_counts()
c
saveCounts2CSV(c, "OC type", 10)


# Threshold 0 keeps every item category.
d = data['item category'].value_counts()
d
saveCounts2CSV(d, "OC item category", 0)

In [None]:
# Load the GEOME dump ('#'-separated); empty fields stay empty strings.
src="GEOME"
data = pd.read_csv('data/'+src+'.txt',sep='#', keep_default_na=False, encoding='utf-8')



In [None]:
# from fuzzywuzzy import fuzz
# from fuzzywuzzy import process

# data.columns
# #data[fuzz.ratio(data['parent_habitat'], 'arctostaphylos pungens')> 90]

# #arctostaphylos pungens

In [None]:
# Frequency tables for the two GEOME habitat fields; entries seen at least
# 10 times are saved to CSV.
e = data['parent_habitat'].value_counts()
e
saveCounts2CSV(e, "GEOME habitat", 10)

f = data['parent_microHabitat'].value_counts()
f
# Save the microhabitat table (f) here; passing e would write the habitat
# counts a second time under the microhabitat file name.
saveCounts2CSV(f, "GEOME mircoHabitat", 10)


In [None]:
def getData(src):
    """Load one collection and merge its selected text fields per record.

    Reads ``data/<src>.txt`` ('#'-separated, empty fields kept as empty
    strings), keeps the columns listed in ``srcs[src]`` plus the id column
    named in ``srcids``, and joins the text columns into one string.

    Args:
        src: Collection name; must be a key of ``srcs``.

    Returns:
        DataFrame with columns ``src``, ``id`` and ``original``. ``original``
        holds the field values in reverse order of ``srcs[src]``, each
        followed by ';', with runs of ';' collapsed to a single ';'.
    """
    # .get() lets a collection without an id column reach the generated-id
    # branch below instead of failing with KeyError.
    id_property = srcids.get(src)
    text_properties = list(srcs[src])
    print(src)
    data = pd.read_csv('data/'+src+'.txt',sep='#', keep_default_na=False, encoding='utf-8')
    if not id_property:
        data['id'] = range(0,data.shape[0]) #when no id field is supplied
        id_property = 'id'

    data = data.filter(text_properties + [id_property], axis=1)
    data = data.rename(columns={id_property:'id'})
    data['src'] = src
    data['original']= ''

    # Each field is prepended, so the last listed field ends up first.
    for p in text_properties:
        if p != id_property:
            data['original'] = data[p].astype(str)+';'+data['original']

    # regex=True is required: pandas 2.x treats the pattern as a literal
    # string by default, which would leave runs of ';' untouched.
    data['original'] = data['original'].str.replace(r";+", ";", regex=True)
    data = data.dropna(subset = ['original'])
    # Selecting and re-indexing in one expression avoids an in-place
    # modification of a column slice.
    return data[['src', 'id', 'original']].reset_index(drop=True)
                                
#test = getData("GEOME")
#print(test)
                    

In [None]:
def gatherTerm(cframe, threshold):
    """Return, for each row of ``cframe``, its TF-IDF terms scoring above ``threshold``.

    Args:
        cframe: DataFrame with a ``description`` column holding one text per
            cluster.
        threshold: Exclusive lower bound on a term's TF-IDF weight.

    Returns:
        A list with one list of terms per row of ``cframe``, each ordered by
        descending weight; an empty list when no vocabulary can be built.
    """
    # max_df=1 is an absolute document count: only terms found in a single
    # description are kept. min_df=1 selects the same terms as an integer 0
    # would, but recent scikit-learn releases reject 0 during parameter
    # validation with a ValueError subclass, which the handler below would
    # silently turn into an empty result.
    tfidf_vectorizer = TfidfVectorizer(max_df=1, max_features=500, min_df=1,
                                       stop_words='english', use_idf=True,
                                       ngram_range=(1, 3))
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform(cframe['description'])
    except ValueError:  # no term obtained from tfidf when all records hold the same set of terms
        return []

    # get_feature_names() no longer exists in current scikit-learn; fall back
    # to it only on releases that predate get_feature_names_out().
    if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
        names = tfidf_vectorizer.get_feature_names_out()
    else:
        names = tfidf_vectorizer.get_feature_names()
    terms = [str(name) for name in names]  # plain str, not numpy string scalars

    # Work on the dense array once; indexing the sparse matrix element by
    # element inside the loops is far slower and gives the same values.
    weights = tfidf_matrix.toarray()
    ordered_index = weights.argsort()[:, ::-1]
    return [
        [terms[ind] for ind in ordered_index[i] if weights[i, ind] > threshold]
        for i in range(len(weights))
    ]

In [None]:
# Placeholders; both names are reassigned below and in the next cell.
df_content_src = pd.DataFrame()

data_list=[]
# Load the collections in parallel, one getData call per name in srcnames,
# using at most `cores` workers.
data_list = Parallel(n_jobs=min(len(srcnames), cores), verbose=50)(delayed(getData)(src) for src in srcnames)


In [None]:
# Stack the per-collection frames into one and keep a copy of the raw text
# before 'original' is overwritten with its cleaned form.
df_content_src = pd.concat(data_list, ignore_index=True)
df_content_src['original_beforeclean'] = df_content_src['original']
df_content_src.shape #5,833,656
del data_list  # release the per-collection frames

In [None]:
# English stop words from NLTK plus a few corpus-specific terms; clean()
# drops any token found in this list.
nltk.download('stopwords')
estopwords = stopwords.words('english')+["sample", "samples", "sampling", "sampled", "sample_id", "cm"]
# NOTE(review): stemmer is not referenced by the cells visible here -- confirm
# whether it is still needed.
stemmer = SnowballStemmer("english")

In [None]:
def clean(line):
    """Normalise one raw record into a space-separated string of kept tokens.

    The record is split on runs of '#' and then on whitespace. A lower-cased
    token is discarded when it is a URL, contains a digit, has at most two
    characters, or is in ``estopwords``. Punctuation in the remaining tokens
    is replaced by spaces, and the resulting pieces are kept when they are
    longer than two characters and not in ``estopwords``.

    Args:
        line: Raw record text, or a missing value.

    Returns:
        The kept pieces joined by single spaces; '' for a missing value or
        when nothing is kept.
    """
    if pd.isna(line):
        return ''

    # Escape the punctuation before building the character class. Unescaped,
    # the backslash in string.punctuation merely escapes the following ']',
    # so a literal backslash was never treated as punctuation.
    punctuation = re.compile('[' + re.escape(string.punctuation) + ']')

    kept = []
    for field in re.split(r'#+', line):
        for raw_token in field.split():
            token = raw_token.lower()
            if (token.startswith(('https://', 'http://'))
                    or re.search(r'\d', token)
                    or len(token) <= 2
                    or token in estopwords):
                continue
            # All punctuation is now a space, so splitting on spaces yields
            # the individual word pieces.
            for piece in punctuation.sub(' ', token).split(' '):
                if len(piece) > 2 and piece not in estopwords:
                    kept.append(piece)
    return ' '.join(kept)
    
    
# line = "Char WS-9513.01.06-b*""#event.###[blank]/records:/[blank]  [ *found * {}: [and at https://wwww.applies.com/###12 cm. l.d. ###width 1dm ###rock>mineral>blue mineral###"    
# # line = '""#""#""#""#"74"#""#""#"74"#""#""#""#""#""#""#""#""#"2020"#""#""#"University of Florida"#""#"23.8361515"#""#""#""#""#""#""#"Alpheidae sp. 1"#""#""#""#""#""#"BOMAN_3342"#""#""#""#""#""#"Oman"#""#"NSF_OMAN"#""#""#""#""#""#""#""#""#"OMAN_020A"#""#""#""#"NSF_OMAN"#""#""#""#""#""#""#"1"#""#""#""#""#""#""#""#""#""#""#""#""#""#""#""#""#"57.9767976"#""#"Event"#""#""#""#""#"Sample"#""#""#""#""#""#""#"in Pocillopora and Acropora rubble"#""#""#""#""#""#""#""#""#""#""#""#""#""#""#""#""#""#""#"Damanyat Islands, S of June Island"#""#""#""#""#""#""#""#""#""#""#""#""#"Abby Uehling"#""#""#""#""#""#""#""#""#""#""#""#""#""#""#""#""#""#"74"#""#"Abby Uehling"#""#""#""#""#""#""#""#""#""#""#""#""#""#""#""#""#""#""#""#""#"Muscat Governorate"#""#""#"OMAN_020A"#""#"Arthropoda"#""#""#""#""#""#""#""#""#""#""#""#""#""#""#""#""#""#""'
# # re.split(r'#+', line)
# newline = clean(line)
# newline

In [None]:
allplines_src = list()
# Clean every record in parallel with `cores` workers; the result has one
# cleaned string per row of df_content_src, in row order.
allplines_src = Parallel(n_jobs=cores)(delayed(clean)(line) for line in df_content_src['original'])


### frequency counts of combined fields from one or more collections

In [None]:
# All selected fields were combined into 'original'; replace that raw text
# with its cleaned form (the raw text stays in 'original_beforeclean').
df_content_src['original'] = allplines_src


In [None]:
# Debug aid: print the uncleaned text of every record whose cleaned text
# mentions the phrase. One vectorised mask visits each matching row exactly
# once; re-filtering the whole frame for every matching row would print
# records with duplicated text repeatedly.
hits = df_content_src['original'].str.contains('arctostaphylos pungens', regex=False, na=False)
for t in df_content_src.loc[hits, 'original_beforeclean']:
    print(t)
        



In [None]:
# Frequency of each distinct cleaned text across the combined collections;
# texts occurring at least 10 times are saved to CSV.
g = df_content_src['original'].value_counts()
g
saveCounts2CSV(g, "three habitats", 10)


In [None]:

# Drop records left blank in any column, then keep one row per distinct
# cleaned text. plines_src is the cleaned text that gets embedded next.
print("df_content_src.shape[0] before dropna:"+str(df_content_src.shape[0])) 
# np.nan is the supported spelling; the np.NaN alias is gone in NumPy 2.0.
df_content_src = df_content_src.replace(r'^\s*$', np.nan, regex=True)
df_content_src = df_content_src.dropna(how='any').reset_index(drop=True)
print("df_content_src.shape[0] after dropna:"+str(df_content_src.shape[0])) 
df_content_src.drop_duplicates(subset='original', ignore_index=True, inplace=True) #remove redundant rows
print("df_content_src.shape[0] after deduplicationa:"+str(df_content_src.shape[0])) 
pd.unique(df_content_src['src'])
plines_src = df_content_src['original']

#del allplines_src

In [None]:
# Load the fastText model used to embed the cleaned records.
model = fasttext.load_model('data/cc.en.300.bin') #takes 7GB

In [None]:
# Represent every cleaned record as a fastText sentence vector, then scale
# each row to unit length.
vlines_src = normalize(
    [model.get_sentence_vector(pline) for pline in plines_src],
    axis=1,
)
print("len(vlines_src):")
print(len(vlines_src))
#del model #reclaim memory
#print(vlines_src[0])
#print(len(vlines_src)) 


#with open("vlines_src.all.fasttext"+".pkl", 'wb') as outp:
#    pickle.dump(vlines_src, outp, pickle.HIGHEST_PROTOCOL)

In [None]:
def birch(rep, vlines_src):
    """Cluster the record vectors with BIRCH and print cluster sizes per source.

    Args:
        rep: Name of the vector representation; used only in the printed
            heading.
        vlines_src: Record vectors, one per row of the global
            ``df_content_src``.

    Returns:
        DataFrame with columns ``src``, ``id``, ``birchcluster``, ``content``
        and ``original``; ``content`` comes from the global ``plines_src``.
    """
    # n_clusters=None would leave the number of clusters unlimited.
    clusterer = Birch(branching_factor=20000, n_clusters=12, threshold=0.5)
    clusterer.fit(vlines_src)
    assignments = clusterer.predict(vlines_src)

    columns = {
        'src': df_content_src['src'],
        'id': df_content_src['id'],
        'birchcluster': assignments,
        'content': plines_src,
        'original': df_content_src['original'],
    }
    df_src = pd.DataFrame(columns)

    # Number of records per (source, cluster) pair, printed without truncation.
    sizes = df_src.groupby(['src','birchcluster']).size()
    print(rep+" source clusters:")
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print(sizes.transpose())

    return df_src


In [None]:
# Run BIRCH on the fastText vectors; df holds one row per record with its
# cluster label.
rep = 'fasttext'
df = birch(rep, vlines_src)

In [None]:
# with open("clusters.src.birch."+rep+".pkl", 'rb') as inp:
#     df = pickle.load(inp)
    
# df

In [None]:
#produce terms from Birch clustering result

# Concatenate the text of each BIRCH cluster into one description, extract
# the terms weighted above 0.1, and print them cluster by cluster.
frame = df[['birchcluster','content']].rename(
    columns={'birchcluster':'cluster', 'content':'description'})
cframe = frame.groupby('cluster', as_index = False).agg({'description': ' '.join})
terms = gatherTerm(cframe, 0.1)
for label, term in zip(cframe['cluster'], terms):
    print('cluster '+ str(label)+':')
    print(term)
    print()

In [None]:
# ## Hierarchical clustering within each BIRCH cluster, run in parallel

from fastcluster import linkage_vector
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import dendrogram


def h_clustering_fast(birchcluster_df, birchcluster_number, link_method, min_obs=2 ):
    """Build the hierarchical-clustering linkage matrix for one BIRCH cluster.

    Args:
        birchcluster_df: Observation vectors of the cluster, one per row.
        birchcluster_number: Label of this cluster in the BIRCH result (not
            used by the computation).
        link_method: Linkage method passed to ``fastcluster.linkage_vector``,
            e.g. 'ward'.
        min_obs: Minimum number of observations needed to run the clustering;
            must be >= 2.

    Returns:
        The linkage matrix, or an empty list when the cluster has fewer than
        ``min_obs`` observations.
    """
    if len(birchcluster_df) >= min_obs:
        return fastcluster.linkage_vector(birchcluster_df, link_method)
    return []
    
#result = h_clustering_fast(vlines[df.index[df['birchcluster']==0]], 0, 'ward', 20)
#result
    





def obtain_h_clusters(birchcluster_df, birchcluster_number, linkage_matrix, t=10, criterion="maxclust"):
    """Cut one BIRCH cluster's linkage matrix into flat sub-clusters.

    Args:
        birchcluster_df: DataFrame holding the observations of the BIRCH
            cluster; its index labels identify the records.
        birchcluster_number: Label of this cluster in the BIRCH result (not
            used by the computation).
        linkage_matrix: Linkage matrix for the cluster, or an empty list when
            none could be computed.
        t: Threshold passed to ``fcluster`` for the chosen criterion.
        criterion: Criterion passed to ``fcluster`` (e.g. 'maxclust',
            'distance', 'inconsistent').

    Returns:
        DataFrame with columns ``index`` (the index labels of
        ``birchcluster_df``) and ``hcluster`` (the flat sub-cluster label of
        each record), or an empty list when ``linkage_matrix`` is empty.
    """
    if not np.any(linkage_matrix):
        return []

    clusters = fcluster(linkage_matrix, t, criterion=criterion)
    # Copy the selected column before adding to it, so the assignment does
    # not write into a slice of another frame (SettingWithCopyWarning).
    cresult = birchcluster_df.reset_index()[['index']].copy()
    cresult['hcluster'] = clusters
    return cresult
    

    
#len(results)
#16 empty
#cresult = obtain_h_clusters(df[df['birchcluster']==3], 3, linkage_matrix=results[3], t=3 if df[df['birchcluster']==3].shape[0] < 100 else 10 , criterion="maxclust")
#cresult
    



In [None]:
# Sub-cluster every BIRCH cluster hierarchically, in parallel.
# Iterate over the labels actually present rather than assuming they are
# exactly 0..k-1; results[i] belongs to cluster_ids[i].
cluster_ids = sorted(int(c) for c in df['birchcluster'].unique())
members = [df[df['birchcluster'] == c] for c in cluster_ids]

results = Parallel(n_jobs=8, verbose=1)(
    delayed(h_clustering_fast)(vlines_src[part.index], c, 'ward', 2)
    for c, part in zip(cluster_ids, members))
print("# of linkage matrices obtained: "+str(len(results)))

# Ask for 3 sub-clusters in small clusters (< 100 records) and 10 otherwise.
cresultlist = list(Parallel(n_jobs=8, verbose=1)(
    delayed(obtain_h_clusters)(part, c, linkage_matrix=lm,
                               t=3 if part.shape[0] < 100 else 10,
                               criterion='maxclust')
    for c, part, lm in zip(cluster_ids, members, results)))

# Clusters too small to sub-cluster come back as [] and pd.concat only
# accepts pandas objects, so keep the DataFrames and concatenate them once.
frames = [cresult for cresult in cresultlist if isinstance(cresult, pd.DataFrame)]
if frames:
    combined = pd.concat(frames, ignore_index=True)
else:
    combined = pd.DataFrame(columns=['index', 'hcluster'])
combined.set_index('index', inplace=True)

#df now holds all the results
df = df.join(combined)




In [None]:

#produce terms from Birch + hierarchical clustering results

# Label every record with "<birch cluster>-<hierarchical cluster>", build one
# description per combined cluster, and print the terms weighted above 0.1.
# allterms collects the terms of all clusters in printing order.
df["cluster"] = df["birchcluster"].astype(str) +'-'+ df["hcluster"].astype(str)
frame = df[['cluster','content']].rename(columns={'content':'description'})
cframe = frame.groupby('cluster', as_index = False).agg({'description': ' '.join})
terms = gatherTerm(cframe, 0.1)
allterms = list()
for label, term in zip(cframe['cluster'], terms):
    print('cluster '+label+':')
    print(term)
    print()
    allterms.extend(term)






In [None]:
from collections import Counter
# Tally the terms gathered from all combined clusters and show them by
# descending count.
counter = Counter(allterms)
counter.most_common()