# Preparation

In [None]:
import pandas as pd
import numpy as np
import re

# Loading

In [2]:
# Read source file
rawdata = pd.read_csv('scopus2.csv')

In [3]:
processeddata = rawdata[rawdata["Author Keywords"].notnull() | rawdata["Index Keywords"].notnull()] #remove null row based on a column

In [4]:
rawdata.shape

(1512, 41)

In [5]:
processeddata.shape

(1229, 41)

In [None]:
processeddata

In [6]:
processeddata.to_csv('elimitedmissingkeywords.csv')

In [7]:
processeddata= pd.read_csv('elimitedmissingkeywords.csv')

In [None]:
processeddata

In [15]:
#used for output
author = processeddata.iloc[:,1]
title = processeddata.iloc[:,2]
year = processeddata.iloc[:,3]
source = processeddata.iloc[:,4]

#used for label instances
authkey = processeddata.iloc[:,17]
editkey = processeddata.iloc[:,18]

# Prepare functions

In [16]:
dictfile=open('USBRDict.txt','r') # open British-American dictionary 

ubdict={} # create an empty dictionary

# Fill dictionary with key:value pairs
for line in dictfile: 
    dictentry = str(line).split(':')
    ubdict[dictentry[0]]=dictentry[1][0:-1]

dictfile.close()    #close the dictionary file
    
#define a british2Amecian converting function

def britishise(keyword): 
    flag = 0
    for word in keyword.split():
        if word in ubdict:
            keyword = re.sub(word,ubdict[word],keyword)
    return keyword

In [17]:
import inflection #import library for sinuglarise words

In [18]:
def regularise(word):
    
    
    processedword = re.sub('[\"\']','', word.lower()) # remove ' and " , put everything to lower case
    processedword = re.sub(r'\(.*\)','', processedword) # remove () and everything inside
    processedword = re.sub(' +',' ',processedword) # remove continuous space
    processedword = re.sub('^ +','',processedword) # remove space at the beginning
    processedword = inflection.singularize(processedword) # convert all plural to singlar: Unifying to Singular
    processedword = britishise(processedword) # convert American spelling to british spelling: Unifying to British
    return processedword

# Convert loading to a list of dictionaries 

In [19]:
dictlist = [] #create an empty list

In [20]:
#Fill the list with dictionaries
for i in range(0,len(authkey)):
    singledict = {'': 0}
    if authkey[i]:
        for keywordsingle in str(authkey[i]).split(';'):
                        
            processedkeyword = regularise(keywordsingle)
                        
            singledict[processedkeyword] = 1     
            
    if editkey[i]:
        for keywordsingle in str(editkey[i]).split(';'):
                        
            processedkeyword = regularise(keywordsingle)
                        
            singledict[processedkeyword] = 1     
            
    dictlist.append(singledict)

In [21]:
dictlist

[{'': 0,
  'accounting and finance': 1,
  'african american': 1,
  'college major': 1,
  'influence factor': 1,
  'information source': 1,
  'nan': 1},
 {'': 0,
  'complaint management': 1,
  'goffman': 1,
  'impression management': 1,
  'nan': 1,
  'practical knowing': 1,
  'teamwork': 1,
  'trust': 1},
 {'': 0,
  'cost of ownership': 1,
  'management accounting standard': 1,
  'nan': 1,
  'semiconductor industry': 1,
  'standard development history': 1},
 {'': 0,
  'commitment': 1,
  'information management': 1,
  'knowledge acquisition': 1,
  'knowledge based system': 1,
  'knowledge management': 1,
  'knowledge management system': 1,
  'knowledge sharing': 1,
  'knowledge system': 1,
  'knowledge-sharing': 1,
  'motivation': 1,
  'personal information management': 1,
  'related factor': 1,
  'research model': 1,
  'social aspect': 1,
  'survey': 1,
  'three component model': 1,
  'transparency': 1},
 {'': 0,
  'accounting': 1,
  'business model': 1,
  'nan': 1,
  'non- reporting': 

In [22]:
len(dictlist)

1229

In [23]:
#remove invalid dictionary entries
for dict in dictlist:
    dict.pop('', 0)
    dict.pop('nan', 1)

# Convert a list of dictionaries to one-hot vector

In [24]:
from sklearn.feature_extraction import DictVectorizer

In [25]:
v = DictVectorizer(sparse=False)

In [26]:
verctorised = v.fit_transform(dictlist)

If save to csv: #np.savetxt(".csv", verctorised, delimiter=",") : Too large

In [27]:
np.savez_compressed('regularised', a=verctorised) #save to npz files

In [28]:
loaded = np.load('regularised.npz')['a']

In [29]:
np.array_equal(loaded,verctorised) # check whether loaded array is same as saved array

True

# Agglomerative Clustering

## Prepare data and library

In [30]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
loaded = np.load('regularised.npz')['a'] #loaded vectorised data

In [None]:
np.array_equal(loaded,verctorised) # check only for the first time

## Define algorithm

In [44]:
hierechycluster = AgglomerativeClustering(n_clusters=30, affinity='euclidean', 
                                            memory=None, connectivity=None, 
                                            compute_full_tree='auto', linkage='ward')

## Run training

In [45]:
%prun -l nmf.py hierechycluster.fit(loaded)

 

## Save trained model

In [33]:
from sklearn.externals import joblib

In [46]:
joblib.dump(hierechycluster, 'agglomeraiveregularised30groups.pkl')

['agglomeraiveregularised30groups.pkl']

## The group label is in "hierechycluster.labels_ " now

In [38]:
len(hierechycluster.labels_) - len(processeddata) #test the length before combining

0

# Build a new array for instance with rawmaterial and label ( "hierechycluster.labels_")

In [47]:
a = hierechycluster.labels_
a.resize((len(a), 1))

In [48]:
b = np.hstack((processeddata, a))

In [49]:
processingdata = processeddata.copy(deep=True)

In [50]:
processingdata['label'] = hierechycluster.labels_

In [51]:
processingdata

Unnamed: 0.1,Unnamed: 0,Authors,Title,Year,Source title,Volume,Issue,Art. No.,Page start,Page end,...,ISBN,CODEN,PubMed ID,Language of Original Document,Abbreviated Source Title,Document Type,Access Type,Source,EID,label
0,0,"Young J., Herath S.K., McCoy R.M.",Why not accounting and finance: An African Ame...,2018,Journal of Education for Business,93,4,,165,171,...,,,,English,J. Edu. Bus.,Article,,Scopus,2-s2.0-85044462722,15
1,1,"Aili C., Nilsson L.-E.",Backstage accounting in preschool: analysing d...,2018,Ethnography and Education,13,2,,218,234,...,,,,English,Ethnogr. Educ.,Article,,Scopus,2-s2.0-85018843303,15
2,2,"Sandholzer M., Wouters M.",The history of the standard for the calculatio...,2018,Accounting History,,,,,,...,,,,English,Account. Hist.,Article in Press,,Scopus,2-s2.0-85045146460,15
3,3,"Hwang Y., Lin H., Shin D.",Knowledge system commitment and knowledge shar...,2018,International Journal of Information Management,39,,,220,227,...,,IJMAE,,English,Int J Inf Manage,Article,,Scopus,2-s2.0-85039744212,7
4,4,"Girella L., Tizzano R., Ferrari E.R.",Concepts travelling across disciplinary fields...,2018,Journal of Management and Governance,,,,1,30,...,,,,English,J. Manage. Gov.,Article in Press,,Scopus,2-s2.0-85043688733,20
5,5,"Kinslow B., De Heer H.D., Warren M.",Few adults with functional limitations advised...,2018,Physiotherapy Theory and Practice,,,,1,7,...,,PTHPE,,English,Physiother. Theory Pract.,Article in Press,,Scopus,2-s2.0-85042907106,1
6,6,"Horton K.E., Wanderley C.D.A.",Identity conflict and the paradox of embedded ...,2018,Management Accounting Research,38,,,39,50,...,,,,English,Manage. Account. Res.,Article,,Scopus,2-s2.0-85009774798,15
7,7,"Bai Y., Li J., Bai Y., Ma W., Yang X., Ma F.",Development and validation of a questionnaire ...,2018,BMC Health Services Research,18,1,107,,,...,,,,English,BMC Health Serv. Res.,Article,,Scopus,2-s2.0-85042027900,26
8,8,"Murphy B., Quinn M.",The emergence of mandatory continuing professi...,2018,Accounting History,23,1-Feb,,93,116,...,,,,English,Account. Hist.,Article,,Scopus,2-s2.0-85044140728,15
9,9,"Persson M.E., Radcliffe V.S., Stein M.",Elmer G Beamer and the American Institute of C...,2018,Accounting History,23,1-Feb,,71,92,...,,,,English,Account. Hist.,Article,,Scopus,2-s2.0-85043692705,15


In [52]:
processingdata.to_csv('labelled300groups.csv')

# Group Analysing

In [53]:
def statistic(akey,ekey,materiality):

    fdict={}
    returndict = {}
    keythreshold = materiality * len(akey)
    #Fill the list with dictionaries
    for i in range(1,len(akey)):

        singledict = {}
#        if akey.values[i] != 'NaN' and akey.values[i] != 'nan':
        for keywordsingle in str(akey.values[i]).split(';'):

            processedkeyword = regularise(keywordsingle)

            singledict[processedkeyword] = 1     

#        if ekey.values[i] != 'NaN' and ekey.values[i] != 'nan':
        for keywordsingle in str(ekey.values[i]).split(';'):

            processedkeyword = regularise(keywordsingle)

            singledict[processedkeyword] = 1     


        for key in singledict:
            if key in fdict:
                fdict[key] = fdict[key] + 1
            else:
                fdict[key] = 1
                
    if 'nan' in fdict: fdict.pop('nan')
    
    if 'NaN' in fdict: fdict.pop('NaN')
    
    for key in fdict:
        if fdict[key] > keythreshold:
            returndict[key] = fdict[key]
    
    return returndict

In [58]:
def checkpart(group, materiality):
    

    selectedpart = processingdata.loc[processingdata['label'] == group]

    authkey = selectedpart.iloc[:,17]
    editkey = selectedpart.iloc[:,18]
    
    dictforreturn = statistic(authkey,editkey,materiality)
    
    return dictforreturn, len(selectedpart)

In [59]:
isolatedgroup = 0
monstergroup = 0

isolatedlist = []
monsterlist = []

for i in range(0,30):
     

    
    
    groupkeyword, groupsize = checkpart(i,0.3)
    
    if groupsize <= 5: 
        isolatedgroup = isolatedgroup + 1
        isolatedlist.append(groupsize)
    elif groupsize >= 400: 
        monstergroup = monstergroup + 1
        monsterlist.append(groupsize)
        print('GROUP',i,'Size',groupsize) 
        print (groupkeyword)
    else:
        print('GROUP',i,'Size',groupsize) 
        print (groupkeyword)
        
print ('isolatedgroup',isolatedgroup)

print (isolatedlist)


print ('monstergroup', monstergroup)

print (monsterlist)

GROUP 0 Size 7
{'female': 5, 'united state': 5, 'body mass index': 4, 'logistic model': 4, 'body mass': 5, 'male': 4, 'obesity': 5, 'human': 6, 'article': 5, 'statistical model': 3, 'cross-sectional study': 3, 'risk': 3, 'risk factor': 3, 'statistic': 3}
GROUP 1 Size 58
{'human': 57, 'middle aged': 27, 'article': 46, 'female': 44, 'adult': 43, 'questionnaire': 21, 'male': 48}
GROUP 2 Size 19
{'health care delivery': 6, 'human': 14, 'health care organization': 11, 'review': 15, 'accounting': 14, 'united state': 12, 'financial management': 8}
GROUP 3 Size 18
{'health care delivery': 12, 'human': 10, 'reimbursement': 6, 'article': 16, 'economic': 9, 'united state': 7, 'financial management': 8, 'delivery of health care': 12, 'health care policy': 8, 'health care cost': 9, 'organization and management': 15}
GROUP 5 Size 10
{'major clinical study': 5, 'prospective study': 5, 'body weight': 4, 'adult': 9, 'female': 7, 'cohort analysis': 5, 'aged': 5, 'body mass': 6, 'obesity': 6, 'male': 7, 