## Step 1.5 : Journal Field Information
- Part 1: Import the Scopus Journal List to get the upper-level field category of each journal
- Part 2: Identify journals from the Known Retraction List that are not listed in the Scopus Journal List
- Part 3: Use YAKE to find keywords for each field and match them against the journals
- Part 4: Manually generate keywords and match them against the remaining journals
- Part 5: Append the result of each step to the original list

In [None]:
!pip install yake
!pip install openpyxl

In [None]:
import yake
import pandas as pd
import numpy as np

In [None]:
from datetime import date, datetime as dt

# Run date as an ISO string (YYYY-MM-DD); it prefixes the name of the exported CSV.
today = date.today().isoformat()

In [None]:
# Set path --- Link to the box folder with your name
# Download Box Desktop to copy the pathname
#
# `{enterdirectorytofolder}` appears to be a placeholder: replace each occurrence with the
# folder path written as a quoted string before running the notebook.
# Later cells build file names by plain string concatenation
# (e.g. box_path_1 + 'journalcategoryscopus.csv'), so the path must end with a path separator.

# Folder name: step1.5-inputfile
box_path_1 = {enterdirectorytofolder}
# Folder name: step1-inputfile
# NOTE(review): box_path_2 is not referenced by any cell visible in this notebook — confirm it is still needed.
box_path_2 = {enterdirectorytofolder}

## Part 1: Import the Scopus Journal List to get the upper-level field category of each journal

In [None]:
# Load the Scopus journal list and add a lower-cased copy of the title column,
# which is the key used when matching titles regardless of letter case.
scopus_csv = box_path_1 + 'journalcategoryscopus.csv'
journalscopus = pd.read_csv(scopus_csv).assign(
    JournalandConferenceProceedings_lowercase=lambda frame: frame['JournalandConferenceProceedings'].str.lower()
)
journalscopus.head()

## Part 2: Identify journals from the Known Retraction List that are not listed in the Scopus Journal List

In [None]:
# Load the journals of the Known Retraction List and add the same lower-cased
# title column that the Scopus list carries.
knownretraction_csv = box_path_1 + 'knownretractionlist-journalcategories.csv'
journalknownretraction = pd.read_csv(knownretraction_csv).assign(
    JournalandConferenceProceedings_lowercase=lambda frame: frame['JournalandConferenceProceedings'].str.lower()
)
journalknownretraction.head()

In [None]:
# Attach the Scopus field category to every journal of the known retraction list and split
# the list into categorized / not-categorized journals.
#
# The previous version merged with the Scopus list but immediately overwrote the merge result,
# so the Scopus categories were never used. Here only the category is looked up (by lower-cased
# title) instead of merging the whole Scopus table, so the known-retraction columns keep their
# names ('JournalandConferenceProceedings', 'MainCategory') for the cells further down.
title_key = 'JournalandConferenceProceedings_lowercase'
journalknownretraction_merged = journalknownretraction.copy()

if 'MainCategory' in journalscopus.columns:
    scopus_category_by_title = (
        journalscopus
        .dropna(subset=[title_key, 'MainCategory'])
        .drop_duplicates(subset=title_key)   # the lookup index must be unique
        .set_index(title_key)['MainCategory']
    )
    scopus_category = journalknownretraction_merged[title_key].map(scopus_category_by_title)
    if 'MainCategory' in journalknownretraction_merged.columns:
        # Categories already present in the input file win; Scopus only fills the gaps.
        scopus_category = journalknownretraction_merged['MainCategory'].fillna(scopus_category)
    journalknownretraction_merged['MainCategory'] = scopus_category
elif 'MainCategory' not in journalknownretraction_merged.columns:
    raise KeyError("'MainCategory' was found neither in the Scopus journal list nor in the known retraction list")

is_uncategorized = journalknownretraction_merged['MainCategory'].isnull()

# Journals from the known retraction list that are categorized.
# (.copy() so that later cells can write into the frame without SettingWithCopyWarning;
#  .info() prints by itself, wrapping it in print() only adds a stray "None".)
journalknownretraction_cat = journalknownretraction_merged[~is_uncategorized].copy()
journalknownretraction_cat.info()

# Journals from the known retraction list that are not categorized to any field
journalknownretraction_notcat = journalknownretraction_merged[is_uncategorized].copy()
journalknownretraction_notcat.info()

In [None]:
# Share of the known retraction list that still has no field category
n_uncategorized = int(journalknownretraction_notcat['JournalandConferenceProceedings'].count())
n_total = int(journalknownretraction['JournalandConferenceProceedings'].count())

print('Count of Journal that are not categorized: ', n_uncategorized)
print('Percentage of Journals that are not categoriezed: ', round(n_uncategorized / n_total * 100, 2))

## Part 3: Use YAKE to find keywords for each field and match them against the journals

In [None]:
# For each field, collect the lower-cased titles of the categorized journals whose
# MainCategory mentions that field, and glue them into one space-separated string.
def _titles_in_field(field_label):
    """Return the lower-cased titles of categorized journals whose MainCategory contains `field_label`."""
    in_field = journalknownretraction_cat['MainCategory'].str.contains(field_label)
    return journalknownretraction_cat.loc[in_field, 'JournalandConferenceProceedings_lowercase']


lifescience_list = _titles_in_field('Life Science')
healthscience_list = _titles_in_field('Health Science')
physicalscience_list = _titles_in_field('Physical Science')
socialscience_list = _titles_in_field('Social Science')

lifescience_str = " ".join(lifescience_list)
healthscience_str = " ".join(healthscience_list)
physicalscience_str = " ".join(physicalscience_list)
socialscience_str = " ".join(socialscience_list)


In [None]:
# Words that are removed from the title strings before keyword extraction.
# The order matters for the removal step: 'reviews' is listed before 'review'.
new_stopwords = [
    # publication / venue words
    'science', 'journal', 'conference', 'bulletin', 'proceeding',
    # generic descriptors
    'research', 'experimental', 'reports', 'current', 'international',
    'reviews', 'archives', 'review', 'communication', 'opinion', 'american',
    # regions and remaining venue words
    'indian', 'european', 'serials', 'letters', 'korean',
]

In [None]:
# Remove the stopwords from every field string.
def _without_stopwords(text):
    """Delete every occurrence of each stopword from `text` (plain substring removal, in list order)."""
    for stopword in new_stopwords:
        text = text.replace(stopword, '')
    return text


lifescience_str = _without_stopwords(lifescience_str)
healthscience_str = _without_stopwords(healthscience_str)
physicalscience_str = _without_stopwords(physicalscience_str)
socialscience_str = _without_stopwords(socialscience_str)

In [None]:
# Get up to 100 keywords for each field and display them in a dataframe.
#
# The keywords are kept as one list per field. The previous version flattened them into a
# single list and cut it back with fixed slices ([0:100], [100:200], ...), which shifts
# keywords into the wrong field as soon as YAKE returns fewer than 100 keywords for a field.
cat = ['Life Science', 'Health Science', 'Physical Science', 'Social Science']
text_list = [lifescience_str, healthscience_str, physicalscience_str, socialscience_str]  # same order as `cat`

# YAKE settings are the same for every field, so the extractor is built once.
language = "en"
max_ngram_size = 1               # single-word keywords only
deduplication_threshold = 0.9
numOfKeywords = 100              # upper bound; fewer keywords may come back for a short text
custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)

# extract_keywords returns one entry per keyword; element 0 of each entry is the keyword text.
keywords_by_field = [
    [kw[0] for kw in custom_kw_extractor.extract_keywords(text)]
    for text in text_list
]

# Flat list of all keywords (field after field), kept for backward compatibility.
keyword = [kw for field_keywords in keywords_by_field for kw in field_keywords]

# One row per field: the field name and its list of keywords.
cat_key_df = pd.DataFrame({'Categories': cat, 'Keyword': keywords_by_field})

cat_key_df

In [None]:
# Categorize the journals that are not categorized yet, based on the field keywords.
#
# Fields are tried in the row order of `cat_key_df`; the first field that has a keyword
# contained in the title wins (same priority as an if/elif chain over the rows).
# Titles are processed through the Series itself instead of `series[i]` with a positional
# counter: the frame was produced by filtering, so its index labels are not 0..n-1 and
# label lookups with `i` hit the wrong row or raise KeyError.
def _field_from_keywords(title):
    """Return the first field with a keyword contained in `title`, or NaN when none matches."""
    if not isinstance(title, str):      # missing title: nothing to match against
        return np.nan
    for field, field_keywords in zip(cat_key_df['Categories'], cat_key_df['Keyword']):
        if any(kw in title for kw in field_keywords):
            return field
    return np.nan


# Work on an explicit copy: writing into a filtered slice triggers SettingWithCopyWarning
# and the write may not persist.
journalknownretraction_notcat = journalknownretraction_notcat.copy()
keyword_field = journalknownretraction_notcat['JournalandConferenceProceedings_lowercase'].map(_field_from_keywords)

# Only rows without a category are filled; unmatched rows stay NaN for the next part.
journalknownretraction_notcat['MainCategory'] = (
    journalknownretraction_notcat['MainCategory'].astype(object).fillna(keyword_field)
)

## Part 4: Manually generate keywords and match with the rest of the journals

In [None]:
# Journals that still have no MainCategory after the keyword matching,
# renumbered from 0 so that later cells can rely on a clean index.
still_uncategorized = journalknownretraction_notcat['MainCategory'].isnull()
notfound = journalknownretraction_notcat.loc[still_uncategorized].reset_index(drop=True)
notfound

In [None]:
# Clean the titles: keep only the text in front of the first '(' of each lower-cased title.
nameclean = [
    title.split('(', 1)[0]
    for title in notfound['JournalandConferenceProceedings_lowercase']
]
notfound['name_clean'] = nameclean

notfound

In [None]:
# Manually selected keyword lists for each field.
# The terms are matched as substrings of the lower-cased, cleaned journal titles, so every
# term must be lower case and free of surrounding whitespace.
#
# Fixes compared with the previous lists:
#   - life_phy_term: a missing comma fused 'poultry' and 'parasites' into 'poultryparasites'
#   - heal_phy_term: 'AIDS' / 'e-Health' could never match a lower-cased title
#   - heal_phy_term: 'bronchology ' carried a trailing space
#   - heal_phy_term: 'maxiollofacial' corrected to 'maxillofacial'
#   - duplicate entries removed ('rhumatisme', 'oncologist', 'aids')

# Terms that mark a title as a conference-like venue
conf = ['conference', 'workshop', 'summit', 'congress', 'symposium']

# Physical Science
phy_term = ['actuators','aeroacoustics','aerodynamic','aerospace','antenna','astronautical','astronomical',
            'automation','rhumatisme','atmosphere','equations','software','electronics','microelectronics',
            'bioengineering','broadband','chemie','circuits','cloud','computer','computing','crystal','cyber',
            'energy','engineering','ergonomics','geoscience','internet','iron','materials','mathematics',
            'micron','microsystems','mobile','nanotechnology','robotics','satellite','security','sensors','signal',
            'steel','telecommunications','toxicological','waste','wireless','airfield','highway','navigation',
            'cryptography','geometry','topology','bifurcation','chaos','algebra','algebraic','atmospheres',
            'geographic','physica','superconductivity','plastic','geographer', 'differentiable',
            'symbolic', 'logic', 'solar']

# Life Science
life_phy_term = ['bioinformatics','entomology','nematology','nicotine','tobacco','protein','crispr','poultry',
                 'parasites', 'mosquito', 'dendrology', 'dna', 'forestry', 'plant','bioengineering',
                 'acids','anatomical','anatomy','bacteriology','cytology','embo','entomologica','febs','genomic',
                 'histology','lipids','metallurgy','microbial','microrna','microscopy','nucleosides','nucleotides']

# Health Science
heal_phy_term = ['aging','aids','anesthesia','anesthesiology','arthroplasty','bioengineering','biomedical','reumatologia',
                 'blood','bone','bronchology','chiropractic','dementia','dentistry','diabetology','digestive','wrestling',
                 'disease','diseases','dna','e-health','elder','encephalopathy','epilepsy','eye','food','foot',
                 'geriatrics','gerontology','healthcare','illness','maxillofacial','medical','medicine','metabolic',
                 'morbidity','mosquito','muscle','neurology','nicotine','nurse','obesity','ophthalmic','ophthalmology',
                 'opthamology','orofacial','orthopaedic','osteoporosis','otorhinolaryngology','parasites','patient',
                 'psychonomic','pulmonology','respiration','rheumatology','roentgenology','shoulder','spine','surgeon',
                 'surgery','surgical','tobacco','trauma','dermatologie','psychiatrique','cardiac','cardiological',
                 'urologe','ophthalmologe','traitement','gait','posture','gerontologist','head','neck','std',
                 'thrombosis','hemostasis','arthritis','dental','prosthodontics','strength','conditioning','pharmacists',
                 'pharmacist','lancet','leukemia','lymphoma','liver','sclerosis','opthalmology','pharmazie','counseling',
                 'urologie','retina','seizure','breathing','anatomical','anatomy','thyroid','urolgia','prostate',
                 'craniofacial', 'cleft', 'laryngoscope', 'oncologist', 'obstetrician', 'gynaecologist',
                 'urologia', 'kardiologe']

# Social Science
soc_phy_term = ['age', 'business', 'education', 'e-government', 'politics', 'law', 'legal','e-learning',
                'humanities','anthropology', 'linguistic','naturalist','librarianship', 'reorganisation',
                'information', 'learning', 'architecture', 'museum', 'aging', 'culture', 'economy', 'elder',
                'geography', 'media', 'librarian','christian','church','coaching','crime','egalitarian',
                'entrepreneurship','interpreter','islam','judge','muslim','philosophical','philosophy','police',
                'punishment','reoganisation','school','supervision','taxation','taxes','urban','wrestling']

In [None]:
# first run: only titles that contain a conference-like term are categorized here
def _first_run_fields(title):
    """Return the fields for a conference-like title; an empty list for any other title.

    A title containing one of the `conf` terms always gets 'Physical Science', followed by
    every other field whose term list has a term contained in the title.
    """
    def mentions(terms):
        return any(term in title for term in terms)

    if not mentions(conf):
        return []

    fields = ['Physical Science']
    if mentions(heal_phy_term):
        fields.append('Health Science')
    if mentions(soc_phy_term):
        fields.append('Social Science')
    if mentions(life_phy_term):
        fields.append('Life Science')
    return fields


string = notfound['name_clean'].tolist()
cate2 = [_first_run_fields(title) for title in string]

# Store the fields as one comma-separated string per journal ('' when nothing matched)
notfound['MainCategory'] = [', '.join(fields) for fields in cate2]
notfound['MainCategory'] = notfound['MainCategory'].astype(str).replace('nan', '')
notfound

In [None]:
# second run: journals left without a category by the first run are matched against
# the manual term lists of all four fields.
# .copy() makes the filtered rows an independent frame, so assigning the new column below
# does not write into a slice of `notfound` (SettingWithCopyWarning).
stillnotfound = notfound[notfound['MainCategory'] == ''].copy()

string = stillnotfound['name_clean'].tolist()

# (field, terms) pairs in the order in which the fields are listed in the result
field_terms = [
    ('Physical Science', phy_term),
    ('Health Science', heal_phy_term),
    ('Social Science', soc_phy_term),
    ('Life Science', life_phy_term),
]

# For every title: all fields that have at least one term contained in the title
cate3 = [
    [field for field, terms in field_terms if any(term in title for term in terms)]
    for title in string
]

# One comma-separated string per journal; '' when no term matched.
# (The joined values are always plain strings, so no 'nan' clean-up is needed here.)
stillnotfound['MainCategory'] = [', '.join(fields) for fields in cate3]

stillnotfound = stillnotfound.drop(['name_clean'], axis=1)
stillnotfound

In [None]:
# Get the end result of this part: one row per journal of `notfound` with the category
# found by the first run (MainCategory_x) or by the second run (MainCategory_y).
founded = pd.merge(notfound, stillnotfound, on='JournalandConferenceProceedings_lowercase', how='left')

# Missing values (journals absent from the second run) become empty strings.
first_run_field = founded['MainCategory_x'].fillna('').astype(str).replace('nan', '')
second_run_field = founded['MainCategory_y'].fillna('').astype(str).replace('nan', '')

# The two category columns are combined by name. Selecting "every column after the first"
# would silently concatenate any additional column of the input file into MainCategory.
founded = pd.DataFrame({
    'JournalandConferenceProceedings': founded['JournalandConferenceProceedings_x'],
    'MainCategory': first_run_field + second_run_field,
})
founded.head()

## Part 5: Append the result of each step to the original list

In [None]:
# Combine the keyword-based category (MainCategory_x, from the not-categorized list) with the
# manually matched category (MainCategory_y, from `founded`) for every not-categorized journal.
result_1 = pd.merge(journalknownretraction_notcat, founded, on='JournalandConferenceProceedings', how='left')

# Missing values become empty strings so the two columns can be concatenated.
keyword_field = result_1['MainCategory_x'].fillna('').astype(str).replace('nan', '')
manual_field = result_1['MainCategory_y'].fillna('').astype(str).replace('nan', '')

# Combine the two category columns by name. Joining "every column after the first" also
# picked up 'JournalandConferenceProceedings_lowercase', which glued the journal title
# into MainCategory.
result_1 = pd.DataFrame({
    'JournalandConferenceProceedings': result_1['JournalandConferenceProceedings'],
    'MainCategory': keyword_field + manual_field,
})

result_1.info()   # .info() prints by itself; print(...) around it only added "None"
print(result_1.MainCategory.value_counts())
result_1.head()

In [None]:
# Stack the journals that were categorized from the start on top of the ones categorized
# in the later parts, renumber the rows, and keep only the title and its category.
stacked = pd.concat([journalknownretraction_cat, result_1], ignore_index=True)
result_2 = stacked[['JournalandConferenceProceedings', 'MainCategory']]

print(result_2.info())
result_2.head()

In [None]:
# Export the final list; the file name is prefixed with the run date
output_csv = f'{today}-journalcategory-knownretractionlist.csv'
result_2.to_csv(output_csv)