## Step 5 : Journal Field Information
- Part 1: Import the Scopus journal list to get the upper-level field category for journals
- Part 2: Clean the journal titles in the union list (2a) and match them against the Scopus journal list (2b)
- Part 3: Identify journals from the union list that are not listed in the Scopus journal list
- Part 4: Use YAKE to find keywords in each field and match them with the uncategorized journals
- Part 5: Manually generate keywords, match them with the rest of the journals, and append the result of each step to the original union list


Input File: 
   - Scopus list of journal titles (from Step 5)
       - 'journal/ext_list_{latest_MM_YYYY}.xlsx'  # Get latest one (See Part 1 below)
       
   - Union list of retracted publication with retraction year (from Step 4):
       - unionlist/unionlist_with_retractionyear_{date}.csv


Output File: 
   - unionlist_completed_{date}.csv
  

###### Uncomment the line of code "....to_csv(..)"  to save file to your local directory

In [None]:
# !pip install git+https://github.com/LIAAD/yake

In [None]:
import yake
import pandas as pd
import numpy as np
import os
import re
from datetime import date, datetime as dt

### Directory Setup

In [None]:
# Project root: the directory the notebook is launched from.
retraction_index_path = os.path.abspath('./.')

data_dir = retraction_index_path+'/data/' # data directory
result_dir = retraction_index_path+'/result/'

# Create the 'journal' folder for journal title classification.
# makedirs also creates a missing 'data' parent (os.mkdir would raise
# FileNotFoundError), and exist_ok=True makes re-running the cell safe.
os.makedirs(data_dir+'journal', exist_ok=True)

### Part 1: Import Scopus Journal List to get upper-level field category for journals

In [None]:
# Reading Scopus journal classification (every sheet, as a dict of DataFrames).
# See latest version: 'https://www.elsevier.com/products/scopus/content' then
#     click on 'Download the Source title list (includes discontinued sources list)'
scopus_source_list_path = data_dir+'journal/ext_list_August_2024.xlsx'

# Fail here with an actionable message. The previous `except ...: pass` left
# `scopus_journal_data` undefined, so the failure only surfaced as a NameError
# in the next cell.
if not os.path.exists(scopus_source_list_path):
    raise FileNotFoundError(
        f"Scopus source title list not found at {scopus_source_list_path}. "
        "Download it and save it under data/journal/ (update the file name above if the release differs)."
    )

scopus_journal_data = pd.read_excel(scopus_source_list_path, sheet_name=None)

In [None]:
# Sheet names in workbook order; the first sheet is used as the journal sheet.
scopus_journal_data_sheets_names = [sheet_name for sheet_name in scopus_journal_data]
scopus_journal_sheet = scopus_journal_data[scopus_journal_data_sheets_names[0]]

# Title column first, then the subject-area columns that feed 'MainCategory'.
selected_columns = [
    'Source Title',
    'Top level:\n\nLife Sciences',
    'Top level:\n\nSocial Sciences',
    'Top level:\n\nPhysical Sciences',
    'Top level:\n\nHealth Sciences',
    '1000 \nGeneral',
]

# Keep only those columns and turn empty cells into NaN.
scopus_journal_filtered = (
    scopus_journal_sheet
    .filter(items=selected_columns)
    .replace('', np.nan)
)
scopus_journal_filtered

In [None]:
# Joining the cell values for each category within the subject areas
# for each of the journal titles.

def _join_subject_areas(row):
    """Comma-join the non-missing subject-area cells of one journal row."""
    return ','.join(
        row[column] for column in selected_columns[1:] if pd.notna(row[column])
    )

# 'MainCategory' <-- concatenated subject areas per journal title
scopus_journal_filtered['MainCategory'] = scopus_journal_filtered.apply(_join_subject_areas, axis=1)

scopus_journal_filtered = scopus_journal_filtered.rename(
    columns={selected_columns[0]: 'JournalandConferenceProceedings'}
)

# scopus_journal_part <-- only the title and its category
scopus_journal_part = scopus_journal_filtered.loc[:, ['JournalandConferenceProceedings', 'MainCategory']]

scopus_journal_part

In [None]:
# Scopus conference categories: fourth sheet of the workbook, keeping the
# second column and the last column, renamed to the names used in this notebook.
conference_sheet = (
    scopus_journal_data[scopus_journal_data_sheets_names[3]]
    .iloc[:, [1, -1]]
    .rename(columns={
        'Source Title': 'JournalandConferenceProceedings',
        'All Science Journal Classification Codes (ASJC)': 'ASJC',
    })
)
conference_sheet

In [None]:
# Assigning main category subjects to their ASJC codes.
# Two-digit ASJC prefixes, grouped by the main category they map to.
_asjc_prefixes_by_area = {
    'General': [10],
    'Life Sciences': [11, 13, 24, 28, 30],
    'Social Sciences': [12, 14, 18, 20, 32, 33],
    'Physical Sciences': [15, 16, 17, 19, 21, 22, 23, 25, 26, 31],
    'Health Sciences': [27, 29, 34, 35, 36],
}

# Flatten to {prefix: main category}, ordered by prefix (10 .. 36).
lookup = dict(sorted(
    (prefix, area)
    for area, prefixes in _asjc_prefixes_by_area.items()
    for prefix in prefixes
))

In [None]:
def map_asjc_to_category(asjc_value, category_lookup=None):
    """
    Map a ';'-separated string of ASJC codes to its main categories.

    :param asjc_value: ASJC codes as one string, e.g. '1100; 2700;'
    :param category_lookup: optional {two-digit prefix (int): category} mapping;
        defaults to the notebook-level ``lookup`` dictionary
    :return: the distinct categories joined by ',' in order of first
        appearance ('' when the string holds no codes)
    :raises KeyError: if a code's two-digit prefix is not in the mapping
    :raises ValueError: if a code does not start with two digits
    """
    # Resolved at call time so the notebook-level dictionary remains the default.
    mapping = lookup if category_lookup is None else category_lookup

    categories = []
    for code in asjc_value.strip().split(';'):
        # Strip before the emptiness test so a whitespace-only fragment is
        # skipped instead of reaching int('').
        code = code.strip()
        if not code:
            continue
        category = mapping[int(code[:2])]
        # First-seen order keeps the output reproducible; list(set(...)) did not.
        if category not in categories:
            categories.append(category)

    return ','.join(categories)

In [None]:
# Normalise the ASJC column to text (missing -> ""), then derive the category.
conference_sheet['ASJC'] = conference_sheet['ASJC'].fillna("").map(str)
conference_sheet['MainCategory'] = conference_sheet['ASJC'].map(map_asjc_to_category)

# Same two-column layout as the journal part, so the two can be stacked.
conference_sheet_part = conference_sheet.loc[:, ['JournalandConferenceProceedings', 'MainCategory']]
conference_sheet_part

In [None]:
# Appending Both the Journal titles & Conferences from Scopus
# Both inputs carry the columns 'JournalandConferenceProceedings' and 'MainCategory'.

journalscopus = pd.concat([scopus_journal_part,conference_sheet_part])
# The trailing backslash continues onto the commented-out line below, so the
# cell just displays the frame; uncomment the `.to_csv(...)` line to save the
# file that the next cell reads back.
journalscopus\
#         .to_csv(data_dir+'journal/scopus_journalconferencecategory.csv' )

In [None]:
# Reload the saved Scopus journal/conference list (dropping the stored index column).
journalscopus = (
    pd.read_csv(data_dir+'journal/scopus_journalconferencecategory.csv')
    .drop(columns=['Unnamed: 0'])
)

# Lower-cased, stripped title: the key used to merge with the union list.
journalscopus = journalscopus.assign(
    JournalandConferenceProceedings_lowercase=lambda frame: (
        frame['JournalandConferenceProceedings'].str.lower().str.strip()
    )
)

# One row per normalised title (first occurrence wins).
journalscopus_deduplicated = journalscopus.drop_duplicates(
    subset=['JournalandConferenceProceedings_lowercase']
)
journalscopus_deduplicated

### Part 2a: Cleaning the Journal and Conference Titles in the UnionList

In [None]:
# Date stamp used in the union list file names:
# unionlist/unionlist_with_retractionyear_{date}.csv is read below, and the
# (commented-out) final export uses the same stamp.
getdate= {'unionlist': '2024-07-09'}

In [None]:
def clean_journal_title(df_: pd.DataFrame) -> pd.DataFrame:
    """
    Add a normalised title column 'JournalandConferenceProceedings_clean'.

    The title is stripped and lower-cased, '&amp' / '&' are removed, a leading
    'the ' is dropped, ordinals (1st, 2nd, ...), remaining digits and
    punctuation are removed, and runs of whitespace collapse to one space.
    Missing titles stay missing instead of raising.

    :param df_: dataframe with a 'JournalandConferenceProceedings' column
    :return: a copy of ``df_`` with the extra clean column (input is not modified)
    """
    def _sub(pattern, value):
        # re.sub only accepts strings; pass NaN/None through untouched.
        return re.sub(pattern, '', value) if isinstance(value, str) else value

    df = df_.copy(deep=True)

    titles = df['JournalandConferenceProceedings'].str.strip().str.lower()

    # remove '&amp' and '&' (literal matches, so regex is switched off explicitly)
    titles = titles.str.replace("&amp", "", regex=False).str.replace("&", "", regex=False)

    # remove 'the' if it starts a journal title
    titles = titles.str.replace(r'(?i)^(the) ', '', regex=True)

    # remove positions such as 1st, 2nd, 3rd, 4th from the journal titles
    titles = titles.map(lambda value: _sub(r'\b\d+(st|nd|rd|th)\b', value))

    # remove other digits and punctuation from the journal titles
    titles = titles.map(lambda value: _sub(r'[^\w\s]|[\d]', value))

    # remove extra whitespace in between words
    titles = titles.str.replace(r'\s+', ' ', regex=True).str.strip()

    df['JournalandConferenceProceedings_clean'] = titles
    return df

In [None]:
# Load Unionlist list and clean --> journalknownretraction_clean and
# knownretractionlist (copy of unionlist with a lower-cased title key)
unionlist = (
    pd.read_csv(data_dir+f"unionlist/unionlist_with_retractionyear_{getdate['unionlist']}.csv")
    .drop(columns=['Unnamed: 0'])
)

# Full union list plus the lower-cased, stripped journal title.
knownretractionlist = unionlist.copy(deep=True)
knownretractionlist = knownretractionlist.assign(
    JournalandConferenceProceedings_lowercase=knownretractionlist['Journal'].str.lower().str.strip()
)

# DOI + journal title only, under the column names used by the Scopus list.
journalknownretraction = (
    unionlist.loc[:, ['DOI', 'Journal']]
    .rename(columns={'Journal': 'JournalandConferenceProceedings'})
    .assign(JournalandConferenceProceedings_lowercase=lambda frame: (
        frame['JournalandConferenceProceedings'].str.lower().str.strip()
    ))
)

# Cleaning the Journal Titles from the Unionlist
journalknownretraction_clean = clean_journal_title(journalknownretraction)

# Distinct cleaned titles (a missing title counts once).
after_cleaning = journalknownretraction_clean['JournalandConferenceProceedings_clean'].drop_duplicates().shape[0]

print(f'The total number of journal titles in the unionlist is {after_cleaning} after cleaning')
journalknownretraction_clean

In [None]:
"""
Export the journalknownretraction_clean file for OpenRefine further cleaning
"""
# The trailing backslash continues onto the commented-out line below, so the
# cell only displays the frame; uncomment the `.to_csv(...)` line to write it.
journalknownretraction_clean \
#                .to_csv(data_dir+'journal/unionlist_journalcategory.csv')

#### Import the Cleaned Journal Titles From OpenRefine

In [None]:
# Titles after the manual OpenRefine pass (dropping the stored index column).
journalknownretraction_clean2 = (
    pd.read_csv(data_dir+'journal/unionlistjournalcategory_openrefined_updated.csv')
    .drop(columns='Unnamed: 0')
)

# Getting the unique total of journal titles in the Unionlist after OpenRefine cleaning.
# The OpenRefine column replaces the clean title from here on.
journalknownretraction_clean2['JournalandConferenceProceedings_clean'] = journalknownretraction_clean2['Journal_Openrefined']

# One row per cleaned title; the first lower-cased spelling seen is kept with it.
journalknownretraction_unique = (
    journalknownretraction_clean2
    .loc[:, ['JournalandConferenceProceedings_lowercase', 'JournalandConferenceProceedings_clean']]
    .drop_duplicates(subset='JournalandConferenceProceedings_clean', keep='first')
)

after_cleaning2 = journalknownretraction_unique.shape[0]

print(f'The total number of journal titles in the knownretraction list is {after_cleaning2} after cleaning with OpenRefine')

journalknownretraction_clean2

In [None]:
# Display the de-duplicated titles (one row per cleaned journal title).
journalknownretraction_unique

### Part 2b: Identify Journal Field Category from the Scopus Journal List

In [None]:
def get_resolved_journaltitle_count(df: pd.DataFrame) -> tuple:
    """
    Count distinct journal titles by classification status.

    Counts are taken over the 'JournalandConferenceProceedings_clean' column;
    a missing title counts as one distinct value.

    :param df: dataframe with 'JournalandConferenceProceedings_clean' and
        'MainCategory' columns (read only, not modified)
    :return: (titles with a MainCategory, titles without one, all titles)
    """
    titles = df['JournalandConferenceProceedings_clean']
    is_classified = df['MainCategory'].notna()

    classified = len(titles[is_classified].drop_duplicates())
    unclassified = len(titles[~is_classified].drop_duplicates())
    total = len(titles.drop_duplicates())

    return classified, unclassified, total

In [None]:
# Use 'JournalandConferenceProceedings_lowercase' to merge with the Scopus journal list.
# The left join keeps every union-list title; titles absent from Scopus get a
# missing MainCategory.
journalknownretraction_cat = journalknownretraction_unique.merge(
    journalscopus_deduplicated.iloc[:, 1:],
    how='left',
    on='JournalandConferenceProceedings_lowercase',
)

print(get_resolved_journaltitle_count(journalknownretraction_cat))

In [None]:
# Display the union-list titles with the category merged in from Scopus.
journalknownretraction_cat

In [None]:
# Round 1 summary: (classified, unclassified, total) distinct cleaned titles.
first_pass = get_resolved_journaltitle_count(journalknownretraction_cat)

print(f'Of {after_cleaning2} total journal titles in the knownretractionlist')

print(
    f'Categorized Round 1: The total number of classified journal titles with Scopus Journal is {first_pass[0]}, '
    f'and {first_pass[1]} remain unclassified.'
)

print(f'That is, {100 * (first_pass[0] / after_cleaning2):.2f}% classified journals')

In [None]:
# Finding Numbers of DOIs matched with Scopus classification

# Titles (first two columns: lower-cased key and clean title) found in Scopus.
df_firstpass = journalknownretraction_cat.iloc[:, :2].merge(
    journalscopus_deduplicated.iloc[:, 1:],
    how='inner',
    on='JournalandConferenceProceedings_lowercase',
)

# Union-list records whose lower-cased title is among those matched titles.
no_doi_firstpass = (
    knownretractionlist.loc[:, ['DOI', 'JournalandConferenceProceedings_lowercase']]
    .merge(
        df_firstpass.loc[:, ['JournalandConferenceProceedings_lowercase', 'MainCategory']],
        how='inner',
        on='JournalandConferenceProceedings_lowercase',
    )
    .shape[0]
)

print('The total number of DOIs matched in Scopus journal list is:', no_doi_firstpass)
print(f'Which is {no_doi_firstpass/len(knownretractionlist)*100:.2f}% of the {len(knownretractionlist)} DOIs')

## Part 3: Identify Journal from the Unionlist that are not listed in the Scopus Journal List

In [None]:
# Working copy of the Scopus-matched titles with whitespace-trimmed categories.
journalknownretraction_categories = journalknownretraction_cat.copy(deep=True)
journalknownretraction_categories['MainCategory'] = journalknownretraction_categories['MainCategory'].str.strip()

# Journals from the known retraction list that are categorized
journalknownretraction_cat1 = journalknownretraction_categories.loc[
    journalknownretraction_categories['MainCategory'].notna()
].copy(deep=True)

# Journals from the known retraction list that are not categorized to any field
journalknownretraction_notcat = journalknownretraction_categories.loc[
    journalknownretraction_categories['MainCategory'].isna()
].copy(deep=True)

print('Number of Journals that are not categorized: ', first_pass[1])
print(f'Percentage of Journals that are not categoriezed: {round(int(first_pass[1])/int(after_cleaning2)*100, 2)}%')

## Part 4: Use YAKE to find keyword in each field and match with the journals

In [None]:
# Display the titles that received no category from the Scopus merge.
journalknownretraction_notcat

In [None]:
# Create a list of journal name str from journals that are already categorized
def _field_title_words(journal_df, field_label):
    """
    Collect the title vocabulary of one field.

    :param journal_df: dataframe with 'MainCategory' and
        'JournalandConferenceProceedings_clean' columns
    :param field_label: text to look for in 'MainCategory', e.g. 'Life Science'
    :return: (titles of the field, the titles joined by spaces, list of words)
    """
    # na=False: a journal without a category belongs to no field. With na=True
    # every uncategorized title was added to all four vocabularies.
    in_field = journal_df['MainCategory'].str.contains(field_label, na=False)
    # Missing titles are dropped because str.join cannot take NaN.
    titles = journal_df.loc[in_field, 'JournalandConferenceProceedings_clean'].dropna()
    joined = " ".join(titles)
    return titles, joined, re.sub(r'[^\w\s]', '', joined).split()


lifescience, lifescience_, lifescience_list = _field_title_words(journalknownretraction_cat, 'Life Science')

healthscience, healthscience_, healthscience_list = _field_title_words(journalknownretraction_cat, 'Health Science')

physicalscience, physicalscience_, physicalscience_list = _field_title_words(journalknownretraction_cat, 'Physical Science')

socialscience, socialscience_, socialscience_list = _field_title_words(journalknownretraction_cat, 'Social Science')



In [None]:
#!pip install stopwordsiso
import stopwordsiso as stopwords

# stop words list of all languages in the ISO-639 standard was used to process the titles. 
# https://github.com/stopwords-iso/stopwords-iso  stopwords.stopwords("en") stopwords.stopwords.lang()

In [None]:
# Create list of stopwords
# Hand-picked generic words (publication terms, country and nationality names,
# a few publisher/journal brands), extended below with the stopwords-iso lists.
# NOTE(review): 'national academy' and 'united kingdom' are two-word entries;
#   they are later compared against single words produced by `.split()`, so
#   they can never match — confirm whether one-word entries were intended.
# NOTE(review): 'spainish' and 'turkukraine' look like typos (perhaps 'spanish',
#   and 'turk' + 'ukraine' with a missing comma) — confirm before changing.
new_stopwords = ['&', '&amp', 'acta', 'africa', 'african', 'albania', 'albanian', 'america', 'american', 'and', 
                 'andorra', 'applied', 'archives', 'armenia', 'asian', 'asian-australasian', 'association', 'australasian', 
                 'austria', 'austrian', 'azerbaijan', 'belarus', 'belgium', 'bmc', 'bosnia', 'brazil', 'brazilian', 'british', 
                 'bulgaria', 'bulgarian', 'bulletin', 'cadernos', 'canadian', 'china', 'chinese', 'communication', 'communications', 
                 'conference', 'croatia', 'croatian', 'current', 'cyprus', 'czech', 'denmark', 'dynamic', 'dynamics', 'east', 
                 'elife', 'estonia', 'europe', 'european', 'experimental', 'f1000research', 'finland', 'france', 'georgia', 
                 'german', 'germany', 'greece', 'herzegovina', 'hungary', 'iceland', 'india', 'indian', 'indonesia', 'indonesian', 
                 'international', 'iran', 'iranian', 'ireland', 'italian', 'italy', 'jama', 'japan', 'japanese', 'journal', 'jurnal', 
                 'kazakhstan', 'korean', 'latvia', 'lecture', 'letters', 'liechtenstein', 'list', 'lithuania', 'luxembourg', 'macedonia', 
                 'malta', 'management', 'moldova', 'monaco', 'montenegro', 'moscow', 'national academy', 'netherlands', 'north', 'norway',
                 'note', 'notes', 'opinion', 'oxford', 'peerj', 'poland', 'portugal', 'proceeding', 'proceedings', 'reports', 'republic',
                 'research', 'review', 'reviews', 'revista', 'romania', 'russia', 'russian', 'saudi', 'scandinavian', 'science', 'sciences',
                 'serbia', 'serials', 'slovakia', 'slovenia', 'society', 'south', 'spain', 'spainish', 'studies', 'sweden', 'switzerland',
                 'targets', 'trabalhos', 'turkey', 'turkukraine', 'uk', 'ukrainian', 'united kingdom', 'universities', 'university', 'vakblad', 'west']

new_stopwords.extend(stopwords.stopwords(["en", "de", "fr", "la", "ru"])) # adding English, German, French, Latin, and Russian stopwords

# Counts list entries, so a word present in both sources is counted twice.
print('The total stopwords is',len(new_stopwords))

In [None]:
# Removing stopwords from the main categories lists:
# each field keeps its distinct title words minus every stopword.
lifescience_str = set(lifescience_list).difference(new_stopwords)
healthscience_str = set(healthscience_list).difference(new_stopwords)
physicalscience_str = set(physicalscience_list).difference(new_stopwords)
socialscience_str = set(socialscience_list).difference(new_stopwords)

In [None]:
# Extracting relevant keywords in each field into list
# (order: life, health, physical, social — the display cell relies on it)
text_list = [lifescience_str, healthscience_str, physicalscience_str, socialscience_str]

# YAKE settings: single-word English keywords, up to 1000 per field.
language = "en"
max_ngram_size = 1
deduplication_threshold = 0.5
numOfKeywords = 1000

# One extractor serves every field; the default-configured extractor that
# used to be built on each iteration was never used.
custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)

store_keywords = []
for field_words in text_list:
    # Sorting fixes the word order. Joining the set directly followed hash
    # order, which changes between kernel sessions, so the extracted keywords
    # were not reproducible.
    text = ' '.join(sorted(field_words))
    keywords = custom_kw_extractor.extract_keywords(text)
    # Each result is a (keyword, score) pair; only the keyword is kept.
    store_keywords.append([kw[0] for kw in keywords])

In [None]:
# Keyword table: one row per field, in the same order as `store_keywords`.
cat = ['Life Sciences', 'Health Sciences', 'Physical Sciences', 'Social Sciences']
cat_key_df = pd.DataFrame({'Categories': cat})

# Each field keeps its extracted keywords from position 9 onwards.
cat_key_df['Keyword'] = [store_keywords[position][9:] for position in range(4)]

cat_key_df

In [None]:
# Categorize journals and return a list of categories
def categorize_journal(title, cat_key_df):
    """
    Find every category whose keyword list has a keyword contained in the title.

    Matching is plain substring containment (``keyword in title``).

    :param title: the title of the journal
    :param cat_key_df: dataframe with a 'Categories' column and a 'Keyword'
        column holding one list of keywords per row

    :return: list of matching categories, in the row order of ``cat_key_df``
    """
    return [
        category
        for category, keywords in zip(cat_key_df['Categories'], cat_key_df['Keyword'])
        if any(keyword in title for keyword in keywords)
    ]

In [None]:
# Categorize the not-yet-classified journal titles using categorize_journal

def _yake_categories(title):
    """Return the ', '-joined keyword categories of one title, or NaN when none match."""
    # A missing title cannot be searched; leave it uncategorized.
    if not isinstance(title, str):
        return np.nan
    categories = categorize_journal(title, cat_key_df)
    return ', '.join(categories) if categories else np.nan

yake_categories = journalknownretraction_notcat['JournalandConferenceProceedings_lowercase'].map(_yake_categories)

# Write the whole column in one assignment. The previous chained form
# `df['MainCategory'].iloc[i] = ...` updates a temporary object and changes
# nothing when pandas Copy-on-Write is active. Titles without a keyword hit
# keep their current (missing) value.
journalknownretraction_notcat['MainCategory'] = yake_categories.fillna(journalknownretraction_notcat['MainCategory'])

journalknownretraction_notcat

In [None]:
# Filtering classified and unclassified journals after using YAKE approach

# Titles that now carry a category ...
journalknownretraction_cat2 = journalknownretraction_notcat.loc[
    journalknownretraction_notcat['MainCategory'].notna()
]

# ... and the ones still without one.
journalknownretraction_notcat2 = journalknownretraction_notcat.loc[
    journalknownretraction_notcat['MainCategory'].isna()
]

# (classified, unclassified, total) among the titles Scopus could not classify.
second_pass = get_resolved_journaltitle_count(journalknownretraction_notcat)

print(f'Categorized Round 2: The total number of second phase classified journal titles is {second_pass[0]}')
print(f'The total number of classified journal titles from first & second phase is {first_pass[0]+second_pass[0]}')
print(f'Uncategorized Round 2: The total number of remaining unclassified journal titles is {second_pass[1]}')

In [None]:
# Finding Numbers of DOIs matched with YAKE approach

# Titles classified in round 2, joined back to the unique-title table.
df_secondpass = journalknownretraction_unique.merge(
    journalknownretraction_cat2,
    how='inner',
    on='JournalandConferenceProceedings_lowercase',
)

# Union-list records whose lower-cased title is among those titles.
no_doi_secondpass = (
    knownretractionlist.loc[:, ['DOI', 'JournalandConferenceProceedings_lowercase']]
    .merge(
        df_secondpass.loc[:, ['JournalandConferenceProceedings_lowercase', 'MainCategory']],
        how='inner',
        on='JournalandConferenceProceedings_lowercase',
    )
    .shape[0]
)

print('The total number of DOIs matched with YAKE Approach is:' , no_doi_secondpass)
print(f'Which is {no_doi_secondpass/len(knownretractionlist)*100:.2f}% of the {len(knownretractionlist)} DOIs')

## Part 5: Manually generate keywords and match with the rest of the journals

In [None]:
# Keep only the part of each clean title that precedes the first '(' or '='.
nameclean = []
for title in journalknownretraction_notcat2['JournalandConferenceProceedings_clean']:
    if isinstance(title, str):
        # Chain the two cuts. Previously the second split restarted from the
        # full title, which discarded the '(' cut.
        title = title.split('(', 1)[0].split('=', 1)[0]
    # Missing titles are passed through unchanged instead of raising.
    nameclean.append(title)

# assign() returns a new frame, so the column is not written onto a filtered
# view of journalknownretraction_notcat and re-running the cell is safe.
journalknownretraction_notcat2 = journalknownretraction_notcat2.assign(name_clean=nameclean)

journalknownretraction_notcat2

In [None]:
# Manually curated keywords for the Life Sciences field.
# They are matched as substrings of the journal title (`item in title` in the
# categorization cell), so stems such as 'biolog' or 'molecul' also cover
# longer word forms.
lifescience_words = ['acids',
 'agric',
 'agronomy',
 'akuakultur',
 'anatomical',
 'anatomy',
 'aquaculture',
 'bacteriology',
 'biochemistry',
 'bioengineering',
 'bioethics',
 'bioinformatics',
 'biolog',
 'biological',
 'biology',
 'biomedicine',
 'biomolecular',
 'biophysics',
 'biorxiv',
 'biotechnology',
 'cell',
 'cells',
 'cellular',
 'chemical',
 'chemie',
 'chemistry',
 'clinic',
 'clínica',
 'crispr',
 'cytology',
 'dendrology',
 'dna',
 'ecology',
 'endocrinology',
 'entomologica',
 'entomology',
 'epidemiology',
 'evolution',
 'genetics',
 'genomic',
 'genomics',
 'histology',
 'immunology',
 'lipids',
 'medical',
 'medrxiv',
 'microbial',
 'microbiology',
 'microchemistry',
 'microchimica',
 'microrna',
 'microscopy',
 'molecul',
 'molecular',
 'mosquito',
 'nanomedicine',
 'nematology',
 'neurochemistry',
 'neurology',
 'neurophysiology',
 'neuroscience',
 'nicotine',
 'nucleosides',
 'nucleotides',
 'parasites',
 'pathology',
 'pharmaceutics',
 'pharmacology',
 'pharmocognosy',
 'physiology',
 'plant',
 'polyadenylation',
 'poultry',
 'protein',
 'tobacco',
 'toxicology',
 'virology',
 ]

# Manually curated keywords for the Health Sciences field.
# They are matched as substrings of the journal title (`item in title` in the
# categorization cell), so short entries such as 'age', 'eye', 'oral', 'pain'
# or 'std' also match inside longer words (e.g. 'age' in 'management').
# NOTE(review): 'revista' is also in the generic stopword list above; here it
#   tags every title containing it as a health journal — confirm this is intended.
healthscience_words= [ 'age',
 'aging',
 'aids',
 'anaesthesia',
 'anaesthesist',
 'anästhesiologie',
 'anatomy',
 'anesthesia',
 'anesthesiology',
 'arthritis',
 'biochemistry',
 'bioengineering',
 'bioethics',
 'biomedical',
 'biorxiv',
 'biotechnology',
 'blood',
 'bone',
 'cancer',
 'cardiac',
 'cardiological',
 'cardiologist',
 'cardiology',
 'cardiovascular',
 'chiropractic',
 'chirurg',
 'cirugía',
 'clinic',
 'clínica',
 'clinical',
 'counseling',
 'craniofacial',
 'dementia',
 'dental',
 'dentistry',
 'dermatology',
 'dermo-sifiliográficas',
 'dermo-sifiliographics',
 'diabetes',
 'diabetology',
 'digestive',
 'disease',
 'diseases',
 'drug',
 'drugs',
 'e-health',
 'endocrinology',
 'enfermería',
 'epidemiology',
 'epilepsy',
 'eye',
 'foot',
 'gastroenterology',
 'genetics',
 'geriatrics',
 'gerontologist',
 'gerontology',
 'gynaecologist',
 'gynécologie',
 'gynecology',
 'health',
 'heart',
 'hematology',
 'hemostasis',
 'hypertension',
 'imaging',
 'immunology',
 'infection',
 'infectious',
 'intervention',
 'kardiologe',
 'lancet',
 'leukemia',
 'liver',
 'lymphoma',
 'maxillofacial',
 'medical',
 'médicale',
 'medicine',
 'medrxiv',
 'metabolic',
 'metabolism',
 'microbiology',
 'molecular',
 'morbidity',
 'nefrología',
 'néphrologie',
 'néphrologology',
 'nephrology',
 'neuro',
 'neurology',
 'neurotology',
 'nicotine',
 'nurse',
 'nursing',
 'nutrition',
 'obesity',
 'obstetrician',
 'obstetrics',
 'occupational',
 'oncology',
 'onkologie',
 'ophthalmic',
 'ophthalmologe',
 'ophthalmology',
 'oral',
 'orthopäde',
 'orthopaedic',
 'orthopaedics',
 'orthopedist',
 'ortopediya',
 'osteoporosis',
 'otorhinolaryngology',
 'pain',
 'parasites',
 'pathology',
 'patient',
 'pediatría',
 'pediatric',
 'pediatrics',
 'pédiatrie',
 'pharmaceutical',
 'pharmacology',
 'pharmazie',
 'pharmocognosy',
 'physiology',
 'prosthodontics',
 'psychiatry',
 'psychoanalysis',
 'psychology',
 'psychonomic',
 'pulmonology',
 'radiology',
 'rehabilitación',
 'rehabilitation',
 'reproductive',
 'respiration',
 'respiratory',
 'retina',
 'reumatologia',
 'reumatología',
 'revista',
 'rheumatology',
 'roentgenology',
 'sclerosis',
 'seizure',
 'shoulder',
 'spine',
 'std',
 'surgeon',
 'surgery',
 'surgical',
 'thrombosis',
 'thyroid',
 'tobacco',
 'toxicology',
 'trauma',
 'urological',
 'urológicas',
 'urology',
 'vascular',
 'veterinar',
 'veterinary',
 'virology']


# Manually curated keywords for the Physical Sciences field.
# They are matched as substrings of the journal title (`item in title` in the
# categorization cell). The duplicated 'mathematics' entry was removed; every
# keyword now appears once.
physicalscience_words= ['acs',
 'actuators',
 'aeroacoustics',
 'aerodynamic',
 'aerospace',
 'akuakultur',
 'algebra',
 'antenna',
 'aquaculture',
 'astro',
 'astronomy',
 'atmospheric',
 'automation',
 'bifurcation',
 'bioengineering',
 'bioinformatics',
 'biomaterials',
 'biotechnology',
 'broadband',
 'catalysis',
 'chaos',
 'circuits',
 'computation',
 'computational',
 'computer',
 'computing',
 'crystal',
 'crystallography',
 'cyber',
 'dynamics',
 'earth',
 'educational technology',
 'edutainment',
 'e-government',
 'e-learning',
 'electrical',
 'electronics',
 'energy',
 'engineering',
 'engineers',
 'equations',
 'ergonomics',
 'fisika',
 'force',
 'geochemistry',
 'geometry',
 'geoscience',
 'ieee',
 'informatics',
 'internet',
 'linguistics',
 'linguística',
 'manufacturing',
 'matemática',
 'material',
 'mathematical',
 'mathematics',
 'metallurgy',
 'measurement',
 'mechanical',
 'mechanics',
 'microchimica',
 'microchemistry',
 'microelectronics',
 'microsystems',
 'nanotechnology',
 'navigation',
 'nuclear',
 'oberflächentechnik',
 'optic',
 'optical',
 'particles',
 'physics',
 'planetary',
 'plastic',
 'polymer',
 'robotic',
 'satellite',
 'sensing',
 'sensors',
 'software',
 'solar',
 'sound',
 'statistics',
 'steel',
 'superconductivity',
 'surface technology',
 'telecommunications',
 'thermo',
 'topology',
 'transportation',
 'waste',
 'waves',
 'wireless']

# Manually curated keywords for the Social Sciences field.
# They are matched as substrings of the journal title (`item in title` in the
# categorization cell), so stems such as 'geograph' or 'philosoph' cover
# longer word forms, and short entries such as 'age', 'art', 'law' or 'sex'
# also match inside unrelated longer words (e.g. 'art' in 'particles').
socialscience_words= ['accounting',
 'age',
 'aging',
 'anthropology',
 'archaeology',
 'architecture',
 'art',
 'behavioral',
 'bioethics',
 'business',
 'christian',
 'church',
 'cognition',
 'consumer',
 'crime',
 'criminology',
 'crisis',
 'cultural',
 'decision',
 'e-government',
 'e-learning',
 'econometric',
 'economic',
 'economics',
 'economy',
 'education',
 'educational',
 'educational technology',
 'ekonomi',
 'entrepreneurship',
 'environment',
 'ethics',
 'ethnography',
 'family',
 'finance',
 'financial',
 'forensic',
 'geograph',
 'governance',
 'history',
 'humanities',
 'identity',
 'interpreter',
 'islam',
 'jew',
 'jewish',
 'judge',
 'juridica',
 'juridical',
 'justice',
 'law',
 'learning',
 'legal',
 'librarian',
 'linguistic',
 'linguistics',
 'linguistik',
 'linguística',
 'marital',
 'market',
 'marketing',
 'media',
 'microeconomics',
 'mikroökonomik',
 'museum',
 'muslim',
 'naturalist',
 'pedagogy',
 'pedagogía',
 'pedagógika',
 'pedagógike',
 'personality',
 'philosoph',
 'philosophy',
 'police',
 'policy',
 'politic',
 'politics',
 'pravo',
 'psychoanalysis',
 'psychology',
 'punishment',
 'religion',
 'reorganisation',
 'school',
 'sex',
 'social',
 'society',
 'sociologies',
 'sociology',
 'sozialgeschichte',
 'sport',
 'sustainable',
 'taxes',
 'teaching',
 'tourism',
 'trade',
 'transportation',
 'wrestling']

In [None]:
# Manual keyword lists paired with their category label. The labels use the
# same plural spelling as the Scopus and YAKE rounds ('Health Sciences', ...).
# The singular labels written here before ('Health Science', 'Social Science',
# 'Life Science') made one field appear under two different names in the output.
manual_keyword_fields = [
    ('Physical Sciences', physicalscience_words),
    ('Health Sciences', healthscience_words),
    ('Social Sciences', socialscience_words),
    ('Life Sciences', lifescience_words),
]

clean_titles = journalknownretraction_notcat2['name_clean'].tolist()
cate2 = []

for title in clean_titles:
    # A category applies when any of its keywords is a substring of the title.
    # Missing (non-string) titles get no category instead of raising.
    cate = [
        label
        for label, words in manual_keyword_fields
        if isinstance(title, str) and any(item in title for item in words)
    ]
    cate2.append(cate)

# Join each title's labels with ', '; titles without a hit stay missing (NaN).
manual_categories = pd.Series(
    [', '.join(labels) for labels in cate2],
    index=journalknownretraction_notcat2.index,
    dtype=object,
).replace('', np.nan)

# assign() builds a new frame, so nothing is written onto a filtered view and
# re-running the cell gives the same result.
journalknownretraction_notcat2 = journalknownretraction_notcat2.assign(MainCategory=manual_categories)

journalknownretraction_notcat2

In [None]:
# Filtering classified and unclassified after using manually generate keyword approach

# A title counts as unclassified when its category is missing or empty.
still_unclassified = (
    journalknownretraction_notcat2['MainCategory'].isna()
    | journalknownretraction_notcat2['MainCategory'].eq('')
)

journalknownretraction_cat3 = journalknownretraction_notcat2[~still_unclassified]

journalknownretraction_notcat_last = journalknownretraction_notcat2[still_unclassified]

# (classified, unclassified, total) among the titles left after round 2.
third_pass = get_resolved_journaltitle_count(journalknownretraction_notcat2)

# The Round 3 line previously said "second phase".
print(f'Categorized Round 3: The total number of third phase classified journal titles is {third_pass[0]}')
print(f'The total number of classified journal titles from first, second & third phases is {first_pass[0]+second_pass[0]+third_pass[0]}')
print(f'Uncategorized Round 3: The total number of remaining unclassified journal titles is {third_pass[1]}')

In [None]:
# Finding Numbers of DOIs matched with manually curated list approach

# Titles classified in round 3, joined back to the unique-title table.
df_thirdpass = journalknownretraction_unique.merge(
    journalknownretraction_cat3,
    how='inner',
    on='JournalandConferenceProceedings_lowercase',
)

# Union-list records whose lower-cased title is among those titles.
no_doi_thirdpass = (
    knownretractionlist.loc[:, ['DOI', 'JournalandConferenceProceedings_lowercase']]
    .merge(
        df_thirdpass.loc[:, ['JournalandConferenceProceedings_lowercase', 'MainCategory']],
        how='inner',
        on='JournalandConferenceProceedings_lowercase',
    )
    .shape[0]
)

print('The total number of DOIs matched with manually curated list approach is:', no_doi_thirdpass)
print(f'Which is {no_doi_thirdpass/len(knownretractionlist)*100:.2f}% of the {len(knownretractionlist)} DOIs')

In [None]:
# Joining all the journal titles that were classified (journalknownretraction_cat1,
# journalknownretraction_cat2, journalknownretraction_cat3) with the remaining
# unclassified ones (journalknownretraction_notcat_last).
#
# Output: journal_cat
journalcategories = [
    journalknownretraction_cat1,
    journalknownretraction_cat2,
    journalknownretraction_cat3.iloc[:, :-1],   # all but the last column
    journalknownretraction_notcat_last,
]

# Stack the four pieces, discard the helper column, renumber the rows.
journal_cat = (
    pd.concat(journalcategories)
    .drop(columns=['name_clean'])
    .reset_index(drop=True)
)

assert len(journal_cat) == len(journalknownretraction_categories), 'Length of journal_cat should be equal to that of journalknownretraction_categories'
# The table will not show if the assertion above is false.

journal_cat\
#            .to_csv(data_dir+ 'journal/journalcategory.csv')

In [None]:
# Save journal categories to unionlist
#
# journal_cat holds one row per *cleaned* title and only the first lower-cased
# spelling seen for it. Merging on the lower-cased title therefore left every
# other spelling of the same journal without a category. The merge now goes
# through the cleaned title: lower-cased title -> cleaned title -> category.

# Every lower-cased spelling mapped to its cleaned (OpenRefine) title.
title_to_clean = (
    journalknownretraction_clean2[['JournalandConferenceProceedings_lowercase',
                                   'JournalandConferenceProceedings_clean']]
    .drop_duplicates(subset='JournalandConferenceProceedings_lowercase', keep='first')
)

# One category per cleaned title. Rows without a cleaned title are dropped so
# that missing keys are never matched with each other.
category_by_clean = (
    journal_cat[['JournalandConferenceProceedings_clean', 'MainCategory']]
    .dropna(subset=['JournalandConferenceProceedings_clean'])
    .drop_duplicates(subset='JournalandConferenceProceedings_clean', keep='first')
)

unionlist_updated = (
    knownretractionlist
    .merge(title_to_clean, on='JournalandConferenceProceedings_lowercase', how='left')
    .merge(category_by_clean, on='JournalandConferenceProceedings_clean', how='left')
    .drop(columns=['JournalandConferenceProceedings_lowercase', 'JournalandConferenceProceedings_clean'])
)

# Both lookup tables are unique on their key, so no record may be duplicated or lost.
assert len(unionlist_updated) == len(knownretractionlist), 'The merge must keep exactly one row per union list record'

unionlist_updated\
#         .to_csv(data_dir+f"unionlist/unionlist_completed_{getdate['unionlist']}.csv")
