## Step 1.5 : Journal Field Information
- Part 1: Import Scopus Journal List to get upper-level field category for journals
- Part 2: Identify journals from the Known Retraction List that are not listed in the Scopus Journal List
- Part 3: Use YAKE to find keywords in each field and match them with the journals
- Part 4: Manually generate keywords and match with the rest of the journals
- Part 5: Append the result of each step to the original list

In [None]:
# !pip install yake
# !pip install openpyxl

In [1]:
import yake
import pandas as pd
import numpy as np
import os
import re
from datetime import date, datetime as dt

In [None]:
from pathlib import Path

# The results of the next two expressions are not stored; they only display
# output when run interactively.
path = Path('\\.')
path.parent.absolute()
os.listdir()
# Placeholders to be replaced by the user. File names are appended to
# box_path_1 by plain string concatenation in the cells below, so the value
# should end with a path separator.
box_path_1= 'INPUT_YOUR_DATA_DIRECTORY'
box_path_2 = 'INPUT_YOUR_RESULT_DIRECTORY'

### Part 1: Import Scopus Journal List to get upper-level field category for journals

In [None]:
# Reading Scopus journal classification. See latest version: https://www.elsevier.com/?a=91122
# sheet_name=None loads every worksheet into a dict keyed by sheet name.
# Errors are deliberately allowed to propagate: silently ignoring a failed read
# would leave `scopus_journal_data` undefined and surface later as an unrelated
# NameError in the next cell.
scopus_journal_data = pd.read_excel(box_path_1+'extlistJune2023.xlsx',sheet_name=None)

In [None]:
#scopus_journal_sheet.columns

In [None]:
# Names of all sheets that were loaded; the first sheet is used as the journal list.
scopus_journal_data_sheets_names = list(scopus_journal_data.keys())
scopus_journal_sheet = scopus_journal_data[scopus_journal_data_sheets_names[0]]

# First entry is the title column; the remaining entries are the subject-area
# columns that are joined into 'MainCategory' in the next cell.
# The embedded '\n' characters are part of the column names being selected.
selected_columns=['Source Title (Medline-sourced journals are indicated in Green)',
                  'Top level:\n\nLife Sciences','Top level:\n\nSocial Sciences',
                  'Top level:\n\nPhysical Sciences','Top level:\n\nHealth Sciences',
                  '1000 \nGeneral']

# Keep only the selected columns and turn empty strings into NaN so that they
# are skipped by the pd.notna filter used when the areas are joined.
scopus_journal_filtered = scopus_journal_sheet.filter(items = selected_columns).replace('',np.nan)
scopus_journal_filtered

In [None]:
# Build 'MainCategory' for every journal title by joining the non-missing cell
# values found in the subject-area columns (everything after the title column).
subject_area_columns = selected_columns[1:]


def _join_subject_areas(row):
    """Return the row's non-missing subject-area cells joined with commas."""
    present = [row[name] for name in subject_area_columns if pd.notna(row[name])]
    return ','.join(present)


scopus_journal_filtered['MainCategory'] = scopus_journal_filtered.apply(_join_subject_areas, axis=1)

# Give the long Scopus title column the name shared by the rest of the notebook.
title_column = selected_columns[0]
scopus_journal_filtered = scopus_journal_filtered.rename(
    columns={title_column: 'JournalandConferenceProceedings'}
)

# Only the title and its joined categories are needed downstream.
# (Optional exports of intermediate CSV files were disabled in the original notebook.)
wanted_columns = ['JournalandConferenceProceedings', 'MainCategory']
scopus_journal_part = scopus_journal_filtered[wanted_columns]
scopus_journal_part

In [None]:
# Reading Scopus conference categories 
# The sheet at position 3 is used and only its second and last columns are kept
# (presumably the source title and its ASJC codes - confirm against the workbook).
conference_sheet = scopus_journal_data[scopus_journal_data_sheets_names[3]].iloc[:,[1,-1]]
# Rename to the short column names used by the rest of the notebook.
conference_sheet = conference_sheet.rename(columns={'All Science Journal Classification Codes (ASJC)':'ASJC',
                                                   'Source Title': 'JournalandConferenceProceedings'})
conference_sheet

In [None]:
# Main subject area for each two-digit ASJC code prefix.
# The table is written grouped by subject area and then inverted into the
# prefix -> area dictionary used by map_asjc_to_category.
_ASJC_PREFIXES_BY_AREA = {
    'General': (10,),
    'Life Sciences': (11, 13, 24, 28, 30),
    'Social Sciences': (12, 14, 18, 20, 32, 33),
    'Physical Sciences': (15, 16, 17, 19, 21, 22, 23, 25, 26, 31),
    'Health Sciences': (27, 29, 34, 35, 36),
}

# Sorted so the keys run from 10 to 36 in ascending order.
lookup = dict(sorted(
    (prefix, area)
    for area, prefixes in _ASJC_PREFIXES_BY_AREA.items()
    for prefix in prefixes
))

In [None]:
def map_asjc_to_category(asjc_value, lookup_table=None):
    """
    Translate a ';'-separated string of ASJC codes into main subject areas.

    :param asjc_value: the ASJC codes of a journal or conference proceeding,
        e.g. ``'2700; 1300;'``. Missing values (None/NaN) and empty strings
        yield an empty result; other non-string values are converted with str().
    :param lookup_table: optional mapping from the two-digit code prefix (int)
        to the subject-area name. Defaults to the notebook-level ``lookup``.
    :return: the distinct subject areas joined by ',' in order of first
        appearance ('' when there are no codes).
    :raises KeyError: if a code prefix is not present in the lookup table.
    :raises ValueError: if a code does not start with two digits.
    """
    if lookup_table is None:
        lookup_table = lookup

    # NaN is the only value that is not equal to itself.
    if asjc_value is None or asjc_value != asjc_value:
        return ''

    # A dict is used as an ordered set: a plain set would make the order of the
    # joined categories differ from one interpreter run to the next.
    categories = {}
    for code in str(asjc_value).split(';'):
        code = code.strip()
        if not code:
            continue
        # The first two digits of an ASJC code identify its subject area.
        categories[lookup_table[int(code[:2])]] = None

    return ','.join(categories)

In [None]:

# Missing ASJC cells become empty strings so that map_asjc_to_category can
# call string methods on every value.
conference_sheet['ASJC']=conference_sheet['ASJC'].fillna("")
# conference_sheet['ASJC']= conference_sheet['ASJC'].apply(str)
# Derive the main subject areas from the ASJC codes of each conference.
conference_sheet['MainCategory'] = conference_sheet['ASJC'].apply(map_asjc_to_category)
# Same two-column layout as scopus_journal_part so the frames can be concatenated.
conference_sheet_part= conference_sheet[['JournalandConferenceProceedings','MainCategory']]
conference_sheet_part

In [None]:
#scopus_journal_part[scopus_journal_part['MainCategory'].str.contains('General')]

In [None]:
# Appending Both the Journal titles & Conferences from Scopus
# Rows are stacked as-is; the original row labels of both frames are kept.

journalscopus = pd.concat([scopus_journal_part,conference_sheet_part])
journalscopus
# Export step (disabled); a later cell reads a file with this name back from box_path_1.
#journalscopus.to_csv(box_path_1+'scopus_journalconferencecategory.csv' )

In [None]:
len(journalscopus)

In [None]:
# Input Scopus Journal List
# Reloads the combined list from CSV (replacing the in-memory frame of the same
# name) and drops the 'Unnamed: 0' column.
journalscopus = pd.read_csv(box_path_1 + 'scopus_journalconferencecategory.csv').drop(['Unnamed: 0'], axis=1)

# Lower-cased, stripped title used as the merge key with the retraction list.
journalscopus['JournalandConferenceProceedings_lowercase'] = journalscopus['JournalandConferenceProceedings'].str.lower().str.strip()
# Keep the first row for each key so that a later left merge on the key cannot
# multiply rows.
journalscopus_deduplicated = journalscopus.drop_duplicates(subset='JournalandConferenceProceedings_lowercase')
journalscopus_deduplicated

In [None]:
journalscopus[~journalscopus['MainCategory'].isna()]

### Part 2: Cleaning the Journal Title's Names in the KnownRetraction List

In [None]:
# Load the known retraction list, dropping the 'Unnamed: 0' column and the
# existing 'MainCategory' column (a new one is derived in this notebook).
knownretractionlist= pd.read_csv(box_path_1 +'2023-04-13-knownretractionlist.csv').drop(['Unnamed: 0','MainCategory'],axis=1)
# Same normalisation (lower-case + strip) as applied to the Scopus titles.
knownretractionlist['JournalandConferenceProceedings_lowercase'] = knownretractionlist['Journal'].str.lower().str.strip()

# Two-column working frame: raw title (renamed to the shared column name) and merge key.
journalknownretraction_= knownretractionlist[['Journal','JournalandConferenceProceedings_lowercase']].rename(columns={'Journal': 'JournalandConferenceProceedings'})

journalknownretraction_


In [None]:
def clean_journal_title(df_: pd.DataFrame) -> pd.DataFrame:
    """
    Add a normalised copy of the journal title to a copy of the dataframe.

    The input is not modified. Missing titles stay missing instead of raising,
    because every step goes through the ``.str`` accessor.

    :param df_: dataframe with a 'JournalandConferenceProceedings' column
    :return: a deep copy of ``df_`` with an extra
        'JournalandConferenceProceedings_clean' column
    """
    import re  # local import keeps the function self-contained

    # Compiled patterns make pandas use Python's Unicode-aware `re` engine, so
    # accented letters are treated as word characters.
    leading_the = re.compile(r'^the ')
    ordinal = re.compile(r'\b\d+(?:st|nd|rd|th)\b')
    punctuation_or_digit = re.compile(r'[^\w\s]|\d')
    whitespace_run = re.compile(r'\s+')

    df = df_.copy(deep=True)
    cleaned = df['JournalandConferenceProceedings'].str.strip().str.lower()

    # remove '&amp' and '&' (literal matches, not regular expressions)
    cleaned = cleaned.str.replace('&amp', '', regex=False).str.replace('&', '', regex=False)

    # remove 'the' if it starts a journal title (titles are already lower-cased)
    cleaned = cleaned.str.replace(leading_the, '', regex=True)

    # remove ordinals such as 1st, 2nd, 3rd, 4th from the journal titles
    cleaned = cleaned.str.replace(ordinal, '', regex=True)

    # remove other digits and punctuation from the journal titles
    cleaned = cleaned.str.replace(punctuation_or_digit, '', regex=True)

    # collapse runs of whitespace left behind by the removals above
    cleaned = cleaned.str.replace(whitespace_run, ' ', regex=True).str.strip()

    df['JournalandConferenceProceedings_clean'] = cleaned
    return df

In [None]:
# Cleaning the Journal Titles 
journalknownretraction_clean = clean_journal_title(journalknownretraction_)

#journalknownretraction_clean.groupby('JournalandConferenceProceedings_clean')['JournalandConferenceProceedings'].count().reset_index()
# Number of distinct cleaned titles (a missing cleaned title counts as one value).
after_cleaning = len(journalknownretraction_clean[['JournalandConferenceProceedings_clean']].drop_duplicates())

print(f'The total number of journal titles in the knownretraction list is {after_cleaning} after cleaning')
journalknownretraction_clean

In [None]:
# Export the journalknownretraction_clean file for OpenRefine further cleaning
#journalknownretraction_clean.to_csv(box_path_1+'2023-08-08-journalcategory-knownretractionlist_updated.csv')

In [None]:
# Reload the title list from the OpenRefine CSV and drop its 'Column' column.
# NOTE(review): presumably this file is the export above after manual
# clustering in OpenRefine - confirm.
journalknownretraction_clean2 = pd.read_csv(box_path_1+ '2023-08-08-STI_journalcategory_openrefined_cleaned.csv').drop('Column', axis=1)

# Number of distinct cleaned titles; used as the denominator for the
# percentages printed in later cells.
after_cleaning2 = len(journalknownretraction_clean2[['JournalandConferenceProceedings_clean']].drop_duplicates())

print(f'The total number of journal titles in the knownretraction list is {after_cleaning2} after cleaning with OpenRefine')
journalknownretraction_clean2

### Part 2: Identify Journal Field Category from the Scopus Journal List

In [None]:
def get_resolved_journaltitle_count(df: pd.DataFrame) -> tuple:
    """
    Count distinct journal titles by classification status.

    Titles are taken from 'JournalandConferenceProceedings_clean'; a title is
    classified when its 'MainCategory' is not missing. A title that has both
    classified and unclassified rows is counted in both groups. A missing
    title counts as one distinct value.

    :param df: dataframe with 'JournalandConferenceProceedings_clean' and
        'MainCategory' columns (not modified)
    :return: tuple (classified titles, unclassified titles, all titles)
    """
    titles = df['JournalandConferenceProceedings_clean']
    is_classified = df['MainCategory'].notna()

    # dropna=False matches drop_duplicates(), which keeps one NaN entry.
    return (
        int(titles[is_classified].nunique(dropna=False)),
        int(titles[~is_classified].nunique(dropna=False)),
        int(titles.nunique(dropna=False)),
    )

In [None]:

# Use 'JournalandConferenceProceedings' to merge with Scopus journal list
journalknownretraction_clean2['JournalandConferenceProceedings_lowercase'] = journalknownretraction_clean2['JournalandConferenceProceedings'].str.lower().str.strip()

# Left merge keeps every retraction-list title; titles without a Scopus match
# get a missing 'MainCategory'. The first Scopus column is left out so it does
# not collide with the title column already present on the left.
journalknownretraction= pd.merge(journalknownretraction_clean2,journalscopus_deduplicated.iloc[:,1:], on='JournalandConferenceProceedings_lowercase', how='left')

print(get_resolved_journaltitle_count(journalknownretraction))

# Propagate categories between title variants: rows that share the same
# 'JournalandConferenceProceedings_clean' value but did not match Scopus through
# their own 'JournalandConferenceProceedings_lowercase' key inherit the category
# of a variant that did match.
# Series.map needs a uniquely indexed mapping, so keep one (the first)
# category per cleaned title.
mapping = (
    journalknownretraction
    .dropna(subset=['MainCategory'])
    .drop_duplicates(subset='JournalandConferenceProceedings_clean')
    .set_index('JournalandConferenceProceedings_clean')['MainCategory']
)

# Fill in missing values in column using the mapping
journalknownretraction['MainCategory'] = journalknownretraction['MainCategory'].fillna(journalknownretraction['JournalandConferenceProceedings_clean'].map(mapping))


#journalknownretraction[journalknownretraction['MainCategory'].isna()][['JournalandConferenceProceedings_clean']].drop_duplicates()

In [None]:
print(f'Of {after_cleaning2} total number of journal titles in the knownretractionlist')
# first_pass = (classified titles, unclassified titles, all titles) after the Scopus merge.
first_pass = get_resolved_journaltitle_count(journalknownretraction)
print(f'The total no of classified journal titles with Scopus Journal is {first_pass[0]} and {first_pass[1]} remaining unclassified')

# Share of distinct cleaned titles classified in the first pass.
print(f'That is {(first_pass[0]/after_cleaning2)*100}% classified journal')

In [None]:
"""
Finding Numbers of DOIs matched with Scopus classification
"""


# Inner merge: only titles whose lower-cased key exists in the Scopus list remain.
df_firstpass = pd.merge(journalknownretraction_clean2,journalscopus_deduplicated.iloc[:,1:], on='JournalandConferenceProceedings_lowercase', how='inner')

# The count is the number of rows produced by the inner merge.
# NOTE(review): if df_firstpass holds the same key more than once, a DOI is
# counted once per repetition - confirm the keys are unique.
no_doi_firstpass=\
    len(pd.merge(knownretractionlist[['DOI','JournalandConferenceProceedings_lowercase']],df_firstpass[['JournalandConferenceProceedings_lowercase','MainCategory']],
                             on= 'JournalandConferenceProceedings_lowercase', how='inner'))

print('The total number of DOIs matched in Scopus journal list is:' , no_doi_firstpass)
print(f'Which is {round(no_doi_firstpass/len(knownretractionlist)*100,2)}% of the {len(knownretractionlist)} DOIs')

## Part 2: Identify journals from the Known Retraction List that are not listed in the Scopus Journal List

In [None]:
# Identify Journals from the known retraction list that are categorized
#journalknownretraction_cat = journalknownretraction.merge(journalscopus, left_on= 'JournalandConferenceProceedings_lowercase', right_on= 'JournalandConferenceProceedings_lowercase', how='left')

# Strip surrounding whitespace from the category strings (missing values stay missing).
journalknownretraction.MainCategory = journalknownretraction.MainCategory.str.strip()
# Deep copies so that later edits do not touch journalknownretraction itself.
journalknownretraction_cat = journalknownretraction[~journalknownretraction['MainCategory'].isnull()].copy(deep=True)
#print(journalknownretraction_cat.info())

# Identify Journals from the known retraction list that are not categorized to any field
journalknownretraction_notcat = journalknownretraction[journalknownretraction['MainCategory'].isnull()].copy(deep=True)
# journalknownretraction_notcat.MainCategory= journalknownretraction_cat.MainCategory.str.strip()

# #print(journalknownretraction_notcat.info())
# first_pass[1] is the number of distinct cleaned titles that are still unclassified.
print('Count of Journal that are not categorized: ', first_pass[1])
print(f'Percentage of Journals that are not categoriezed: {round(int(first_pass[1])/int(after_cleaning2)*100, 2)}%')

## Part 3: Use YAKE to find keywords in each field and match them with the journals

In [None]:
# For each main subject area, collect the cleaned titles of the journals that
# are already categorised in that area and break them into single words.
def _area_title_tokens(area_label):
    """Return (titles, titles joined by spaces, list of words) for one area."""
    in_area = journalknownretraction_cat['MainCategory'].str.contains(area_label)
    titles = journalknownretraction_cat[in_area]['JournalandConferenceProceedings_clean']
    joined = " ".join(titles)
    # Punctuation is removed before the text is split on whitespace.
    words = re.sub(r'[^\w\s]', '', joined).split()
    return titles, joined, words


lifescience, lifescience_, lifescience_list = _area_title_tokens('Life Science')
healthscience, healthscience_, healthscience_list = _area_title_tokens('Health Science')
physicalscience, physicalscience_, physicalscience_list = _area_title_tokens('Physical Science')
socialscience, socialscience_, socialscience_list = _area_title_tokens('Social Science')


In [None]:
#!pip install stopwordsiso
import stopwordsiso as stopwords

# stop words list of all languages in the ISO-639 standard was used to process the titles. 
# https://github.com/stopwords-iso/stopwords-iso  stopwords.stopwords("en") stopwords.stopwords.lang()

In [None]:
# Create list of stopwords
# Hand-picked words (mostly publication-type, country and nationality terms)
# that should not be used as subject keywords.
# NOTE(review): the next cell removes stopwords by comparing them with single
# words, so multi-word entries ('national academy', 'united kingdom') cannot
# match there - confirm whether they are needed.
# NOTE(review): 'spainish' and 'turkukraine' look like typos (possibly
# 'spanish' and 'turkey'/'ukraine') - confirm before changing, since editing
# the list alters the extracted keywords.
new_stopwords = ['&', '&amp', 'acta', 'africa', 'african', 'albania', 'albanian', 'america', 'american', 'and', 
                 'andorra', 'applied', 'archives', 'armenia', 'asian', 'asian-australasian', 'association', 'australasian', 
                 'austria', 'austrian', 'azerbaijan', 'belarus', 'belgium', 'bmc', 'bosnia', 'brazil', 'brazilian', 'british', 
                 'bulgaria', 'bulgarian', 'bulletin', 'cadernos', 'canadian', 'china', 'chinese', 'communication', 'communications', 
                 'conference', 'croatia', 'croatian', 'current', 'cyprus', 'czech', 'denmark', 'dynamic', 'dynamics', 'east', 
                 'elife', 'estonia', 'europe', 'european', 'experimental', 'f1000research', 'finland', 'france', 'georgia', 
                 'german', 'germany', 'greece', 'herzegovina', 'hungary', 'iceland', 'india', 'indian', 'indonesia', 'indonesian', 
                 'international', 'iran', 'iranian', 'ireland', 'italian', 'italy', 'jama', 'japan', 'japanese', 'journal', 'jurnal', 
                 'kazakhstan', 'korean', 'latvia', 'lecture', 'letters', 'liechtenstein', 'list', 'lithuania', 'luxembourg', 'macedonia', 
                 'malta', 'management', 'moldova', 'monaco', 'montenegro', 'moscow', 'national academy', 'netherlands', 'north', 'norway',
                 'note', 'notes', 'opinion', 'oxford', 'peerj', 'poland', 'portugal', 'proceeding', 'proceedings', 'reports', 'republic',
                 'research', 'review', 'reviews', 'revista', 'romania', 'russia', 'russian', 'saudi', 'scandinavian', 'science', 'sciences',
                 'serbia', 'serials', 'slovakia', 'slovenia', 'society', 'south', 'spain', 'spainish', 'studies', 'sweden', 'switzerland',
                 'targets', 'trabalhos', 'turkey', 'turkukraine', 'uk', 'ukrainian', 'united kingdom', 'universities', 'university', 'vakblad', 'west']


#"de", "id", "zh"
# Adds the stopwordsiso lists requested with the codes "en", "de", "fr", "la" and "ru".
new_stopwords.extend(stopwords.stopwords(["en","de","fr", "la","ru"]))
# new_stopwords.extend(stopwords.stopwords("fr")) # adding French stopwords
# new_stopwords.extend(stopwords.stopwords("de")) # adding German stopwords
# Add stopwords from stopwordsiso #stopwords.langs()  stopwords.stopwords("en")
# The list may contain duplicates; it is converted to a set before use.
print('The total stopwords is ',len(new_stopwords))

In [None]:
# Remove the stopwords from the word collection of each main category.
# Each collection becomes a set of distinct words, and the stopwords are taken
# out with a single set difference per category.
stopword_set = set(new_stopwords)

lifescience_str = set(lifescience_list) - stopword_set
healthscience_str = set(healthscience_list) - stopword_set
physicalscience_str = set(physicalscience_list) - stopword_set
socialscience_str = set(socialscience_list) - stopword_set

In [None]:
# Extract the relevant keywords of each field with YAKE.
# Order of text_list: life, health, physical, social sciences; store_keywords
# follows the same order.
text_list = [lifescience_str, healthscience_str, physicalscience_str, socialscience_str]

# Extractor settings: single-word keywords, at most 1000 per field.
language = "en"
max_ngram_size = 1
deduplication_threshold = 0.5
numOfKeywords = 1000
# The configuration is the same for every field, so one extractor is built
# up front instead of one per iteration.
custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)

store_keywords = []
for field_words in text_list:
    # Iteration order of a set of strings changes between interpreter runs;
    # sorting makes the text handed to YAKE, and therefore the extracted
    # keywords, reproducible.
    text = ' '.join(sorted(field_words))
    keywords = custom_kw_extractor.extract_keywords(text)
    # Keep the first element of every result (the keyword) and drop the rest.
    store_keywords.append([kw[0] for kw in keywords])

In [None]:
# Create a dataframe to display the keywords of each field
# The label order must match the order in which the keyword lists were
# appended to store_keywords (life, health, physical, social).
cat = ['Life Sciences', 'Health Sciences', 'Physical Sciences', 'Social Sciences']
cat_key_df = pd.DataFrame()

cat_key_df['Categories'] = cat

#lifescience_keywords,healthscience_keywords,physicalscience_keywords,socialscience_keywords = store_keywords
# Each cell of 'Keyword' holds the whole keyword list of that category.
cat_key_df['Keyword'] = [store_keywords[0], store_keywords[1], store_keywords[2], store_keywords[3]]

cat_key_df


In [None]:
# Create a function to categorize journals and return a list of categories
def categorize_journal(title, cat_key_df):
    """
    Return the categories whose keywords occur in a journal title.

    Matching is a plain substring test, so a keyword also matches inside a
    longer word of the title.

    :param title: the title of the journal; a non-string value (e.g. NaN for a
        missing title) matches nothing
    :param cat_key_df: dataframe with a 'Categories' column (category name) and
        a 'Keyword' column (iterable of keywords for that category)

    :return: list of matching category names, in the row order of cat_key_df
    """
    if not isinstance(title, str):
        return []

    return [
        category
        for category, keywords in zip(cat_key_df['Categories'], cat_key_df['Keyword'])
        if any(keyword in title for keyword in keywords)
    ]


In [None]:
# Classify the not yet classified journal titles with the YAKE keywords.
def _yake_categories(title):
    """Return the matched categories joined by ', ' ('' when nothing matches)."""
    if not isinstance(title, str):
        # Missing titles cannot be matched against keywords.
        return ''
    return ', '.join(categorize_journal(title, cat_key_df))


yake_matches = journalknownretraction_notcat['JournalandConferenceProceedings_lowercase'].apply(_yake_categories)

# Assign the whole column at once instead of writing through
# df['MainCategory'].iloc[i]: that chained form operates on a temporary object
# and is not guaranteed to update the dataframe. Rows without a match keep
# their current (missing) value.
journalknownretraction_notcat['MainCategory'] = yake_matches.where(
    yake_matches != '', journalknownretraction_notcat['MainCategory']
)

journalknownretraction_notcat

In [None]:
"""
Filtering classified and unclassified after using YAKE approach
"""

# Titles that received a category from the YAKE keywords.
journalknownretraction_cat2 = journalknownretraction_notcat[~journalknownretraction_notcat['MainCategory'].isna()]
# print(f'Categoried 2: The total number of second classified journal titles is {len(journalknownretraction_cat2)}')



# Titles still without a category; they are processed with the manual keyword lists.
journalknownretraction_notcat2 = journalknownretraction_notcat[journalknownretraction_notcat['MainCategory'].isna()]
# print(f'Uncategoried 2: The total number of remaining unclassified journal titles from \
# journalknownretraction_notcat2 is {len(journalknownretraction_notcat2)}')

# second_pass = (classified, unclassified, all) distinct cleaned titles among
# those left over from the first pass.
second_pass = get_resolved_journaltitle_count(journalknownretraction_notcat)
print(f'Categoried 2: The total number of second phase classified journal titles is {second_pass[0]}')
print(f'The total number of classified journal titles from first & second phase is {first_pass[0]+second_pass[0]}')
print(f'Uncategoried 2: The total number of remaining unclassified journal titles is {second_pass[1]}')

In [None]:
"""
Finding Numbers of DOIs matched with YAKE approach
"""


# NOTE(review): iloc[:,1:4] selects columns by position; the merge needs
# 'JournalandConferenceProceedings_lowercase' (and, below, 'MainCategory') to
# be among them - confirm against the column order of journalknownretraction_cat2.
df_secondpass = pd.merge(journalknownretraction_clean2,journalknownretraction_cat2.iloc[:,1:4], on='JournalandConferenceProceedings_lowercase', how='inner')

# The count is the number of rows produced by the inner merge on the lower-cased title.
no_doi_secondpass=\
    len(pd.merge(knownretractionlist[['DOI','JournalandConferenceProceedings_lowercase']],df_secondpass[['JournalandConferenceProceedings_lowercase','MainCategory']],
                             on= 'JournalandConferenceProceedings_lowercase', how='inner'))

print('The total number of DOIs matched with YAKE Approach is:' , no_doi_secondpass)
print(f'Which is {round(no_doi_secondpass/len(knownretractionlist)*100,2)}% of the {len(knownretractionlist)} DOIs')

## Part 4: Manually generate keywords and match with the rest of the journals

In [None]:
# Work on an independent copy: journalknownretraction_notcat2 was produced by
# filtering another dataframe, and adding a column to such a slice is unreliable.
journalknownretraction_notcat2 = journalknownretraction_notcat2.copy()

# Keep only the part of the title before the first '(' and before the first
# '='. The two cuts are chained; previously the second split started again
# from the full title and discarded the result of the first.
nameclean = [
    title.split('(', 1)[0].split('=', 1)[0]
    for title in journalknownretraction_notcat2['JournalandConferenceProceedings_clean']
]

journalknownretraction_notcat2['name_clean'] = nameclean

journalknownretraction_notcat2

In [None]:
# Manually curated keywords (including word stems) that mark a title as Life Sciences.
# Ten entries per row, in the original order.
lifescience_words = [
    'acids', 'agric', 'agronomy', 'akuakultur', 'anatomical', 'anatomy', 'aquaculture', 'bacteriology', 'biochemistry', 'bioengineering',
    'bioethics', 'bioinformatics', 'biolog', 'biological', 'biology', 'biomedicine', 'biomolecular', 'biophysics', 'biorxiv', 'biotechnology',
    'cell', 'cells', 'cellular', 'chemical', 'chemie', 'chemistry', 'clinic', 'clínica', 'crispr', 'cytology',
    'dendrology', 'dna', 'ecology', 'endocrinology', 'entomologica', 'entomology', 'epidemiology', 'evolution', 'genetics', 'genomic',
    'genomics', 'histology', 'immunology', 'lipids', 'medical', 'medrxiv', 'microbial', 'microbiology', 'microchemistry', 'microchimica',
    'microrna', 'microscopy', 'molecul', 'molecular', 'mosquito', 'nanomedicine', 'nematology', 'neurochemistry', 'neurology', 'neurophysiology',
    'neuroscience', 'nicotine', 'nucleosides', 'nucleotides', 'parasites', 'pathology', 'pharmaceutics', 'pharmacology', 'pharmocognosy', 'physiology',
    'plant', 'polyadenylation', 'poultry', 'protein', 'tobacco', 'toxicology', 'virology',
]

# Manually curated keywords (including word stems) that mark a title as Health Sciences.
# Ten entries per row, in the original order.
healthscience_words = [
    'age', 'aging', 'aids', 'anaesthesia', 'anaesthesist', 'anästhesiologie', 'anatomy', 'anesthesia', 'anesthesiology', 'arthritis',
    'biochemistry', 'bioengineering', 'bioethics', 'biomedical', 'biorxiv', 'biotechnology', 'blood', 'bone', 'cancer', 'cardiac',
    'cardiological', 'cardiologist', 'cardiology', 'cardiovascular', 'chiropractic', 'chirurg', 'cirugía', 'clinic', 'clínica', 'clinical',
    'counseling', 'craniofacial', 'dementia', 'dental', 'dentistry', 'dermatology', 'dermo-sifiliográficas', 'dermo-sifiliographics', 'diabetes', 'diabetology',
    'digestive', 'disease', 'diseases', 'drug', 'drugs', 'e-health', 'endocrinology', 'enfermería', 'epidemiology', 'epilepsy',
    'eye', 'foot', 'gastroenterology', 'genetics', 'geriatrics', 'gerontologist', 'gerontology', 'gynaecologist', 'gynécologie', 'gynecology',
    'health', 'heart', 'hematology', 'hemostasis', 'hypertension', 'imaging', 'immunology', 'infection', 'infectious', 'intervention',
    'kardiologe', 'lancet', 'leukemia', 'liver', 'lymphoma', 'maxillofacial', 'medical', 'médicale', 'medicine', 'medrxiv',
    'metabolic', 'metabolism', 'microbiology', 'molecular', 'morbidity', 'nefrología', 'néphrologie', 'néphrologology', 'nephrology', 'neuro',
    'neurology', 'neurotology', 'nicotine', 'nurse', 'nursing', 'nutrition', 'obesity', 'obstetrician', 'obstetrics', 'occupational',
    'oncology', 'onkologie', 'ophthalmic', 'ophthalmologe', 'ophthalmology', 'oral', 'orthopäde', 'orthopaedic', 'orthopaedics', 'orthopedist',
    'ortopediya', 'osteoporosis', 'otorhinolaryngology', 'pain', 'parasites', 'pathology', 'patient', 'pediatría', 'pediatric', 'pediatrics',
    'pédiatrie', 'pharmaceutical', 'pharmacology', 'pharmazie', 'pharmocognosy', 'physiology', 'prosthodontics', 'psychiatry', 'psychoanalysis', 'psychology',
    'psychonomic', 'pulmonology', 'radiology', 'rehabilitación', 'rehabilitation', 'reproductive', 'respiration', 'respiratory', 'retina', 'reumatologia',
    'reumatología', 'revista', 'rheumatology', 'roentgenology', 'sclerosis', 'seizure', 'shoulder', 'spine', 'std', 'surgeon',
    'surgery', 'surgical', 'thrombosis', 'thyroid', 'tobacco', 'toxicology', 'trauma', 'urological', 'urológicas', 'urology',
    'vascular', 'veterinar', 'veterinary', 'virology',
]


# Manually curated keywords (including word stems) that mark a title as Physical Sciences.
# Ten entries per row, in the original order ('mathematics' appears twice, as before).
physicalscience_words = [
    'acs', 'actuators', 'aeroacoustics', 'aerodynamic', 'aerospace', 'akuakultur', 'algebra', 'antenna', 'aquaculture', 'astro',
    'astronomy', 'atmospheric', 'automation', 'bifurcation', 'bioengineering', 'bioinformatics', 'biomaterials', 'biotechnology', 'broadband', 'catalysis',
    'chaos', 'circuits', 'computation', 'computational', 'computer', 'computing', 'crystal', 'crystallography', 'cyber', 'dynamics',
    'earth', 'educational technology', 'edutainment', 'e-government', 'e-learning', 'electrical', 'electronics', 'energy', 'engineering', 'engineers',
    'equations', 'ergonomics', 'fisika', 'force', 'geochemistry', 'geometry', 'geoscience', 'ieee', 'informatics', 'internet',
    'linguistics', 'linguística', 'manufacturing', 'matemática', 'material', 'mathematical', 'mathematics', 'mathematics', 'metallurgy', 'measurement',
    'mechanical', 'mechanics', 'microchimica', 'microchemistry', 'microelectronics', 'microsystems', 'nanotechnology', 'navigation', 'nuclear', 'oberflächentechnik',
    'optic', 'optical', 'particles', 'physics', 'planetary', 'plastic', 'polymer', 'robotic', 'satellite', 'sensing',
    'sensors', 'software', 'solar', 'sound', 'statistics', 'steel', 'superconductivity', 'surface technology', 'telecommunications', 'thermo',
    'topology', 'transportation', 'waste', 'waves', 'wireless',
]

# Manually curated keywords (including word stems) that mark a title as Social Sciences.
# Ten entries per row, in the original order.
socialscience_words = [
    'accounting', 'age', 'aging', 'anthropology', 'archaeology', 'architecture', 'art', 'behavioral', 'bioethics', 'business',
    'christian', 'church', 'cognition', 'consumer', 'crime', 'criminology', 'crisis', 'cultural', 'decision', 'e-government',
    'e-learning', 'econometric', 'economic', 'economics', 'economy', 'education', 'educational', 'educational technology', 'ekonomi', 'entrepreneurship',
    'environment', 'ethics', 'ethnography', 'family', 'finance', 'financial', 'forensic', 'geograph', 'governance', 'history',
    'humanities', 'identity', 'interpreter', 'islam', 'jew', 'jewish', 'judge', 'juridica', 'juridical', 'justice',
    'law', 'learning', 'legal', 'librarian', 'linguistic', 'linguistics', 'linguistik', 'linguística', 'marital', 'market',
    'marketing', 'media', 'microeconomics', 'mikroökonomik', 'museum', 'muslim', 'naturalist', 'pedagogy', 'pedagogía', 'pedagógika',
    'pedagógike', 'personality', 'philosoph', 'philosophy', 'police', 'policy', 'politic', 'politics', 'pravo', 'psychoanalysis',
    'psychology', 'punishment', 'religion', 'reorganisation', 'school', 'sex', 'social', 'society', 'sociologies', 'sociology',
    'sozialgeschichte', 'sport', 'sustainable', 'taxes', 'teaching', 'tourism', 'trade', 'transportation', 'wrestling',
]

In [None]:
# Classify the remaining titles with the manually curated keyword lists.
# Labels use the plural spelling ('Health Sciences', ...) used for every other
# category value in this notebook; the singular forms previously written here
# produced a second spelling of the same category in the final output.
# The tuple order fixes the order of the labels inside a combined category.
manual_keywords = (
    ('Physical Sciences', physicalscience_words),
    ('Health Sciences', healthscience_words),
    ('Social Sciences', socialscience_words),
    ('Life Sciences', lifescience_words),
)


def _manual_categories(title):
    """Return the matched labels joined by ', ', or NaN when nothing matches."""
    matched = [label for label, words in manual_keywords
               if any(word in title for word in words)]
    return ', '.join(matched) if matched else np.nan


# Independent copy so the column assignment does not act on a filtered slice.
journalknownretraction_notcat2 = journalknownretraction_notcat2.copy()

clean_titles = journalknownretraction_notcat2['name_clean'].tolist()

# NOTE(review): 'name_clean' comes from titles whose punctuation was removed,
# so hyphenated keywords such as 'e-health' probably never match - confirm.
journalknownretraction_notcat2['MainCategory'] = [_manual_categories(title) for title in clean_titles]
journalknownretraction_notcat2

In [None]:
"""
Filtering classified and unclassified after using Manually generate keyword approach
"""

# Unmatched titles carry NaN in 'MainCategory' (empty strings were replaced by
# NaN in the previous cell). Comparing with '' alone therefore treated every
# row as classified; both representations are checked here.
unclassified_mask = (
    journalknownretraction_notcat2['MainCategory'].isna()
    | (journalknownretraction_notcat2['MainCategory'] == '')
)

# Titles classified by the manual keyword lists.
journalknownretraction_cat3 = journalknownretraction_notcat2[~unclassified_mask]

# Titles that remain unclassified after all three passes.
journalknownretraction_notcat_last = journalknownretraction_notcat2[unclassified_mask]

# third_pass = (classified, unclassified, all) distinct cleaned titles among
# those left over from the second pass.
third_pass = get_resolved_journaltitle_count(journalknownretraction_notcat2)
print(f'Categoried 3: The total number of third phase classified journal titles is {third_pass[0]}')
print(f'The total number of classified journal titles from first, second & third phases is {first_pass[0]+second_pass[0]+third_pass[0]}')
print(f'Uncategoried 3: The total number of remaining unclassified journal titles is {third_pass[1]}')

# journalknownretraction_notcat_last

In [None]:
"""
Finding Numbers of DOIs matched with Manually curated list approach
"""


# NOTE(review): iloc[:,1:4] selects columns by position; the merge needs
# 'JournalandConferenceProceedings_lowercase' (and, below, 'MainCategory') to
# be among them - confirm against the column order of journalknownretraction_cat3.
df_thirdpass = pd.merge(journalknownretraction_clean2,journalknownretraction_cat3.iloc[:,1:4], on='JournalandConferenceProceedings_lowercase', how='inner')

# The count is the number of rows produced by the inner merge on the lower-cased title.
no_doi_thirdpass=\
    len(pd.merge(knownretractionlist[['DOI','JournalandConferenceProceedings_lowercase']],df_thirdpass[['JournalandConferenceProceedings_lowercase','MainCategory']],
                             on= 'JournalandConferenceProceedings_lowercase', how='inner'))

print('The total number of DOIs matched with Manually curated list Approach is:' , no_doi_thirdpass)
print(f'Which is {round(no_doi_thirdpass/len(knownretractionlist)*100,2)}% of the {len(knownretractionlist)} DOIs')

In [None]:

# Combine the titles classified in each pass (journalknownretraction_cat,
# journalknownretraction_cat2, journalknownretraction_cat3) with the titles
# that remain unclassified (journalknownretraction_notcat_last).
#
# Output: journal_cat
journalcategories = [
    journalknownretraction_cat,
    journalknownretraction_cat2,
    journalknownretraction_cat3,
    journalknownretraction_notcat_last,
]

# Stack all parts in order and renumber the rows from zero.
journal_cat = pd.concat(journalcategories).reset_index(drop=True)

# Every row of the merged list must end up in exactly one of the four parts.
assert len(journal_cat)== len(journalknownretraction), 'Length of journal_cat should be equal to that of journalknownretraction '

# Helper columns are not part of the exported result.
helper_columns = ['JournalandConferenceProceedings_lowercase', 'name_clean']
journal_cat = journal_cat.drop(columns=helper_columns)

# Export step (disabled): box_path_1 + '2023-09-03_journalcategory_updated.csv'
journal_cat

In [None]:
# Saving uncategorized journal list
# Rows whose 'MainCategory' is missing; the export call itself is commented
# out, so this only displays the rows.
journal_cat[journal_cat['MainCategory'].isna()]\
    #.to_csv(box_path_1 + '2023-09-03_journalcategory_notcategorized.csv')

In [None]:
# Attach the derived 'MainCategory' to every record of the known retraction list.
merge_key = 'JournalandConferenceProceedings_lowercase'

# The merge key was dropped from journal_cat when the frame was finalised, so
# it is rebuilt here from the raw title using the same normalisation
# (lower-case + strip) applied to knownretractionlist.
journal_cat_lookup = journal_cat.copy()
if merge_key not in journal_cat_lookup.columns:
    journal_cat_lookup[merge_key] = journal_cat_lookup['JournalandConferenceProceedings'].str.lower().str.strip()

# Select the needed columns by name rather than by position, and keep one row
# per key so the left merge cannot duplicate retraction records.
journal_cat_lookup = journal_cat_lookup[[merge_key, 'MainCategory']].drop_duplicates(subset=merge_key)

knownretractionlist_update = pd.merge(knownretractionlist, journal_cat_lookup, on=merge_key, how='left')

# Export step (disabled): box_path_1 + '2023-09-03_journalcategory_knownretractionlist_updated.csv'
knownretractionlist_update