In [5]:
import os
import random
import time

import pandas as pd
from googlesearch import search
from scholarly import scholarly

In [2]:
# Full dataset names; each one is combined with every entry of `topics`
# to form a search query. Commented-out entries are excluded from the run.
fer_datasets_LONG = [
    # "Extended Cohn-Kanade",
    # "MMI Facial Expression",
    # "Japanese Female Facial Expression",
    # "Toronto Face Database",
    # "Binghamton University 3D Facial Expression",
    # "Oulu-CASIA",
    # "Radboud Faces Database",
    # "Karolinska Directed Emotional Faces",
    "Acted Facial Expressions In The Wild",
    "Static Facial Expression in the Wild",
    "CMU Multi-PIE",
    "Affective Faces Database",
    "Expression in-the-Wild",
]


In [3]:
# Topic keywords appended to each dataset name to build a search query.
topics = [
    "facial expression recognition",
    "FER",
    "deep learning",
    "machine learning",
    "classification",
    "classifier",
    "neural network",
    "CNN",
]

In [6]:


# Metric keywords; each one becomes a boolean `Mentions_<Metric>` column.
_METRICS_TO_CHECK = (
    'accuracy', 'f1', 'precision', 'recall', 'auc', 'roc',
    'sensitivity', 'specificity', 'confusion matrix',
    'loss function', 'cross-entropy', 'mean squared error',
    'overfitting', 'underfitting', 'cross-validation',
    'training time', 'inference time', 'statistical significance',
    'p-value', 't-test', 'anova', 'correlation', 'regression',
    'baseline comparison',
    'mae', 'rmse',
)

# Terms that, if any appears in the abstract, set the `Mentions_Bias` column.
_BIAS_SYNONYMS = (
    'bias', 'biasness', 'fairness', 'unfairness', 'equity',
    'inequality', 'prejudice', 'discrimination', 'impartiality',
    'skewness', 'systematic error',
)


def _build_paper_info(paper, query):
    """Flatten one publication dict into a single CSV-ready row (URL left as None)."""
    bib = paper['bib']
    paper_info = {
        'Title': bib.get('title', 'No Title'),
        'Authors': bib.get('author', 'No Author'),
        'Year': bib.get('pub_year', None),
        'Cited By': paper['num_citations'],
        'Dataset': query,
        'Abstract': bib.get('abstract', ''),
        'DOI': bib.get('doi', 'No DOI'),
        'Journal': bib.get('venue', 'No Journal'),
        'URL': None,
    }

    abstract = (paper_info['Abstract'] or '').lower()

    # NOTE: this is plain substring matching, so short keywords also match
    # inside longer words (e.g. 'roc' is found in 'process'). Treat the flags
    # as a coarse first-pass filter, not as a precise classification.
    for metric in _METRICS_TO_CHECK:
        paper_info[f'Mentions_{metric.capitalize().replace(" ", "_")}'] = metric in abstract

    paper_info['Mentions_Bias'] = any(term in abstract for term in _BIAS_SYNONYMS)
    return paper_info


def _find_paper_url(title):
    """Return the first web-search hit for `title`, or None if the lookup fails or is empty."""
    try:
        for url in search(title, num_results=1):
            return url  # only the top result is wanted
    except Exception as e:
        print(f"Error finding URL for {title}: {e}")
    return None


def _append_paper_row(paper_info, save_file):
    """Append exactly one row to `save_file`, writing the header only for a new/empty file."""
    needs_header = not os.path.exists(save_file) or os.path.getsize(save_file) == 0
    pd.DataFrame([paper_info]).to_csv(save_file, mode='a', header=needs_header, index=False)


def search_scholarly(query, save_file, max_results=30, min_citations=100,
                     max_papers=20, delay_range=(30, 60)):
    """Search for publications matching `query` and collect the highly cited ones.

    Each accepted paper is appended to `save_file` as soon as it has been
    processed, so results gathered so far survive a later failure.

    Parameters
    ----------
    query : str
        Search string; also stored in the ``Dataset`` column of every row.
    save_file : str
        Path of the CSV file to append to (created if missing).
    max_results : int, default 30
        Maximum number of search results to pull from the result iterator.
    min_citations : int, default 100
        A paper is kept only if its ``num_citations`` is at least this value.
    max_papers : int, default 20
        Stop early once this many papers have been accepted.
    delay_range : tuple of (float, float), default (30, 60)
        Bounds, in seconds, of the random pause taken after every fetch attempt.

    Returns
    -------
    list of dict
        One dict per accepted paper, in the order they were found.

    Notes
    -----
    Errors raised while fetching or processing a single result are printed and
    that result is skipped. Errors raised by the initial call to
    ``scholarly.search_pubs`` are not caught here and propagate to the caller.
    """
    search_query = scholarly.search_pubs(query)
    papers = []

    for _ in range(max_results):
        try:
            paper = next(search_query)
            if 'num_citations' in paper and paper['num_citations'] >= min_citations:
                paper_info = _build_paper_info(paper, query)
                paper_info['URL'] = _find_paper_url(paper_info['Title'])
                papers.append(paper_info)

                # Write only the new row; re-writing the whole list in append
                # mode would duplicate every earlier paper in the CSV.
                _append_paper_row(paper_info, save_file)

                if len(papers) >= max_papers:
                    break
        except StopIteration:
            break
        except Exception as e:
            print(f"An error occurred: {e}")

        # Pause after every attempt, including failed ones, so that an error
        # is never followed by an immediate retry.
        time.sleep(random.uniform(*delay_range))

    return papers

def collect_data(fer_datasets_LONG, topics, save_file):
    """Run `search_scholarly` for every dataset/topic combination.

    Parameters
    ----------
    fer_datasets_LONG : iterable of str
        Dataset names; each forms the first part of a query.
    topics : iterable of str
        Topic keywords; each is appended to every dataset name.
    save_file : str
        Path passed through to `search_scholarly` for every query.

    Returns
    -------
    list
        The concatenated results of all `search_scholarly` calls, in query
        order (datasets in the outer loop, topics in the inner loop).
    """
    scholarly_results = []

    for dataset in fer_datasets_LONG:
        for topic in topics:
            combined_query = f"{dataset} {topic}"
            print(f"Searching for combination: {combined_query}")
            scholarly_results.extend(search_scholarly(combined_query, save_file))

    # Previously the accumulated list was built but never handed back.
    return scholarly_results

# CSV file that receives the collected paper rows.
save_file = 'scholarly_papers_combined_extended.csv'

# Run the search for every dataset/topic pair defined above.
collect_data(fer_datasets_LONG, topics, save_file=save_file)

print("Data collection completed.")


Searching for combination: Acted Facial Expressions In The Wild facial expression recognition
Searching for combination: Acted Facial Expressions In The Wild FER
Searching for combination: Acted Facial Expressions In The Wild deep learning
Searching for combination: Acted Facial Expressions In The Wild machine learning
Searching for combination: Acted Facial Expressions In The Wild classification
Searching for combination: Acted Facial Expressions In The Wild classifier
Searching for combination: Acted Facial Expressions In The Wild neural network
Searching for combination: Acted Facial Expressions In The Wild CNN
Searching for combination: Static Facial Expression in the Wild facial expression recognition
Searching for combination: Static Facial Expression in the Wild FER
Searching for combination: Static Facial Expression in the Wild deep learning
Searching for combination: Static Facial Expression in the Wild machine learning
Searching for combination: Static Facial Expression in th

MaxTriesExceededException: Cannot Fetch from Google Scholar.