In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re

In [2]:
# Short names / acronyms of the facial-expression datasets of interest.
# NOTE(review): no cell visible in this notebook reads this list; the matching
# code uses fer_datasets_LONG instead.
fer_datasets = [
    "CK+",
    "MMI",
    "JAFFE",
    "TFD",
    "FER-2013",
    "AFEW7.0",
    "SFEW2.0",
    "Multi-PIE",
    "BU-3DFE",
    "Oulu-CASIA",
    "RaFD",
    "KDEF",
    "EmotioNet",
    "RAF-DB",
    "AffectNet",
    "ExpW",
]


In [3]:
# Full dataset names searched for (case-insensitively) by extract_dataset.
# Order matters: extract_dataset returns the first entry found in the text,
# so earlier names take priority when several occur together.
fer_datasets_LONG = [
    "Extended Cohn-Kanade",
    "MMI Facial Expression",
    "Japanese Female Facial Expression",
    "Toronto Face Database",
    "Binghamton University 3D Facial Expression",
    "Oulu-CASIA",
    "Radboud Faces Database",
    "Karolinska Directed Emotional Faces",
    "Acted Facial Expressions In The Wild",
    "Static Facial Expression in the Wild",
    "CMU Multi-PIE",
    "Affective Faces Database",
    "Expression in-the-Wild",
    "Facial Expression Recognition 2013",
]


In [4]:
# Topic keywords looked up by extract_topic. The first entry that matches is
# returned, so the order of this list sets the priority between topics.
topics = [
    "facial expression recognition",
    "FER",
    "deep learning",
    "machine learning",
    "classification",
    "classifier",
    "neural network",
    "CNN",
]

In [5]:
# File paths of the five scraped CSV exports that are combined below
file_path_1 = 'scholarly_papers_combined_extended.csv'
file_path_2 = 'scholarly_papers_combined_extended_2.csv'
file_path_3 = 'scholarly_papers_combined_extended_3.csv'
file_path_4 = 'scholarly_papers_combined_extended_4.csv'
file_path_5 = "scholarly_papers_combined_extended_FER2013.csv"
file_paths = [file_path_1, file_path_2, file_path_3, file_path_4, file_path_5]

# Read each CSV into its own DataFrame, in the same order as the paths above
df1, df2, df3, df4, df5 = (pd.read_csv(path) for path in file_paths)
frames = [df1, df2, df3, df4, df5]

# Report the (rows, columns) shape of every input frame
for number, frame in enumerate(frames, start=1):
    print(f"Shape of df{number}: {frame.shape}")

# Stack all inputs row-wise; ignore_index=True gives the result a fresh 0..n-1 index
merged_df = pd.concat(frames, ignore_index=True)

# Report the shape of the combined frame
print(f"Shape of merged_df: {merged_df.shape}")

Shape of df1: (1482, 36)
Shape of df2: (1789, 36)
Shape of df3: (2890, 36)
Shape of df4: (1066, 36)
Shape of df5: (2157, 36)
Shape of merged_df: (9384, 36)


In [6]:
# Look up which known dataset name (if any) a text value mentions
def extract_dataset(row):
    """Return the first name from fer_datasets_LONG found in ``row``.

    ``row`` is converted with ``str()`` and compared case-insensitively using
    plain substring containment. Names are tried in list order and only the
    first hit is returned; ``None`` is returned when nothing matches.
    """
    haystack = str(row).lower()
    hits = (name for name in fer_datasets_LONG if name.lower() in haystack)
    return next(hits, None)

# Look up which topic keyword (if any) a text value mentions
def extract_topic(row, topic_list=None):
    """Return the first topic keyword that occurs in ``row`` as a whole word.

    The previous implementation used a case-insensitive substring test, so the
    acronym "FER" (lower-cased to "fer") also matched inside ordinary words
    such as "different", "transfer" or "inference" and, being early in the
    list, pre-empted later topics. Matching is now anchored so a keyword only
    counts when it is not embedded in a longer word; a trailing plural "s" is
    still accepted (e.g. "CNNs", "neural networks").

    Parameters
    ----------
    row : any
        Value to search; it is converted with ``str()`` first, so a missing
        value (NaN / None) simply yields no match.
    topic_list : iterable of str, optional
        Keywords to try, in priority order. Defaults to the module-level
        ``topics`` list.

    Returns
    -------
    str or None
        The first keyword (as spelled in the list) that matches, else None.
    """
    candidates = topics if topic_list is None else topic_list
    text = str(row)
    for topic in candidates:
        # Look-arounds instead of \b so keywords that start or end with a
        # non-word character would still be anchored correctly.
        pattern = r'(?<!\w)' + re.escape(topic) + r's?(?!\w)'
        if re.search(pattern, text, flags=re.IGNORECASE):
            return topic
    return None

# Derive both detection columns from the 'Dataset' text, then discard that column.
# NOTE(review): topic detection also reads 'Dataset', although the comment on
# extract_topic mentions 'Abstract'/'Title' — confirm which column is intended.
dataset_text = merged_df['Dataset']
merged_df = (
    merged_df
    .assign(
        Detected_Dataset=dataset_text.apply(extract_dataset),
        Detected_Topic=dataset_text.apply(extract_topic),
    )
    .drop(columns=['Dataset'])
)

In [7]:
# Collect rows that exactly repeat an earlier row across all columns
is_repeat = merged_df.duplicated()
duplicate_rows = merged_df.loc[is_repeat]
duplicate_rows.shape

(8486, 37)

In [8]:
# Work on a copy so the per-title aggregation in this cell does not modify merged_df
df = merged_df.copy()

# Aggregator: collapse a group's values into one string of distinct entries
def merge_unique_entries(entries):
    """Join the distinct non-missing values of ``entries`` with ', '.

    Missing values (None / NaN, as judged by ``pd.notna``) are skipped,
    duplicates are removed, and the remaining values are sorted before being
    joined. An input with no usable values yields an empty string.
    """
    distinct = set()
    for entry in entries:
        if pd.notna(entry):
            distinct.add(entry)
    ordered = sorted(distinct)
    return ', '.join(ordered)

# Collapse all rows that share a 'Title' into a single record.

# Boolean flag columns: a title is flagged if any of its rows is flagged
mention_columns = [
    'Mentions_Accuracy',
    'Mentions_F1',
    'Mentions_Precision',
    'Mentions_Recall',
    'Mentions_Auc',
    'Mentions_Roc',
    'Mentions_Sensitivity',
    'Mentions_Specificity',
    'Mentions_Confusion_matrix',
    'Mentions_Loss_function',
    'Mentions_Cross-entropy',
    'Mentions_Mean_squared_error',
    'Mentions_Overfitting',
    'Mentions_Underfitting',
    'Mentions_Cross-validation',
    'Mentions_Training_time',
    'Mentions_Inference_time',
    'Mentions_Statistical_significance',
    'Mentions_P-value',
    'Mentions_T-test',
    'Mentions_Anova',
    'Mentions_Correlation',
    'Mentions_Regression',
    'Mentions_Baseline_comparison',
    'Mentions_Mae',
    'Mentions_Rmse',
    'Mentions_Bias',
]

# Per-column aggregation rules; the key order here fixes the output column order
aggregations = {
    # Bibliographic fields: take the group's 'first' value
    'Authors': 'first',
    'Year': 'first',
    'Cited By': 'first',
    # Detection results: union of the distinct values seen for the title
    'Detected_Dataset': merge_unique_entries,
    'Detected_Topic': merge_unique_entries,
    'Abstract': 'first',
    'DOI': 'first',
    'Journal': 'first',
    'URL': 'first',
}
aggregations.update({column: 'any' for column in mention_columns})

grouped_by_title = df.groupby('Title')
df_clean = grouped_by_title.agg(aggregations).reset_index()

# Compare the size of the de-duplicated frame with the merged input
print(df_clean.shape, merged_df.shape)

# Per-column count of missing values (computed but not displayed in this cell)
missing_values = df_clean.isnull().sum()


(263, 37) (9384, 37)


In [9]:
# Write the per-title table to a CSV one directory above the working directory.
# NOTE(review): to_csv is called without index=..., so the DataFrame index is
# written as an extra leading column — confirm downstream readers expect it.
df_clean.to_csv('../Scrapes_ALL.csv')