## AIM

In this notebook, I checked the accuracy of the Google Scholar dataset. I found that most of them are pretty accurate. Only 124 of them are not accurate. 

In [1]:
import pandas as pd
import numpy as np
import re
from difflib import SequenceMatcher
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC 
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import NoSuchElementException 
from selenium.common.exceptions import ElementNotInteractableException
import os
import random
import re
import csv
import urllib.parse

In [2]:
df = pd.read_csv('../data/processed/gscholar_data.csv')
df.head()

Unnamed: 0,Original Title,Title on Google Scholar,DOI,Year,Citation Link,Citation Counts on Google Scholar
0,The Gender Divide in Wikipedia: Quantifying an...,The Gender Divide in Wikipedia: Quantifying an...,10.1093/joc/jqac004,2022,https://scholar.google.com/scholar?cites=73284...,2
1,Mapping Exposure Diversity: The Divergent Effe...,Mapping Exposure Diversity: The Divergent Effe...,10.1093/joc/jqac009,2022,https://scholar.google.com/scholar?cites=29888...,6
2,Democratic Consequences of Incidental Exposure...,Democratic consequences of incidental exposure...,10.1093/joc/jqac008,2022,https://scholar.google.com/scholar?cites=13540...,5
3,The Great and Powerful Dr. Oz? Alternative Hea...,The Great and Powerful Dr. Oz? Alternative Hea...,10.1093/joc/jqac011,2022,https://scholar.google.com/scholar?cites=12065...,3
4,A Methodological Framework for Analyzing the A...,A Methodological Framework for Analyzing the A...,10.1093/joc/jqac013,2022,https://scholar.google.com/scholar?q=related:e...,0


In [3]:
df.shape

(5718, 6)

In [4]:
# orig_t = df['Original Title'].tolist()
# gscholar_t = df['Title on Google Scholar'].tolist()

In [5]:
def clean_text(text):
    """
    Takes a string and returns a string
    """
    # remove html tags, lowercase, remove nonsense, remove non-letter
    title = text.replace('[CITATION]', '')
    title = title.replace('[PDF]', '')
    title = title.lower()
    title = re.sub(r'[^a-z]+', ' ', title)
    title = title.strip()
    return title

In [6]:
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [7]:
def get_similar_score(row):
    orig_t = row['orig_t_processed']
    gscholar_t = row['gscholar_t_processed']
    return similar(orig_t, gscholar_t)

In [8]:
df['orig_t_processed'] = [clean_text(i) for i in df['Original Title']]
df['gscholar_t_processed'] = [clean_text(i) for i in df['Title on Google Scholar']]

In [9]:
dff = df[df['orig_t_processed'] != df['gscholar_t_processed']]

In [10]:
dff.head(3)

Unnamed: 0,Original Title,Title on Google Scholar,DOI,Year,Citation Link,Citation Counts on Google Scholar,orig_t_processed,gscholar_t_processed
7,"#GirlGamers, Soldiers, and Public Relations: A...",Random optimization,10.1093/joc/jqab051,2022,https://scholar.google.com/scholar?cites=16237...,565,girlgamers soldiers and public relations analy...,random optimization
17,What Does #Freedom Look Like? Instagram and th...,[HTML] How does foreign direct investment affe...,10.1093/joc/jqab021,2021,https://scholar.google.com/scholar?cites=17920...,10280,what does freedom look like instagram and the ...,html how does foreign direct investment affect...
59,Intellectuals Debate #MeToo in China: Legitimi...,[BOOK] European stories: Intellectual debates ...,10.1093/joc/jqaa033,2020,https://scholar.google.com/scholar?cites=68538...,132,intellectuals debate metoo in china legitimizi...,book european stories intellectual debates on ...


In [11]:
dff.shape

(294, 8)

In [12]:
dff = dff.assign(simi_score = dff.apply(lambda row: get_similar_score(row), axis = 1))

In [19]:
dff[dff.simi_score <= 0.95].head()

Unnamed: 0,Original Title,Title on Google Scholar,DOI,Year,Citation Link,Citation Counts on Google Scholar,orig_t_processed,gscholar_t_processed,simi_score
7,"#GirlGamers, Soldiers, and Public Relations: A...",Random optimization,10.1093/joc/jqab051,2022,https://scholar.google.com/scholar?cites=16237...,565,girlgamers soldiers and public relations analy...,random optimization,0.216216
17,What Does #Freedom Look Like? Instagram and th...,[HTML] How does foreign direct investment affe...,10.1093/joc/jqab021,2021,https://scholar.google.com/scholar?cites=17920...,10280,what does freedom look like instagram and the ...,html how does foreign direct investment affect...,0.308824
59,Intellectuals Debate #MeToo in China: Legitimi...,[BOOK] European stories: Intellectual debates ...,10.1093/joc/jqaa033,2020,https://scholar.google.com/scholar?cites=68538...,132,intellectuals debate metoo in china legitimizi...,book european stories intellectual debates on ...,0.389474
71,"Past Debates, Fresh Impact on Nano-Enabled Foo...","Past debates, fresh impact on nano-enabled foo...",10.1093/joc/jqaa019,2020,https://scholar.google.com/scholar?cites=13485...,15,past debates fresh impact on nano enabled food...,past debates fresh impact on nano enabled food...,0.920245
90,"Toward an Aggregate, Implicit, and Dynamic Mod...","Toward an aggregate, implicit, and dynamic mod...",10.1093/joc/jqz033,2019,https://scholar.google.com/scholar?cites=26600...,13,toward an aggregate implicit and dynamic model...,toward an aggregate implicit and dynamic model...,0.901163


In [20]:
dff[dff.simi_score <= 0.95].shape

(123, 9)

In [27]:
# these are the 123 papers to scrape again
to_scrape_further = dff[dff.simi_score <= 0.95]

## Scrape these papers

In [39]:
papers = to_scrape_further['Original Title'].tolist()
dois = to_scrape_further['DOI'].tolist()
len(papers), papers[0], dois[0]

(123,
 '#GirlGamers, Soldiers, and Public Relations: Analyzing Gender Representation in U.S. Army Esports',
 '10.1093/joc/jqab051')

In [26]:
gscholar_dict_list = []

In [None]:
for paper in papers:
    idx = papers.index(paper)
    doi = dois[idx]
    query_string = 'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C50&q='
    driver.get(query_string + paper + '&btnG=')
    gs_paper_e = wait.until(EC.presence_of_element_located((
            By.CSS_SELECTOR, 'h3.gs_rt')))
    gs_paper_title = gs_paper_e.text
    gs_citation_e = wait.until(
        EC.presence_of_element_located((By.XPATH, '//div[@class="gs_fl"]//child::a[3]'
    )))
    citation_link = gs_citation_e.get_attribute('href')
    citation_count_string = gs_citation_e.get_attribute('innerHTML')
    if citation_count_string == "Related articles":
        gs_citation_count = 0
    else:
        gs_citation_count = int(re.findall(r'\d+', citation_count_string)[0])
    gscholar_dict = {
        'Original Title': orig_title,
        'Title on Google Scholar': gs_paper_title,
        'DOI': doi,
        'Citation Link': citation_link,
        'Citation Counts on Google Scholar': gs_citation_count,
    }
    gscholar_dict_list.append(gscholar_dict)
    print(f'paper {idx + 1} is done!')
    time.sleep(0.2+random.uniform(0,0.2))