In [2]:
import pandas as pd
import numpy as np
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import urllib
import requests
from PyPDF2 import PdfMerger
from urllib.parse import urlparse
from io import BytesIO
from enum import Enum
import re
from urllib.error import URLError, HTTPError  # Import these classes from urllib.error
import ssl

## Extract Link from 150 Good research paper

In [4]:
# Read in the csv file
## This is a csv file that teammates manually inspected as good research pape
good_df = pd.read_csv("../data/150_research_papers.csv")[['Link']]
good_df['label'] = 1

In [5]:
good_df.head()
# good_df.shape

Unnamed: 0,Link,label
0,https://www.science.org/doi/10.1126/science.ad...,1
1,https://www.nature.com/articles/s41566-019-0398-2,1
2,https://www.nature.com/articles/s41560-020-007...,1
3,https://www.science.org/doi/10.1126/science.ab...,1
4,https://www.nature.com/articles/s41467-021-236...,1


## Import bad paper

In [7]:
bad_df = pd.read_csv("../data/irrelevant_papers.txt")
bad_df.head()

Unnamed: 0,link
0,https://doi.org/10.1016/0021-9797(77)90150-3
1,https://www.sciencedirect.com/science/article/...
2,https://www.sciencedirect.com/science/article/...
3,https://doi.org/10.1107%2FS0021889813019535
4,https://www.sciencedirect.com/science/article/...


In [8]:
bad_df['label'] = 0
# sampled_bad_df.to_csv('bad_paper_small.csv', index=False)

In [9]:
bad_df

Unnamed: 0,link,label
0,https://doi.org/10.1016/0021-9797(77)90150-3,0
1,https://www.sciencedirect.com/science/article/...,0
2,https://www.sciencedirect.com/science/article/...,0
3,https://doi.org/10.1107%2FS0021889813019535,0
4,https://www.sciencedirect.com/science/article/...,0
...,...,...
310,https://www.sciencedirect.com/science/article/...,0
311,https://www.sciencedirect.com/science/article/...,0
312,https://www.sciencedirect.com/science/article/...,0
313,https://doi.org/10.1063/1.1404988,0


# Scraping Text

In [10]:
def get_text(soup):
    #Given a beautiful soup object, it will extact the text
    for script in soup(['script', 'style']):
        script.extract()
    text = soup.get_text(separator=' ')
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text

In [11]:
def create_txt_list(df, col_name):
    #Function that given dataframe and column name that stores the link, 
    #We will extract the text and store them into a list
    #This function returns a modified df where they add a new column "text" associated with an list. 
    text_lst = []
    page_count = 0
    for link in df[col_name]:
        page_count += 1
        print(f"Analyzing {page_count}th link")
        try:
            page = urlopen(link)
            print("Page accessed successfully.")
        except HTTPError as e:
            print(f"HTTP error occurred: {e.code} - {e.reason}")
            if e.code == 403 and e.reason == 'Forbidden':
                print("Found a forbidden access exception ")
                hdr = {'User-Agent': 'Mozilla/5.0'}
                try:
                    # Retrying with modified request headers
                    request = Request(link, headers=hdr)
                    page = urlopen(request)
                    print("Page accessed successfully with headers.")
                except HTTPError as retry_e:
                    print(f"Retry HTTP error occurred: {retry_e.code} - {retry_e.reason}")
                    text_lst.append("HTTP Error, No Access")
                    continue
                except URLError as retry_e:
                    print(f"Retry URL error occurred: {retry_e.reason}")
                    text_lst.append("URL Error, No Access")
                    continue
            else:
                print("NOT a forbidden access exception ")
                text_lst.append("HTTP Error, No Access")
                continue
        except URLError as e:
            print(f"URL error occurred: {e.reason}")
            text_lst.append("URL Error, No Access")
            continue
        except ssl.SSLError as e:
            print(f"SSL error occurred: {e}")
            text_lst.append("SSL Error, No Access")
            continue
        except ValueError as e:
            print(f"Value error (likely an invalid URL): {e}")
            text_lst.append("Invalid URL No Access")
            continue
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            text_lst.append("Unexpected Error No Access")
            continue
        html_content = page.read().decode("utf-8")
        soup = BeautifulSoup(html_content, "html.parser")
        text_lst.append(get_text(soup))
        print()
    df['text'] = text_lst
    return df

In [13]:
#Extract text from good text dataframe
bad_df_text = create_txt_list(bad_df, "link")

Analyzing 1th link
Page accessed successfully.

Analyzing 2th link
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden
Analyzing 3th link
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden
Analyzing 4th link
Page accessed successfully.

Analyzing 5th link
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden
Analyzing 6th link
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Page accessed successfully with headers.

Analyzing 7th link
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden
Analyzing 8th link
Page accessed successfully.

Analyzing 9th link
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden
Analyzing 10th link
Page accessed successfully.

In [14]:
bad_df_text.to_csv('../data/text_bad_paper.csv', index=False)

In [15]:
#Extract text from good dataframe
good_df_text = create_txt_list(good_df, "Link")

Analyzing 1th link
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden
Analyzing 2th link
Page accessed successfully.

Analyzing 3th link
Page accessed successfully.

Analyzing 4th link
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden
Analyzing 5th link
Page accessed successfully.

Analyzing 6th link
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden
Analyzing 7th link
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden
Analyzing 8th link
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden
Analyzing 9th link
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden
Analyzing 10th link
HTTP error occurred: 403 -

In [16]:
good_df_text.to_csv('../data/text_good_paper.csv', index=False)

## Head of good_df_text and bad_df_text

In [17]:
good_df_text

Unnamed: 0,Link,label,text
0,https://www.science.org/doi/10.1126/science.ad...,1,"HTTP Error, No Access"
1,https://www.nature.com/articles/s41566-019-0398-2,1,Surface passivation of perovskite film for eff...
2,https://www.nature.com/articles/s41560-020-007...,1,Intact 2D/3D halide junction perovskite solar ...
3,https://www.science.org/doi/10.1126/science.ab...,1,"HTTP Error, No Access"
4,https://www.nature.com/articles/s41467-021-236...,1,Multication perovskite 2D/3D interfaces form v...
...,...,...,...
146,https://doi.org/10.1002%2Fsmll.201803350,1,"HTTP Error, No Access"
147,https://doi.org/10.1038%2Fs41467-022-34203-x,1,Overcoming C60-induced interfacial recombinati...
148,https://doi.org/10.1038%2Fs41560-023-01249-0,1,Bifunctional hole-shuttle molecule for improve...
149,https://doi.org/10.1002%2Fadma.201907757,1,"HTTP Error, No Access"


In [18]:
bad_df_text

Unnamed: 0,link,label,text
0,https://doi.org/10.1016/0021-9797(77)90150-3,0,Redirecting
1,https://www.sciencedirect.com/science/article/...,0,"HTTP Error, No Access"
2,https://www.sciencedirect.com/science/article/...,0,"HTTP Error, No Access"
3,https://doi.org/10.1107%2FS0021889813019535,0,(IUCr) X-ray analysis of residual stress gradi...
4,https://www.sciencedirect.com/science/article/...,0,"HTTP Error, No Access"
...,...,...,...
310,https://www.sciencedirect.com/science/article/...,0,"HTTP Error, No Access"
311,https://www.sciencedirect.com/science/article/...,0,"HTTP Error, No Access"
312,https://www.sciencedirect.com/science/article/...,0,"HTTP Error, No Access"
313,https://doi.org/10.1063/1.1404988,0,The charge of glass and silica surfaces | The ...


## Filter out the non accessible texts

In [19]:
pass_bad_df = bad_df_text[(bad_df_text['text'] != "HTTP Error, No Access") & (bad_df_text['text'] != "Redirecting")]
pass_bad_df

Unnamed: 0,link,label,text
3,https://doi.org/10.1107%2FS0021889813019535,0,(IUCr) X-ray analysis of residual stress gradi...
5,https://doi.org/10.1063%2F1.4864778,0,Unusual defect physics in CH3NH3PbI3 perovskit...
9,https://doi.org/10.1039/D3TA00873H,0,Molecular interaction modulating Ruddlesden–Po...
12,https://doi.org/10.1039/C5CE01014D,0,Fundamental growth principles of colloidal met...
14,https://doi.org/10.1038%2Fnatrevmats.2018.17,0,Scalable fabrication of perovskite solar cells...
...,...,...,...
290,https://doi.org/10.1017/9781108394826.004,0,Methods of Colloidal Simulation (Chapter 3) - ...
292,https://doi.org/10.1146/annurev.ms.12.080182.0...,0,Transition Metal Oxide Gels and Colloids | Ann...
300,https://doi.org/10.1039/DC9786500007,0,"The First Rideal Lecture. Microemulsions, a fi..."
302,https://doi.org/10.1038%2Fnphoton.2012.285,0,Strain-engineered artificial atom as a broad-s...


In [20]:
pass_good_df = good_df_text[(good_df_text['text'] != "HTTP Error, No Access") & (good_df_text['text'] != "Redirecting")]
pass_good_df = pass_good_df.rename(columns={'Link': 'link'})
pass_good_df

Unnamed: 0,link,label,text
1,https://www.nature.com/articles/s41566-019-0398-2,1,Surface passivation of perovskite film for eff...
2,https://www.nature.com/articles/s41560-020-007...,1,Intact 2D/3D halide junction perovskite solar ...
4,https://www.nature.com/articles/s41467-021-236...,1,Multication perovskite 2D/3D interfaces form v...
10,https://doi.org/10.1038%2Fs41586-022-04604-5,1,Stability-limiting heterointerfaces of perovsk...
15,https://doi.org/10.1038%2Fs41467-022-30426-0,1,Imaging and quantifying non-radiative losses a...
...,...,...,...
141,https://doi.org/10.1038%2Fs41586-023-06637-w,1,Anion–π interactions suppress phase impurities...
145,https://doi.org/10.1039%2FD2EE00288D,1,Optimized carrier extraction at interfaces for...
147,https://doi.org/10.1038%2Fs41467-022-34203-x,1,Overcoming C60-induced interfacial recombinati...
148,https://doi.org/10.1038%2Fs41560-023-01249-0,1,Bifunctional hole-shuttle molecule for improve...


## Merge two df together

In [21]:
merged_df = pass_good_df.append(pass_bad_df, ignore_index=True)
merged_df

  merged_df = pass_good_df.append(pass_bad_df, ignore_index=True)


Unnamed: 0,link,label,text
0,https://www.nature.com/articles/s41566-019-0398-2,1,Surface passivation of perovskite film for eff...
1,https://www.nature.com/articles/s41560-020-007...,1,Intact 2D/3D halide junction perovskite solar ...
2,https://www.nature.com/articles/s41467-021-236...,1,Multication perovskite 2D/3D interfaces form v...
3,https://doi.org/10.1038%2Fs41586-022-04604-5,1,Stability-limiting heterointerfaces of perovsk...
4,https://doi.org/10.1038%2Fs41467-022-30426-0,1,Imaging and quantifying non-radiative losses a...
...,...,...,...
127,https://doi.org/10.1017/9781108394826.004,0,Methods of Colloidal Simulation (Chapter 3) - ...
128,https://doi.org/10.1146/annurev.ms.12.080182.0...,0,Transition Metal Oxide Gels and Colloids | Ann...
129,https://doi.org/10.1039/DC9786500007,0,"The First Rideal Lecture. Microemulsions, a fi..."
130,https://doi.org/10.1038%2Fnphoton.2012.285,0,Strain-engineered artificial atom as a broad-s...


In [22]:
merged_df.to_csv('merged_label.csv', index=False)

## Making PDFs of each paper

In [23]:
def get_base_url(url):
    parsed = urlparse(url)
    return f"{parsed.scheme}://{parsed.netloc}"


In [30]:
def get_page(url):
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')

    r = urllib.request.urlopen(req)
    return r

In [None]:
def get_actual_url(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'}
    r = requests.get(url, headers=headers, allow_redirects=True)
    parsed_uri = urlparse(r.url)
    return f'{parsed_uri.scheme}://{parsed_uri.netloc}/'

In [None]:
def reset_eof(pdf_content):
    # find the line position of the EOF
    EOF_MARKER = b'%%EOF'
    if EOF_MARKER in pdf_content:
        # we can remove the early %%EOF and put it at the end of the file
        pdf_content = pdf_content.replace(EOF_MARKER, b'')
        pdf_content = pdf_content + EOF_MARKER
    else:
        # Some files really don't have an EOF marker
        # printed b'\n%%EO%E'
        pdf_content = pdf_content[:-6] + EOF_MARKER
    return pdf_content

In [None]:
def download_all_pdfs(url, paper_index):
    page = get_page(url)
    url = get_actual_url(url)
    print(url)
    soup = BeautifulSoup(page.read().decode("utf-8"), 'html.parser')
    pdf_links = [link.get('href') for link in soup.find_all('a', href=True) \
                if link.get('href') and link.get('href').endswith('.pdf')]
    pdf_links = list(set(pdf_links))
    pdf_files = []
    i = 0
    merger = PdfMerger()
    for pdf_link in pdf_links:
        pdf_url = pdf_link if pdf_link.startswith('http') else \
            get_base_url(url) + pdf_link
        res = requests.get(pdf_url)
        print(pdf_url)
        merger.append(BytesIO(reset_eof(res.content)))
        # pdf_name = f"data/pdfs/{paper_index}_{i}.pdf"
        # with open(pdf_name, 'wb') as f:
        #     f.write(res.content)
        # pdf_files.append(pdf_name)
        i += 1
    # for pdf in pdf_files:
    #     merger.append(pdf)
    with open(f'../data/pdfs/{paper_index}.pdf', 'wb') as merged:
        merger.write(merged)
    merger.close()


In [56]:
for index, row in merged_df.iterrows():
    if index < 3: 
        continue
    print(row['link'])
    download_all_pdfs(row['link'], index)

https://doi.org/10.1038%2Fs41586-022-04604-5
https://www.nature.com/
<http.client.HTTPResponse object at 0x000001E4791374F0>


Previous trailer can not be read ("'NumberObject' object is not subscriptable",)
Object ID 387,0 ref repaired
Object 378 0 found
Object 379 0 found
Object ID 390,0 ref repaired
Object 1 0 found
Object 3 0 found
Object 5 0 found
Object 7 0 found
Object 9 0 found
Object 11 0 found
Object 13 0 found
Object 15 0 found
Object 17 0 found
Object 380 0 found
Object 19 0 found
Object 21 0 found
Object 23 0 found
Object 25 0 found
Object 34 0 found
Object 381 0 found
Object 36 0 found
Object 38 0 found
Object 56 0 found
Object 59 0 found
Object 126 0 found
Object 382 0 found
Object 143 0 found
Object 171 0 found
Object 179 0 found
Object 197 0 found
Object 228 0 found


https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-022-04604-5/MediaObjects/41586_2022_4604_MOESM1_ESM.pdf


Object 383 0 found
Object 254 0 found
Object 269 0 found
Object 280 0 found
Object 285 0 found
Object 287 0 found
Object 369 0 found
Object 370 0 found
Object ID 389,0 ref repaired
Object 377 0 found
Object 371 0 found
Previous trailer can not be read ("'NumberObject' object is not subscriptable",)
Object ID 356,0 ref repaired


https://www.nature.com/articles/s41586-022-04604-5.pdf


Object 344 0 found
Object 345 0 found
Object ID 359,0 ref repaired
Object 1 0 found
Object 56 0 found
Object 73 0 found
Object 94 0 found
Object 346 0 found
Object 125 0 found
Object 136 0 found
Object 151 0 found
Object 158 0 found
Object 163 0 found
Object 347 0 found
Object 168 0 found
Object 173 0 found
Object 178 0 found
Object 183 0 found
Object 188 0 found
Object 193 0 found
Object 198 0 found
Object 203 0 found
Object ID 475,0 ref repaired
Object ID 476,0 ref repaired
Object ID 358,0 ref repaired
Object 339 0 found
Object 340 0 found
Object 341 0 found
Object ID 477,0 ref repaired
Object ID 478,0 ref repaired
Object ID 519,0 ref repaired
Object ID 517,0 ref repaired
Object ID 518,0 ref repaired
Object ID 515,0 ref repaired
Object ID 516,0 ref repaired
Object ID 513,0 ref repaired
Object ID 514,0 ref repaired
Object ID 511,0 ref repaired
Object ID 512,0 ref repaired
Object ID 509,0 ref repaired
Object ID 510,0 ref repaired
Object ID 507,0 ref repaired
Object ID 508,0 ref repaire

https://doi.org/10.1038%2Fs41467-022-30426-0
https://www.nature.com/
<http.client.HTTPResponse object at 0x000001E4787D2EF0>
https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-022-30426-0/MediaObjects/41467_2022_30426_MOESM2_ESM.pdf
https://www.nature.com/articles/s41467-022-30426-0.pdf
https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-022-30426-0/MediaObjects/41467_2022_30426_MOESM1_ESM.pdf
https://doi.org/10.1039%2FD0TA10535J


ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))