In [1]:
import pandas as pd
import numpy as np
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from enum import Enum
import re
from urllib.error import URLError, HTTPError  # Import these classes from urllib.error
import ssl

## Randomly selecting 12 papers from the 150 good research papers

In [2]:
# Load the spreadsheet of papers that teammates manually inspected and judged
# to be good research papers; only the 'Link' column is used from here on.
good_papers_table = pd.read_csv("150 Research paper_small_classification.csv")
random_papers_df = good_papers_table['Link']
random_papers_df.head()

0    https://www.science.org/doi/10.1126/science.ad...
1    https://www.nature.com/articles/s41566-019-0398-2
2    https://www.nature.com/articles/s41560-020-007...
3    https://www.science.org/doi/10.1126/science.ab...
4    https://www.nature.com/articles/s41467-021-236...
Name: Link, dtype: object

In [3]:
# Draw 12 links at random; the fixed random_state keeps the selection
# reproducible across runs.
sampled_good = random_papers_df.sample(random_state=1, n=12)
sampled_good

14              https://doi.org/10.1126%2Fsciadv.abj7930
98           https://doi.org/10.1038%2Fs41560-019-0538-4
75                  https://doi.org/10.1039%2FC9TA12489F
16                  https://doi.org/10.1039%2FD0TA10535J
131         https://doi.org/10.1038%2Fs41586-023-06208-z
56              https://doi.org/10.1002%2Faenm.202101447
141         https://doi.org/10.1038%2Fs41586-023-06637-w
44     https://www.sciencedirect.com/science/article/...
29                  https://doi.org/10.1039%2FC8TC04871A
120    https://www.science.org/doi/10.1126/science.ab...
94     https://onlinelibrary.wiley.com/doi/10.1002/ad...
5      https://www.sciencedirect.com/science/article/...
Name: Link, dtype: object

In [4]:
## Turn the sampled links into a frame with a 'link' column and mark every
## row with label 1 (good paper). The export to "good_paper_small.csv" is
## kept below but left disabled.
sample_df_good = sampled_good.to_frame(name='link').assign(label=1)
# sample_df_good.to_csv('good_paper_small.csv', index=False)

## Randomly selecting 12 papers from the bad research papers

We will randomly select 12 papers and manually inspect whether they are actually "bad".

In [5]:
# Load the table of candidate "bad" papers. Note that this rebinds
# `random_papers_df`, which until now held the good-paper links.
random_papers_df = pd.read_csv(filepath_or_buffer="bad_paper_big.csv")
random_papers_df.head()

Unnamed: 0,link,doi
0,https://doi.org/10.1038%2Fnature14133,10.1038%2Fnature14133
1,https://doi.org/10.1038%2Fnmat4014,10.1038%2Fnmat4014
2,https://doi.org/10.1039%2FC5EE03874J,10.1039%2FC5EE03874J
3,https://doi.org/10.1038%2Fs41563-018-0071-z,10.1038%2Fs41563-018-0071-z
4,https://doi.org/10.1002%2Faenm.201700491,10.1002%2Faenm.201700491


In [6]:
# Draw 12 rows at random; the fixed random_state keeps the selection
# reproducible across runs.
sampled_bad_df = random_papers_df.sample(random_state=1, n=12)
sampled_bad_df

Unnamed: 0,link,doi
24,https://doi.org/10.1002%2Fadma.201607039,10.1002%2Fadma.201607039
39,https://doi.org/10.1039%2FC7TA00434F,10.1039%2FC7TA00434F
52,https://doi.org/10.1002%2Faenm.201601079,10.1002%2Faenm.201601079
27,https://doi.org/10.1038%2Fs41467-017-00516-5,10.1038%2Fs41467-017-00516-5
44,https://doi.org/10.1021%2Facsenergylett.7b00236,10.1021%2Facsenergylett.7b00236
2,https://doi.org/10.1039%2FC5EE03874J,10.1039%2FC5EE03874J
21,https://doi.org/10.1126%2Fsciadv.aao5616,10.1126%2Fsciadv.aao5616
62,https://doi.org/10.1103%2FPhysRevLett.77.3865,10.1103%2FPhysRevLett.77.3865
41,https://doi.org/10.1002%2Faenm.201701136,10.1002%2Faenm.201701136
50,https://doi.org/10.1557%2Fmrc.2015.26,10.1557%2Fmrc.2015.26


In [7]:
# Mark every sampled row with label 0 (bad paper). The CSV export is kept
# below but left disabled.
sampled_bad_df = sampled_bad_df.assign(label=0)
# sampled_bad_df.to_csv('bad_paper_small.csv', index=False)

In [8]:
# Display the sampled "bad" papers, including the `label` column.
sampled_bad_df

Unnamed: 0,link,doi,label
24,https://doi.org/10.1002%2Fadma.201607039,10.1002%2Fadma.201607039,0
39,https://doi.org/10.1039%2FC7TA00434F,10.1039%2FC7TA00434F,0
52,https://doi.org/10.1002%2Faenm.201601079,10.1002%2Faenm.201601079,0
27,https://doi.org/10.1038%2Fs41467-017-00516-5,10.1038%2Fs41467-017-00516-5,0
44,https://doi.org/10.1021%2Facsenergylett.7b00236,10.1021%2Facsenergylett.7b00236,0
2,https://doi.org/10.1039%2FC5EE03874J,10.1039%2FC5EE03874J,0
21,https://doi.org/10.1126%2Fsciadv.aao5616,10.1126%2Fsciadv.aao5616,0
62,https://doi.org/10.1103%2FPhysRevLett.77.3865,10.1103%2FPhysRevLett.77.3865,0
41,https://doi.org/10.1002%2Faenm.201701136,10.1002%2Faenm.201701136,0
50,https://doi.org/10.1557%2Fmrc.2015.26,10.1557%2Fmrc.2015.26,0


# Scraping Text

In [9]:
def get_text(soup):
    """Return the visible text of a parsed page, one non-empty phrase per line.

    Every ``script`` and ``style`` tag found in ``soup`` has ``extract()``
    called on it first, so the passed-in object is modified. The remaining
    text is split into lines, each line is split again on double spaces, and
    the stripped, non-empty pieces are joined with newlines.
    """
    for unwanted_tag in soup(['script', 'style']):
        unwanted_tag.extract()

    raw_text = soup.get_text(separator=' ')

    phrases = []
    for raw_line in raw_text.splitlines():
        for piece in raw_line.strip().split("  "):
            cleaned = piece.strip()
            if cleaned:
                phrases.append(cleaned)

    return '\n'.join(phrases)

In [15]:
def _read_html(target, timeout):
    """Open ``target`` with ``urlopen`` and return the response body as ``str``.

    ``target`` is anything ``urlopen`` accepts (a URL string or a ``Request``).
    The response is always closed, even if reading fails. The body is decoded
    with the charset announced in the response headers when there is one,
    otherwise UTF-8; undecodable bytes are replaced instead of raising so a
    single oddly encoded page cannot abort the whole scrape.
    """
    page = urlopen(target, timeout=timeout)
    try:
        raw = page.read()
        headers = getattr(page, "headers", None)
        charset = headers.get_content_charset() if headers is not None else None
    finally:
        page.close()
    try:
        return raw.decode(charset or "utf-8", errors="replace")
    except LookupError:
        # The server announced a charset Python does not know; fall back.
        return raw.decode("utf-8", errors="replace")


def _scrape_link(link, timeout):
    """Return the extracted text for ``link``, or a placeholder on failure.

    A 403 response is retried once with a browser-like ``User-Agent`` header.
    Any fetch failure (on the first attempt or on the retry) is reported with
    ``print`` and converted into one of the "... No Access" placeholder
    strings rather than being raised.
    """
    try:
        try:
            html_content = _read_html(link, timeout)
            print("Page accessed successfully.")
        except HTTPError as e:
            print(f"HTTP error occurred: {e.code} - {e.reason}")
            # Only the status code is checked: the reason phrase is
            # server-dependent and is not a reliable signal.
            if e.code != 403:
                print("NOT a forbidden access exception ")
                return "HTTP Error, No Access"
            print("Found a forbidden access exception ")
            try:
                # Retrying with modified request headers
                request = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
                html_content = _read_html(request, timeout)
                print("Page accessed successfully with headers.")
            except HTTPError as retry_e:
                print(f"Retry HTTP error occurred: {retry_e.code} - {retry_e.reason}")
                return "HTTP Error, No Access"
            except URLError as retry_e:
                print(f"Retry URL error occurred: {retry_e.reason}")
                return "URL Error, No Access"
            # Any other failure during the retry falls through to the
            # handlers below instead of escaping the function.
    except URLError as e:
        print(f"URL error occurred: {e.reason}")
        return "URL Error, No Access"
    except ssl.SSLError as e:
        print(f"SSL error occurred: {e}")
        return "SSL Error, No Access"
    except ValueError as e:
        print(f"Value error (likely an invalid URL): {e}")
        return "Invalid URL No Access"
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return "Unexpected Error No Access"

    # Parsing stays outside the try block so a parsing problem is not
    # mislabelled as an access failure.
    soup = BeautifulSoup(html_content, "html.parser")
    return get_text(soup)


def create_txt_list(df, col_name, timeout=30):
    """Fetch every link in ``df[col_name]`` and return the page texts as a list.

    Parameters:
        df: object indexable by ``col_name`` that yields the links to fetch
            (the notebook passes a DataFrame).
        col_name: name of the column holding the links.
        timeout: seconds passed to ``urlopen`` for each request, so that one
            unresponsive host cannot stall the whole run.

    Returns:
        A list with exactly one entry per link, in order: the text produced
        by ``get_text`` on success, or a placeholder string such as
        "HTTP Error, No Access" when the page could not be fetched.
    """
    return [_scrape_link(link, timeout) for link in df[col_name]]

In [16]:
# Scrape the page text behind every sampled "good" paper link.
good_text_lst = create_txt_list(df=sample_df_good, col_name="link")

HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden
Page accessed successfully.
Page accessed successfully.
Page accessed successfully.
Page accessed successfully.
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden
Page accessed successfully.
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden
Page accessed successfully.
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden


In [17]:
# Sanity check: one scraped entry is expected per sampled paper (12).
good_text_count = len(good_text_lst)
print(good_text_count)
# Attach the scraped text (or the failure placeholder) to each sampled row.
sample_df_good["text"] = good_text_lst
sample_df_good

12


Unnamed: 0,link,label,text
14,https://doi.org/10.1126%2Fsciadv.abj7930,1,"HTTP Error, No Access"
98,https://doi.org/10.1038%2Fs41560-019-0538-4,1,Managing grains and interfaces via ligand anch...
75,https://doi.org/10.1039%2FC9TA12489F,1,Dynamical evolution of the 2D/3D interface: a ...
16,https://doi.org/10.1039%2FD0TA10535J,1,Deep surface passivation for efficient and hyd...
131,https://doi.org/10.1038%2Fs41586-023-06208-z,1,Oriented nucleation in formamidinium perovskit...
56,https://doi.org/10.1002%2Faenm.202101447,1,"HTTP Error, No Access"
141,https://doi.org/10.1038%2Fs41586-023-06637-w,1,Anion–π interactions suppress phase impurities...
44,https://www.sciencedirect.com/science/article/...,1,"HTTP Error, No Access"
29,https://doi.org/10.1039%2FC8TC04871A,1,Amine additive reactions induced by the soft L...
120,https://www.science.org/doi/10.1126/science.ab...,1,"HTTP Error, No Access"


In [18]:
# Scrape the page text behind every sampled "bad" paper link.
bad_text_lst = create_txt_list(df=sampled_bad_df, col_name="link")

HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden
Page accessed successfully.
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden
Page accessed successfully.
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden
Page accessed successfully.
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden
Page accessed successfully.
Page accessed successfully.
HTTP error occurred: 403 - Forbidden
Found a forbidden access exception 
Retry HTTP error occurred: 403 - Forbidden


In [19]:
# Sanity check: one scraped entry is expected per sampled paper (12).
bad_text_count = len(bad_text_lst)
print(bad_text_count)
# Attach the scraped text (or the failure placeholder) to each sampled row.
sampled_bad_df["text"] = bad_text_lst
sampled_bad_df

12


Unnamed: 0,link,doi,label,text
24,https://doi.org/10.1002%2Fadma.201607039,10.1002%2Fadma.201607039,0,"HTTP Error, No Access"
39,https://doi.org/10.1039%2FC7TA00434F,10.1039%2FC7TA00434F,0,Towards enabling stable lead halide perovskite...
52,https://doi.org/10.1002%2Faenm.201601079,10.1002%2Faenm.201601079,0,"HTTP Error, No Access"
27,https://doi.org/10.1038%2Fs41467-017-00516-5,10.1038%2Fs41467-017-00516-5,0,Strain-engineered growth of two-dimensional ma...
44,https://doi.org/10.1021%2Facsenergylett.7b00236,10.1021%2Facsenergylett.7b00236,0,"HTTP Error, No Access"
2,https://doi.org/10.1039%2FC5EE03874J,10.1039%2FC5EE03874J,0,Cesium-containing triple cation perovskite sol...
21,https://doi.org/10.1126%2Fsciadv.aao5616,10.1126%2Fsciadv.aao5616,0,"HTTP Error, No Access"
62,https://doi.org/10.1103%2FPhysRevLett.77.3865,10.1103%2FPhysRevLett.77.3865,0,"HTTP Error, No Access"
41,https://doi.org/10.1002%2Faenm.201701136,10.1002%2Faenm.201701136,0,"HTTP Error, No Access"
50,https://doi.org/10.1557%2Fmrc.2015.26,10.1557%2Fmrc.2015.26,0,Identifying defect-tolerant semiconductors wit...
