In [1]:
import pandas as pd
import numpy as np
import time
from tqdm.notebook import tqdm
import re
import os
from itertools import dropwhile

In [2]:
from lxml import etree
from bs4 import BeautifulSoup

In [3]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [4]:
sd = pd.read_csv('output/01b_science_direct_methods_explode.csv', index_col=0, encoding='utf-8')
pdf = pd.read_csv('output/03b_pdf_methods_explode.csv', index_col=0, encoding='utf-8')
html = pd.read_csv('output/04b_html_methods_explode.csv', index_col=0, encoding='utf-8')

In [5]:
## handle duplicates
## prioritise keep sd, then pdf, then html

In [6]:
sd_pmid = sd['pmid'].to_list()
pdf_pmid = pdf['pmid'].to_list()
html_pmid = html['pmid'].to_list()

In [7]:
sd_pdf = [x for x in sd_pmid if x in pdf_pmid]
len(sd_pdf)

647

In [8]:
sd_html = [x for x in sd_pmid if x in html_pmid]
len(sd_html)

3

In [9]:
pdf = pdf[~pdf['pmid'].isin(sd_pdf)]
html = html[~html['pmid'].isin(sd_html)]

In [10]:
combined = pd.concat([sd, pdf, html])
len(combined)

28703

In [11]:
pmid_list = combined['pmid'].to_list()
methods = combined.drop(['pmid'], axis=1)
len(methods)

28703

# extra cleaning

In [12]:
## get rid of encoding chars

def ignore_encode(x):
    if x is not np.nan:
        return x.encode('ascii','ignore').decode("ascii")
    else:
        return np.nan

## clean chars

def clean_chars(x):
    if x is not np.nan:
        return x.replace('[]', ' ')
    else:
        return np.nan

In [13]:
methods = methods.applymap(ignore_encode)
methods = methods.applymap(clean_chars)

In [14]:
methods.sample(15)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34
9126,materials and methods the data consisted of ab...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3052,material and methods two databases were collec...,database the first database was collected usin...,tt and wtt methods takes the tt method has si...,is trained with train data is the average t...,methods are tested further. are: 83.89% and ...,"method. for each cl, the ksize and kno parame...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4277,methods adult subjects who had return of spont...,eeg data pre-processing the eeg was re-referen...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9655,2.1|study design and population a cross-sectio...,2.2|data collection following the study protoc...,2.5|data analysis the data extracted from the...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
755,['methods a total of 5517 subjects aged 2090 y...,2. materials and methods a total of 5517 subje...,method from the perspective of a reasonable ba...,variable selection methods lists the comparis...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3003,methods the 2d axial slices of the ct scan a...,data preparation we note that the cysts are r...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4460,methods neuroimaging data the mri-genetics int...,neuroimaging data the mri-genetics interface e...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
18861,2 materials andmethods 2.1 materials we have u...,2.1 materials we have used two datasets in our...,2.2 methods this section describes the appli...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
13534,experimental methods data acquisition medical ...,data acquisition medical data from 942 patient...,"data processing data processing, model design,...","data panels additionally, the sparse coding mo...",data availability statement the raw data suppo...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
11117,2. materials and methods the ethical approval ...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [15]:
methods['0'].str.len()

0        10308.0
1         5105.0
2        25345.0
3         4699.0
4         7007.0
5         4389.0
6         6184.0
7         7877.0
8         5859.0
9         6183.0
10        5683.0
11        6470.0
12       11331.0
13        8595.0
14       14638.0
15        8371.0
16       12108.0
17       12964.0
18        5977.0
19        4484.0
20           NaN
21        2013.0
22        9091.0
23       14048.0
24        7919.0
25        6337.0
26       19467.0
27        8876.0
28        7226.0
29        7763.0
30        5929.0
31       12970.0
32        7377.0
33       17138.0
34       11333.0
35        8185.0
36       13206.0
37       13472.0
38        9853.0
39        5119.0
40       16905.0
41       15496.0
42        5822.0
43        2496.0
44        8562.0
45        5489.0
46       15867.0
47        9188.0
48        3846.0
49       15052.0
50        9600.0
51        5365.0
52       27652.0
53        3095.0
54        6062.0
55        7253.0
56       14532.0
57        1390.0
58       10362

In [17]:
methods['pmid'] = pmid_list

methods_keep = methods[['pmid', '0']].copy()
                       
methods_keep = methods_keep.reset_index(drop=True)

In [18]:
methods_keep.to_csv('output/05_methods_combined.csv')

In [69]:
## clean empty brackets and spaces

methods.applymap(lambda x:(x.replace('()', ' ')))
methods.applymap(lambda x:(x.replace('[]', ' ')))
methods.applymap(lambda x:(x.replace('{}', ' ')))

methods.applymap(lambda x:(x.replace('    ', ' ')))
methods.applymap(lambda x:(x.replace('   ', ' ')))
methods.applymap(lambda x:(x.replace('  ', ' ')))
#methods = [x.replace('    ', ' ') for x in methods]
#methods = [x.replace('   ', ' ') for x in methods]
#methods = [x.replace('  ', ' ') for x in methods]

AttributeError: 'float' object has no attribute 'replace'

In [54]:
methods.head(50)

AttributeError: 'list' object has no attribute 'head'

In [8]:
sd_methods['science_direct_methods'] = methods
sd_methods.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6124 entries, 42411 to 21081
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   pmid                    6124 non-null   int64 
 1   doi                     6124 non-null   object
 2   title                   6123 non-null   object
 3   abstract                6124 non-null   object
 4   science_direct_methods  6124 non-null   object
dtypes: int64(1), object(4)
memory usage: 287.1+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [10]:
sd_methods.to_csv('output/science_direct_clean.csv')