# WosClient

In [1]:
import time
import pandas as pd

In [2]:
# !pip install suds-jurko

from suds import client
from base64 import b64encode as _b64encode
from collections import OrderedDict as _OrderedDict

class WosClient():
    """Query the Web of Science.
       You must provide user and password only to user premium WWS service.
       with WosClient() as wos:
           results = wos.search(...)"""

    base_url = 'http://search.webofknowledge.com'
    auth_url = base_url + '/esti/wokmws/ws/WOKMWSAuthenticate?wsdl'
    search_url = base_url + '/esti/wokmws/ws/WokSearch?wsdl'
    searchlite_url = base_url + '/esti/wokmws/ws/WokSearchLite?wsdl'

    def __init__(self, user=None, password=None, SID=None, close_on_exit=True,
                 lite=False):
        """Create the SOAP clients. user and password for premium access."""

        self._SID = SID
        self._close_on_exit = close_on_exit
        search_wsdl = self.searchlite_url if lite else self.search_url
        self._auth = client.Client(self.auth_url)
        self._search = client.Client(search_wsdl)

        if user and password:
            auth = '%s:%s' % (user, password)
            auth = _b64encode(auth.encode('utf-8')).decode('utf-8')
            headers = {'Authorization': ('Basic %s' % auth).strip()}
            self._auth.set_options(headers=headers)

    def __enter__(self):
        """Automatically connect when used with 'with' statements."""
        self.connect()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close connection after closing the 'with' statement."""
        if self._close_on_exit:
            self.close()

    def __del__(self):
        """Close connection when deleting the object."""
        if self._close_on_exit:
            self.close()

    def connect(self):
        """Authenticate to WOS and set the SID cookie."""
        if not self._SID:
            self._SID = self._auth.service.authenticate()
            print('Authenticated (SID: %s)' % self._SID)

        self._search.set_options(headers={'Cookie': 'SID="%s"' % self._SID})
        self._auth.options.headers.update({'Cookie': 'SID="%s"' % self._SID})
        return self._SID

    def close(self):
        """Close the session."""
        if self._SID:
            self._auth.service.closeSession()
            self._SID = None

    def search(self, query, count=5, offset=1):
        """Perform a query. Check the WOS documentation for v3 syntax."""
        if not self._SID:
            raise RuntimeError('Session not open. Invoke .connect() before.')

        qparams = _OrderedDict([('databaseId', 'WOS'),
                                ('userQuery', query),
                                ('queryLanguage', 'en')])

        rparams = _OrderedDict([('firstRecord', offset),
                                ('count', count),
                                ('sortField', _OrderedDict([('name', 'RS'),
                                                            ('sort', 'D')]))])

        return self._search.service.search(qparams, rparams)

In [3]:

from xml.etree import ElementTree as _ET
from xml.dom import minidom as _minidom
import re as _re

def single(wosclient, wos_query, xml_query=None, count=10, offset=1):
    """Perform a single Web of Science query and then XML query the results."""
    result = wosclient.search(wos_query, count, offset)
    print (result)
    xml = _re.sub(' xmlns="[^"]+"', '', result.records, count=1).encode('utf-8')
    if xml_query:
        xml = _ET.fromstring(xml)
        return [el.text for el in xml.findall(xml_query)]
    else:
        return _minidom.parseString(xml).toprettyxml()

def query(wosclient, wos_query, xml_query=None, count=10, offset=1, limit=100):
    """Query Web of Science and XML query results with multiple requests."""
    results = [single(wosclient, wos_query, xml_query, min(limit, count-x+1), x) for x in range(offset, count+1, limit)]
    if xml_query:
        return [el for res in results for el in res]
    else:
        pattern = _re.compile(r'.*?<records>|</records>.*', _re.DOTALL)
        return ('<?xml version="1.0" ?>\n<records>' +
                '\n'.join(pattern.sub('', res) for res in results) +
                '</records>')

def doi_to_wos(wosclient, doi):
    """Convert DOI to WOS identifier."""
    results = query(wosclient, 'DO=%s' % doi, './REC/UID', count=1)
    return results[0].lstrip('WOS:') if results else None

In [4]:
soap = WosClient(lite=True)
soap.connect()

Authenticated (SID: D4eMRXaV71JO6j1wtA7)


D4eMRXaV71JO6j1wtA7

# Exchange the Given and Family Name

In [5]:
searchnames = []
i=0
for line in open('Dismis_Acad_List_CLEAN.txt','r'):
    QueryString = line.strip()
    QueryString = "AU="+line.split(',')[0].split()[-1]+' '+line.split(',')[0].split()[0]
    print('QueryString:', QueryString)
    searchnames.append(QueryString)


FileNotFoundError: [Errno 2] No such file or directory: 'Dismis_Acad_List_CLEAN.txt'

In [6]:
len(searchnames)

0

In [7]:
pd.DataFrame(searchnames,columns=['SearchNames']).to_csv('SearchNames.csv')

In [8]:
SearchNames = pd.read_csv('SearchNames.csv')

In [9]:
SearchNames.SearchNames[0].strip().split('=')[1]

IndexError: index out of bounds

In [10]:
soap = WoSClient(lite=True)
soap.connect()
query = "VA= Article"
res=soap.search(query)


NameError: name 'WoSClient' is not defined

In [11]:
names = []
lens = []
co_aut = []
i=0
for i in range(len(SearchNames)):
    line = SearchNames.SearchNames[i]
    QueryString = line
    print(QueryString)
    if i == 2400: # WOS constraints that each ID can only search for 2500 times. Just set smaller counts to renew the ID.
        soap = WosClient(lite=True)
        soap.connect()
    results = soap.search(QueryString)
    #getting number of name of co-authors
    if (results.recordsFound>0):
        authors = []
        for j in range (len(results.records)):
            for k in range (len(results.records[j].authors)):
                for l in range (len(results.records[j].authors[k].value)):
                    co_authors=results.records[j].authors[k].value[l]
                    authors.append(co_authors)
                    print('co_author',co_authors)
                #co_aut.append(authors)
            #co.append(co_aut)
    print(results.recordsFound)
    names.append(QueryString.split('=')[1])
    lens.append(results.recordsFound)
    co_aut.append(authors)
    time.sleep(0.5) # WOS constraints that you can only search twice per second
    i=i+1

In [12]:
#a python module to make unicode strings work as expected for turkish chars. solves the turkish "İ" problem

# have to do pip install unicode_tr
name = []
from unicode_tr.extras import slugify
for i in range(len(names)):
    name_en = slugify(names[i])
    name_en = name_en.replace('-',' ').title()
    name.append(name_en)

ImportError: No module named 'unicode_tr'

In [85]:
df = pd.DataFrame([name,lens,co_aut]).T
df.columns=['Author','NoPapers','CoAuthors']
df

Unnamed: 0,Author,NoPapers,CoAuthors
0,Aydin Aydin,4,"[Kul, Sibel, Aydin, Aydin, Dinc, Hasan, Erdura..."
1,Gilgil Erdal,5,"[Melikoglu, Meltem Alkan, Kocabas, Hilal, Seze..."
2,Halac Metin,63,"[Yilmaz, Mehmet Halit, Ozguroglu, Mustafa, Mer..."
3,Yuce Abdulhakim,0,"[Yilmaz, Mehmet Halit, Ozguroglu, Mustafa, Mer..."
4,Kiris Abdulkadir,54,"[Gedikli, Omer, Ozturk, Serkan, Yilmaz, Hulya,..."
5,Kucukbayrak Abdulkadir,30,"[Kucukbayrak, Abdulkadir, Ozdemir, Davut, Yild..."
6,Yildirim Abdulkadir,33,"[Karadeniz, Ali, Yildirim, Abdulkadir, Simsek,..."
7,Cekin Abdulkadir,0,"[Karadeniz, Ali, Yildirim, Abdulkadir, Simsek,..."
8,Sengun Abdulkadir,13,"[Sengun, Abdulkadir, Cobankara, Funda Kont, Or..."
9,Bedir Abdulkerim,38,"[Bedir, Abdulkerim, Aliyazicioglu, Yuksel, Bil..."


In [109]:
#formatting co authors column to match with authors
for i in range(df.shape[0]):
    for j in range(len(df.CoAuthors.iloc[i])):
        a=df.CoAuthors.iloc[i]
        a[j]=a[j].replace(","," ")
        df.CoAuthors.iloc[i] = a
df

Unnamed: 0,Author,NoPapers,CoAuthors
0,Aydin Aydin,4,"[Kul Sibel, Aydin Aydin, Dinc Hasan, Erdura..."
1,Gilgil Erdal,5,"[Melikoglu Meltem Alkan, Kocabas Hilal, Seze..."
2,Halac Metin,63,"[Yilmaz Mehmet Halit, Ozguroglu Mustafa, Mer..."
3,Yuce Abdulhakim,0,"[Yilmaz Mehmet Halit, Ozguroglu Mustafa, Mer..."
4,Kiris Abdulkadir,54,"[Gedikli Omer, Ozturk Serkan, Yilmaz Hulya,..."
5,Kucukbayrak Abdulkadir,30,"[Kucukbayrak Abdulkadir, Ozdemir Davut, Yild..."
6,Yildirim Abdulkadir,33,"[Karadeniz Ali, Yildirim Abdulkadir, Simsek ..."
7,Cekin Abdulkadir,0,"[Karadeniz Ali, Yildirim Abdulkadir, Simsek ..."
8,Sengun Abdulkadir,13,"[Sengun Abdulkadir, Cobankara Funda Kont, Or..."
9,Bedir Abdulkerim,38,"[Bedir Abdulkerim, Aliyazicioglu Yuksel, Bil..."


In [111]:
no_dismissed = pd.Series
for i in range()


pandas.core.series.Series

In [24]:
# 753 out of 3575 people have 0 papers in the web of science dataset
zeropapers = df.loc[df.NoPapers==0].copy()

In [None]:
# df.to_csv('Publishments.csv')

In [25]:
zeropapers.to_csv('ZeroPublishment.csv')

In [26]:
pd.concat([SearchNames,df],axis=1).to_csv('Publishments.csv')

In [27]:
publishment_cnt = pd.read_csv('Publishments.csv')

In [28]:
publishment_cnt.sort_values(by='NoPapers',ascending=False)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,SearchNames,Author,NoPapers
330,330,330,AU=YILDIRIM ALİ,YILDIRIM ALİ,272.0
191,191,191,AU=YILDIRIM AHMET,YILDIRIM AHMET,256.0
154,154,154,AU=KAYA AHMET,KAYA AHMET,240.0
125,125,125,AU=DEMİR AHMET,DEMİR AHMET,221.0
214,214,214,AU=ŞAHİN AHMET,ŞAHİN AHMET,215.0
215,215,215,AU=ŞAHİN AHMET,ŞAHİN AHMET,215.0
289,289,289,AU=GÜRBÜZ ALİ,GÜRBÜZ ALİ,208.0
157,157,157,AU=KILIÇ AHMET,KILIÇ AHMET,205.0
138,138,138,AU=KILIÇ AHMET,KILIÇ AHMET,205.0
198,198,198,AU=ÇELİK AHMET,ÇELİK AHMET,195.0


In [29]:
len(publishment_cnt['Author'].unique())

913