# WosClient

In [None]:
import time
import pandas as pd

In [None]:
# !pip install suds-jurko

from suds import client
from base64 import b64encode as _b64encode
from collections import OrderedDict as _OrderedDict

class WosClient():
    """Query the Web of Science.
       You must provide user and password only to user premium WWS service.
       with WosClient() as wos:
           results = wos.search(...)"""

    base_url = 'http://search.webofknowledge.com'
    auth_url = base_url + '/esti/wokmws/ws/WOKMWSAuthenticate?wsdl'
    search_url = base_url + '/esti/wokmws/ws/WokSearch?wsdl'
    searchlite_url = base_url + '/esti/wokmws/ws/WokSearchLite?wsdl'

    def __init__(self, user=None, password=None, SID=None, close_on_exit=True,
                 lite=False):
        """Create the SOAP clients. user and password for premium access."""

        self._SID = SID
        self._close_on_exit = close_on_exit
        search_wsdl = self.searchlite_url if lite else self.search_url
        self._auth = client.Client(self.auth_url)
        self._search = client.Client(search_wsdl)

        if user and password:
            auth = '%s:%s' % (user, password)
            auth = _b64encode(auth.encode('utf-8')).decode('utf-8')
            headers = {'Authorization': ('Basic %s' % auth).strip()}
            self._auth.set_options(headers=headers)

    def __enter__(self):
        """Automatically connect when used with 'with' statements."""
        self.connect()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close connection after closing the 'with' statement."""
        if self._close_on_exit:
            self.close()

    def __del__(self):
        """Close connection when deleting the object."""
        if self._close_on_exit:
            self.close()

    def connect(self):
        """Authenticate to WOS and set the SID cookie."""
        if not self._SID:
            self._SID = self._auth.service.authenticate()
            print('Authenticated (SID: %s)' % self._SID)

        self._search.set_options(headers={'Cookie': 'SID="%s"' % self._SID})
        self._auth.options.headers.update({'Cookie': 'SID="%s"' % self._SID})
        return self._SID

    def close(self):
        """Close the session."""
        if self._SID:
            self._auth.service.closeSession()
            self._SID = None

    def search(self, query, count=5, offset=1):
        """Perform a query. Check the WOS documentation for v3 syntax."""
        if not self._SID:
            raise RuntimeError('Session not open. Invoke .connect() before.')

        qparams = _OrderedDict([('databaseId', 'WOS'),
                                ('userQuery', query),
                                ('queryLanguage', 'en')])

        rparams = _OrderedDict([('firstRecord', offset),
                                ('count', count),
                                ('sortField', _OrderedDict([('name', 'RS'),
                                                            ('sort', 'D')]))])

        return self._search.service.search(qparams, rparams)

In [None]:

from xml.etree import ElementTree as _ET
from xml.dom import minidom as _minidom
import re as _re

def single(wosclient, wos_query, xml_query=None, count=10, offset=1):
    """Perform a single Web of Science query and then XML query the results."""
    result = wosclient.search(wos_query, count, offset)
    print (result)
    xml = _re.sub(' xmlns="[^"]+"', '', result.records, count=1).encode('utf-8')
    if xml_query:
        xml = _ET.fromstring(xml)
        return [el.text for el in xml.findall(xml_query)]
    else:
        return _minidom.parseString(xml).toprettyxml()

def query(wosclient, wos_query, xml_query=None, count=10, offset=1, limit=100):
    """Query Web of Science and XML query results with multiple requests."""
    results = [single(wosclient, wos_query, xml_query, min(limit, count-x+1), x) for x in range(offset, count+1, limit)]
    if xml_query:
        return [el for res in results for el in res]
    else:
        pattern = _re.compile(r'.*?<records>|</records>.*', _re.DOTALL)
        return ('<?xml version="1.0" ?>\n<records>' +
                '\n'.join(pattern.sub('', res) for res in results) +
                '</records>')

def doi_to_wos(wosclient, doi):
    """Convert DOI to WOS identifier."""
    results = query(wosclient, 'DO=%s' % doi, './REC/UID', count=1)
    return results[0].lstrip('WOS:') if results else None

In [None]:
soap = WosClient(lite=True)
soap.connect()

Authenticated (SID: D3BtfMB8s7cppbOs7cg)


D3BtfMB8s7cppbOs7cg

# Exchange the Given and Family Name

In [None]:
searchnames = []
i=0
for line in open('Dismis_Acad_List_CLEAN.txt','r'):
    QueryString = line.strip()
    QueryString = "AU="+line.split(',')[0].split()[-1]+' '+line.split(',')[0].split()[0]
    print('QueryString:', QueryString)
    searchnames.append(QueryString)


In [6]:
len(searchnames)

3575

In [None]:
pd.DataFrame(searchnames,columns=['SearchNames']).to_csv('SearchNames.csv')

In [8]:
SearchNames = pd.read_csv('SearchNames.csv')

In [9]:
SearchNames.SearchNames[0].strip().split('=')[1]

'AYDIN AYDIN'

In [None]:
soap = WoSClient(lite=True)
soap.connect()
query = "VA= Article"
res=soap.search(query)


In [None]:
names = []
lens = []
co_aut = []
i=0
for i in range(len(SearchNames)):
#     if i > 10: break
    print('Will approximately take ' + str(len(SearchNames) / 2) + ' seconds..')
    line = SearchNames.SearchNames[i]
    QueryString = line
    print(QueryString)
    if i == 2400: # WOS constraints that each ID can only search for 2500 times. Just set smaller counts to renew the ID.
        soap = WosClient(lite=True)
        soap.connect()
    results = soap.search(QueryString)
    #getting number of name of co-authors
    if (results.recordsFound>0):
        authors = []
        for j in range (len(results.records)):
            for k in range (len(results.records[j].authors)):
                for l in range (len(results.records[j].authors[k].value)):
                    co_authors=results.records[j].authors[k].value[l]
                    authors.append(co_authors)
                    print('co_author',co_authors)
                #co_aut.append(authors)
            #co.append(co_aut)
    print(results.recordsFound)
    names_tr.append(QueryString.split('=')[1])
    lens.append(results.recordsFound)
    co_aut.append(authors)
    time.sleep(0.5) # WOS constraints that you can only search twice per second
    i=i+1

In [53]:
#a python module to make unicode strings work as expected for turkish chars. solves the turkish "İ" problem

# have to do pip install unicode_tr
names_en = []
from unicode_tr.extras import slugify
for i in range(len(names_tr)):
    name_en = slugify(names_tr[i])
    name_en = name_en.replace('-',' ').title()
    names_en.append(name_en)

In [58]:
len(names_tr)

1926

In [59]:
cnt=0
for l in co_aut:
    cnt += len(l)
print('Totally ' + str(cnt) + ' guys were downloaded as coauthors')

Totally 44265 guys were downloaded as coauthors


In [69]:
coauthors = set()
for l in co_aut:
    for author in l:
        author = author.replace(',', '')
        coauthors.add(author)

In [73]:
# let's now create the file with not-dismissed professors:
dismissed = set(names_en)

print('Totally we have ' + str(len(dismissed)) + ' dismissed and ' + str(len(coauthors)) + ' unique coauthors')

Totally we have 1892 dismissed and 17338 unique coauthors


In [75]:
undismissed = coauthors - dismissed
print('So, we have ' + str(len(undismissed)) + ' undismissed guys')

So, we have 16059 undismissed guys


In [76]:
pd.Dataframe(list(dismissed)).to_csv('dismissed.csv')
pd.Dataframe(list(undismissed)).to_csv('undismissed.csv')

AttributeError: module 'pandas' has no attribute 'to_csv'