# Process XML files from IMF Database

## 1. Connect to IMF Database

In [1]:
import pyodbc

# Server value handed to pyodbc (looks like "host,port" form — confirm).
SERVER_NAME = 'TSTWEOSQL,5876'
# Database opened on that server.
DATABASE_NAME = 'IMF_EPUBS'

# Joins PUBLICATION to DOCUMENT on SeriesNumber and keeps only rows whose
# ImfCategoryDesc is 'IMF Staff Country Reports' and whose Country is 'Vietnam'.
# The backslash continuations sit inside the string literal, so the leading
# spaces of each continued line are part of the query text.
QUERY_STRING = "SELECT dbo.PUBLICATION.*, dbo.DOCUMENT.* \
                FROM [dbo].[PUBLICATION] \
                INNER JOIN dbo.DOCUMENT ON dbo.PUBLICATION.SeriesNumber = dbo.DOCUMENT.SeriesNumber \
                WHERE dbo.PUBLICATION.ImfCategoryDesc = 'IMF Staff Country Reports' \
                        AND dbo.PUBLICATION.Country = 'Vietnam'" 

def get_xmls(server_name, database_name, query_string):
    """Run ``query_string`` against a SQL Server database and return all rows.

    The connection is opened through pyodbc with the ``{SQL Server}`` driver
    and ``Trusted_Connection=yes``, and is always closed before returning.

    Args:
        server_name: Server to connect to; formatted into the ``server`` keyword.
        database_name: Database to open; formatted into the ``database`` keyword.
        query_string: SQL statement to execute.

    Returns:
        Whatever ``cursor.fetchall()`` returns for the executed statement.
    """
    connection = pyodbc.connect('Trusted_Connection=yes',
                                driver='{SQL Server}',
                                server='%s' % server_name,
                                database='%s' % database_name)
    try:
        # fetchall() pulls every row before the connection is released below.
        return connection.cursor().execute(query_string).fetchall()
    finally:
        # Release the connection even when the query raises.
        connection.close()


# Run the query once and keep every returned row for the cells below.
results = get_xmls(SERVER_NAME, DATABASE_NAME, QUERY_STRING)

## 2. Create document object

In [2]:
from data_util2 import document_db
import pickle


### First, let's test one file 

In [3]:
# Build a document object from the second query row as a smoke test.
test_doc = document_db(results[1])

# Label/value pairs, printed in one call so the separators match print's defaults.
summary_fields = (
    'Country: ', test_doc.country,
    '\nPublication Date: ', test_doc.PublicationDate,
    '\nProjected Year: ', test_doc.ProjectedYear,
    '\nPage Count: ', test_doc.PageCount,
)
print(*summary_fields)

Country:  Vietnam 
Publication Date:  1995-09-14 00:00:00 
Projected Year:  1996 
Page Count:  103


In [4]:
# Preview the first two paragraphs parsed from the test document.
test_doc.paras[:2]

['One of the striking features of Viet Nam’s transition to a market economy has been the resilience of output. The level of general economic activity never declined; there was only a temporary deceleration in the pace of expansion under the impact of external shocks and fundamental changes in domestic policy. During the core transition period of 1988-91, the economy grew at an average annual rate of 6 percent in the face of comprehensive steps in the reorientation from a command system to a system based on market principles, including decentralization of decision making, creation of incentives, and the liberalization of trade, prices, and the exchange rate. At the same time, financial policies were tightened, although when the economy suffered the collapse of its commercial and financial relationship with the CMEA in 1990-91, the stance of monetary policies was temporarily relaxed. Since then, financial stabilization has been resumed, the reform process has continued, and the growth of

### Next, loop through all xmls to create document objects

In [5]:
# Group parsed documents by series id. The first document seen for a series is
# stored; later documents of the same series append their paragraphs to it.
doc_dict = {}
for row in results:
    # To select only files that have content.
    if "A" not in row.PublisherId:
        continue
    doc = document_db(row)
    try:
        if doc.series_id in doc_dict:
            doc_dict[doc.series_id].paras.extend(doc.paras)
        else:
            doc_dict[doc.series_id] = doc
    except Exception as err:
        # Best effort: report which document failed and why, then carry on.
        # getattr avoids a second failure if series_id itself is what is missing.
        print(getattr(doc, 'series_id', None), repr(err))

no paragraph found: 08887-9781451840438_A002_002
no paragraph found: 10067-9781451840445_A003_002


### Finally, save doc_dict to a pickle file

In [6]:
# Persist the parsed documents; the context manager closes the file handle
# so the pickle is fully flushed to disk.
with open('processed_xml.p', "wb") as pickle_file:
    pickle.dump(doc_dict, pickle_file)

### Note: 'no paragraph found' might mean that BeautifulSoup could not extract any paragraphs from the XML file.

In [7]:
# Print the position of every row whose PublisherId matches the failing file.
for idx, row in enumerate(results):
    if row.PublisherId == '10067-9781451840445_A003_002':
        print(idx)

118


#### There is content in this xml file

In [8]:
# 118 is the index printed by the search loop above for this PublisherId.
results[118].Content

'<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//IMF//IMF DTD//EN" "../../../../IMF_DTDs_XSLs/journal-dtd-3.0/3.0/journalpublishing3.dtd"[]><?xml-stylesheet type="text/xsl" href="../../../../IMF_DTDs_XSLs/journal-dtd-3.0/journal_ViewIMF-v1.0.xsl"?><article article-type="002" dtd-version="3.0" xml:lang="en" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML"><front><journal-meta><journal-id journal-id-type="publisher-id">002</journal-id><journal-title-group><journal-title>IMF Staff Country Reports</journal-title></journal-title-group><issn>1934-7685</issn><isbn>9781451840445</isbn><publisher><publisher-name>International Monetary Fund</publisher-name><publisher-loc>Washington, D.C.</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="doi">10.5089/9781451840445.002.A003</article-id><article-id pub-id-type="publisher-id">002A003</article-id><article-categories><subj-group subj-group-type="heading"><subjec

#### However, BeautifulSoup finds no `<p>` elements in the file's body

In [9]:
from bs4 import BeautifulSoup
# Parse the raw XML string of row 118 with BeautifulSoup's 'xml' feature.
soup = BeautifulSoup(results[118].Content, 'xml')
# List every <p> element under <body>; the recorded output is an empty list.
soup.body.find_all('p')

[]

#### If you need the content of this file, you can use a regular expression to extract its paragraphs

In [10]:
import re
# Capture the text between each <p> ... </p> pair, without the tags themselves.
# The lookbehind only recognises a bare "<p>", so <p> tags carrying attributes
# are not matched.
pattern = r'(?<=<p>).*?(?=</p>)'
# re.DOTALL lets "." cross line breaks, so paragraphs spanning several lines
# are extracted instead of being silently skipped.
doc = re.findall(pattern, results[118].Content, flags=re.DOTALL)

In [11]:
# Show the first 20 strings extracted by the regular expression.
doc[:20]

['INTERNATIONAL MONETARY FUND',
 'VIETNAM',
 '<bold>Staff Report for the 2008 Article IV Consultation—Informational Annex</bold>',
 'Prepared by the Asia and Pacific Department',
 'February 27, 2009',
 '<xref ref-type="app" rid="A01app01">I. Fund Relations</xref>',
 '<xref ref-type="app" rid="A01app02">II. Relations with the World Bank Group</xref>',
 '<xref ref-type="app" rid="A01app03">III. Relations with the Asian Development Bank</xref>',
 '<xref ref-type="app" rid="A01app04">IV. Statistical Issues</xref>',
 'Following an extended period of strong economic performance, Vietnam is facing considerable challenges. Growth moderated to 6.2 percent in 2008. Rapid credit growth fueled by massive capital inflows, coupled with a surge in commodity prices, led to high inflation and large trade deficits in the first half of 2008. Executive Directors have commended the Vietnamese authorities for the significant progress they have made in stabilizing the economy, which was overheating in 2008. 