In [1]:
#The below code parses all the xml files 
#All the news articles and relevant metadata are put into the all_news_result list
#All the commentary articles and relevant metadata are put into the all_commentary_result list

import xml.etree.ElementTree as ET
import html2text
from pprint import pprint
import json, os, csv, re

def escape_illegal_xml_characters(x):
    """Return ``x`` with characters that are illegal in XML 1.0 removed.

    The pattern strips the C0 control characters other than tab, line feed
    and carriage return, the surrogate range, and U+FFFE / U+FFFF.
    """
    return re.sub(u'[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]', '', x)


# Paths of every XML file found below the current directory, grouped by
# which marker ("News" / "Commentary") appears in the containing directory path.
all_news = []
all_commentary = []
for root, dirs, files in os.walk("."):
    # The two checks are independent: a directory path containing both
    # markers contributes its files to both lists.
    in_news = "News" in root
    in_commentary = "Commentary" in root
    for name in files:
        if '.xml' not in name:
            continue
        xml_path = os.path.join(root, name)
        if in_news:
            all_news.append(xml_path)
        if in_commentary:
            all_commentary.append(xml_path)
                
def _parse_feed_items(root):
    """Build one result row per ``<item>`` found under the root's first child.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Root element of a parsed feed; its first child is expected to hold
        the ``<item>`` elements.

    Returns
    -------
    list of list
        One ``[title, date, tags, author, affiliation, link, contentEncoded]``
        row per item, where ``tags`` is the item's tag categories joined
        with ``', '``. A root with no children, or with no ``<item>``
        elements, yields an empty list.
    """
    if len(root) == 0:
        # No container element at all, so there is nothing to extract.
        return []

    rows = []
    for item in root[0].findall('item'):
        # Every field is reset per item so that an item missing an element
        # never inherits the value parsed from the previous item.
        title = ''
        date = ''
        link = ''
        contentEncoded = ''
        author = ''
        affiliation = ''
        tags = []
        for child in item:
            if child.tag == 'title':
                title = child.text
            elif child.tag == 'link':
                link = child.text
            elif child.tag == '{http://purl.org/rss/1.0/modules/content/}encoded':
                contentEncoded = child.text
            elif child.tag == 'pubDate':
                date = child.text
            elif child.tag == 'category':
                text = child.text
                if not text:
                    # An empty <category/> has text None; nothing to record.
                    continue
                lowered = text.lower()
                # NOTE(review): any category merely containing "author" or
                # "affiliation" (without the colon) is dropped here rather
                # than kept as a tag - confirm that this is intended.
                if text != 'Uncategorized' and 'affiliation' not in lowered and 'author' not in lowered:
                    tags.append(text)
                elif 'affiliation:' in lowered:
                    affiliation = re.sub("affiliation:", "", text, flags=re.I)
                elif 'author:' in lowered:
                    author = re.sub("author:", "", text, flags=re.I)
        all_tags = ', '.join(tags)
        rows.append([title, date, all_tags, author, affiliation, link, contentEncoded])
    return rows


def commentary_parsing(root):
    """Append one row per feed item in ``root`` to ``all_commentary_result``.

    The module-level list ``all_commentary_result`` must exist before this
    is called. All rows for the feed are collected first and added in one
    step, so a failure while reading the feed adds no partial results.
    """
    all_commentary_result.extend(_parse_feed_items(root))


def news_parsing(root):
    """Append one row per feed item in ``root`` to ``all_news_result``.

    The module-level list ``all_news_result`` must exist before this is
    called. All rows for the feed are collected first and added in one
    step, so a failure while reading the feed adds no partial results.
    """
    all_news_result.extend(_parse_feed_items(root))
        
        
def _load_feed_root(path):
    """Parse the XML file at ``path`` and return its root element.

    If the file is not well-formed XML, it is re-read as UTF-8 text, passed
    through ``escape_illegal_xml_characters`` and parsed again. Only
    ``ET.ParseError`` triggers that retry; any other error propagates.
    """
    try:
        return ET.parse(path).getroot()
    except ET.ParseError:
        # Read-only access is enough; the file is never written back.
        with open(path, 'r', encoding='utf-8') as fp:
            content = fp.read()
        parser = ET.XMLParser(encoding="utf-8")
        return ET.fromstring(escape_illegal_xml_characters(content), parser=parser)


# Loading is kept separate from item extraction so that an error raised
# while extracting items can never cause the same file to be processed a
# second time (which would duplicate its rows).
all_commentary_result = []
for i in all_commentary:
    root = _load_feed_root(i)
    commentary_parsing(root)

all_news_result = []
for i in all_news:
    root = _load_feed_root(i)
    news_parsing(root)
        


In [2]:
import pandas as pd
#Both result lists share the same row layout, so the column names are declared once
result_columns = ["Title", "PubDate", "Tags","Author","Affiliation","Link","Text"]

#Turn each list of rows into a pandas dataframe
allNews = pd.DataFrame(all_news_result, columns = result_columns)
allCommentary = pd.DataFrame(all_commentary_result, columns = result_columns)

#Add the column that will hold the processed text; for now it is a plain copy of the raw text
for frame in (allNews, allCommentary):
    frame['Processed'] = frame['Text']


In [3]:
#Preprocessing function
#problems with index 23 in all_commentary
def processText(oldString):
    """Strip author lead-ins, trailing notes and HTML markup from commentary text.

    Parameters
    ----------
    oldString : object
        Raw article body; converted with ``str()``. ``None`` is treated as
        an empty body.

    Returns
    -------
    str
        The text with the leading author/heading patterns removed, cut at
        the first trailing-note marker, newlines turned into spaces, and
        HTML tags and HTML entities removed. Text in square brackets is
        NOT removed, apart from what the ``&quot; [`` cut-off discards.
    """
    if oldString is None:
        # str(None) would otherwise yield the literal text 'None'.
        return ''
    newString = str(oldString)

    #First remove the author information / heading that precedes the body.
    #'.' does not match a newline, so each pattern only applies when its
    #marker sits on the first line of the text.
    leading_patterns = (
        "^.*?:<br /><br />&quot;|^.*?: <br /><br />&quot;",
        "^.*?]: &quot;",
        "^.*?<span style=.*?><b>",
        "^.*?<font SIZE=3><b>",
        '^.*?\n\n<hr size="1" />\n\n<span style="font-size: medium;"><b>',
    )
    for pattern in leading_patterns:
        newString = re.sub(pattern, '', newString)

    #Cut the text at each trailing-note marker, in this order; everything
    #from the marker onwards is discarded. These are literal substrings.
    trailing_markers = (
        "<br /><br /><em>",
        "<p><em><br />",
        "&quot; [",
        "<em>............................................................*",
    )
    for marker in trailing_markers:
        cut = newString.find(marker)
        if cut != -1:
            newString = newString[:cut]

    #Next remove, the paragraph and em tags
    newString = re.sub("<p>|</p>|<em>|</em>", '', newString)
    #Replace the newlines with spaces
    newString = re.sub("\n", ' ', newString)
    #Get rid of the links using the <a> tag, of the format <a href ...> text to be linked </a>
    newString = re.sub("</a>|<a.*?>", '', newString)
    #Remove any other tags (br, blockquote, ...) and HTML entities such as &nbsp;.
    #The entity pattern is restricted to real entities (&name; / &#123;) so a
    #bare ampersand as in "R&D ...;" no longer swallows the text up to the next ';'.
    newString = re.sub(r"<br />|&#?\w+;|<.*?>", '', newString)
    newString = re.sub("--------", '', newString)
    return newString
    
   
#Preprocessing function for News articles where there are no authors, need to figure out how to represent [things in brackets]
def processNews(oldString):
    """Strip HTML markup from the body of a news article.

    Parameters
    ----------
    oldString : object
        Raw article body; converted with ``str()``. ``None`` is treated as
        an empty body.

    Returns
    -------
    str
        The text with newlines turned into spaces and HTML tags and HTML
        entities removed. Text in square brackets is left untouched.
    """
    if oldString is None:
        # str(None) would otherwise yield the literal text 'None'.
        return ''
    newString = str(oldString)
    #First remove, the paragraph and em tags
    newString = re.sub("<p>|</p>|<em>|</em>", '', newString)
    #Replace the newlines with spaces
    newString = re.sub("\n", ' ', newString)
    #Get rid of the links using the <a> tag, of the format <a href ...> text to be linked </a>
    newString = re.sub("</a>|<a.*?>", '', newString)
    #Remove any other tags (br, blockquote, ...) and HTML entities such as &nbsp;.
    #The entity pattern is restricted to real entities (&name; / &#123;) so a
    #bare ampersand as in "AT&T ...;" no longer swallows the text up to the next ';'.
    newString = re.sub(r"<br />|&#?\w+;|<.*?>", '', newString)
    #TODO: bracketed text ([...]) is not handled yet
    return newString
    

In [4]:
# Only the 'Text' column is needed, so apply the cleaners to that column
# directly instead of building a row object for every record.
allCommentary['Processed'] = allCommentary['Text'].apply(processText)
allNews['Processed'] = allNews['Text'].apply(processNews)

In [5]:
import pickle
# Save both dataframes to disk. The files are opened in `with` blocks so
# they are flushed and closed as soon as each dump finishes.
filename1 = 'allCommentary'
with open(filename1, 'wb') as pickle_file:
    pickle.dump(allCommentary, pickle_file)
filename2 = 'allNews'
with open(filename2, 'wb') as pickle_file:
    pickle.dump(allNews, pickle_file)

In [None]:
# To reload a saved dataframe later, run: allCommentary = pickle.load(open('allCommentary', 'rb'))

In [None]:
# Reload the saved dataframes from disk.
# NOTE: pickle.load can execute arbitrary code; only load files that were
# written by this notebook or come from a trusted source.
with open('allCommentary', 'rb') as pickle_file:
    allCommentary = pickle.load(pickle_file)
with open('allNews', 'rb') as pickle_file:
    allNews = pickle.load(pickle_file)