In [58]:
from bs4 import BeautifulSoup
import re


class HTMLCorrector:
    def __init__(self, path):
        """ This class represents the HTML code corrector . The purpose of this class is format the .htm file 
         according to the reference pdf generated (by Adobe Framemaker). 
        
        Args:
        - path: Path of the generated .htm file by Framemaker
        - header: A string that would store the header for correct HTML file
        
        """
        self.path = path
        self.header="" ## This will be updated_elementsted, once we will read the .HTML file generated by Framemaker
    
    def __read_file(self):
        """ Function to head the .HTML file and convert it into a string 
        
        Args:
        - path: Path of the generated .htm file by Framemaker
        
        Output:
        - content: A string that has the unformatted HTML code.
        """
        code=""
        with open(self.path, "r") as f:
            for line in f.readlines():            
                code =code+line
        return code
    
    
    def __add_pagenumbers(self,header):
          """ Function to add pagenumbers to the HTML file 
        
        Args:
        - header: Header portion of the HTML file
        
        Output:
        - updated_header: Header with embedded javascript code for page numbers
        """
        with open ("PNumberScript.js", "r") as myfile:
            JSCODE=myfile.readlines()
        start= """<script type="text/javascript">window.onload = addPageNumbers;"""
        end = "</script>"
        
        htmlcode =  start + '\n'.join(JSCODE) +end
        updated_header = header.replace("<head>","<head>"+htmlcode)
        return updated_header
    
    
    
    def __add_padding(self,x):
         """ Function to add paddings to the HTML file
        
        Args:
        - x: content of the HTML file
        
        Output:
        - x_updated: content with padding of 5%
        """
        
        x = x.replace("<body>","<body>"+"""<div style="padding:5%;">""")
        x_updated = x.replace("</body>","</div>"+"</body>")
        return x_updated
        
        
        
    def __add_header(self,x):
        """ Function to add header in our corrected HTML file 
        
        Args:
        - x: code without header 
        
        Output:
        - code_with_header: A string that has the HTML code along with the header.
        """
        si = x.find('<head>')
        ei = x.find('</head>')
        self.header = self.__add_pagenumbers(self.header)
        code_with_header = x[0:si-1]  + self.header + x[ei+7:]
        return code_with_header
    
        
   
    def __format_caption(self,temp):        
        """ This function will change the formatting of caption. It performs the following operations:
            1) Remove the </br> tag in between caption and text
            2) Remove </p> tag to avoid the next line
            3) Bolding the caption
        
        Args:
        - temp: HTML code string for caption tag. 
        
        Output:
        - f_string: A string that has the formatted HTML code for caption.
        """
        temp = (temp.replace('<br/>',''))
        temp = (temp.replace('</p>',''))
        f_string = temp.split('\n')
        f_string.insert(1, '<Strong>') 
        f_string.append('</Strong>')
        return ((''.join(f_string))) 
    
    
    def __format_text(self,temp):        
        """ This function will change the formatting of caption. It performs the following operations:
            1) Remove the <p> tag in between caption and text
            2) Split the string to get the required part of it.
        
        Args:
        - temp: HTML code string for text tag. 
        
        Output:
        - f_string: A string that has the formatted HTML code for text.
        """
        f_string = temp.split('\n')
        return ((''.join(f_string[1:])))
    
    def __change_finger_image(self,tag):
        """
        The function will change the image of the bullet points.

        Args:
        - tag: string of <p> tag element
        
        Output:
        - tag: Updated string that has the correct code for <p> tag element .
        """
        
        if bool(re.search("FingerParInd", tag)):
            tag = " ".join(re.split("\s+", tag, flags=re.UNICODE))
            tag =tag.replace("""<span class="FingerSolid"> * </span>""", """<span class="FingerSolid"> <img src="assets/FingerParInd.png" style="width:1.1rem;height:0.8rem;" > </span>""")          
            return tag
        elif bool(re.search("Finger2dParInd", tag)) or bool(re.search("Finger3dParInd", tag))  :
            tag = " ".join(re.split("\s+", tag, flags=re.UNICODE))
            tag =tag.replace("""<span class="FingerSolid"> + </span>""", """<span class="FingerSolid"> <img src="assets/Finger2dParInd.png" style="width:1.1rem;height:0.8rem;" > </span>""")          
            return tag
        else:
            return tag
        
    
    def __change_bullet_image(self,tag):
        """
        The function will change the image of the bullet points.

        Args:
        - tag: string of <p> tag element
        
        Output:
        - tag: Updated string that has the correct code for <p> tag element .
        """
        
        if bool(re.search("SquareHollowBullets", tag)) or bool(re.search("SquareBullets", tag)):
            tag = " ".join(re.split("\s+", tag, flags=re.UNICODE))
            tag =tag.replace("""<span class="SquareHollowBullets"> o </span>""", """<span class="SquareHollowBullets"> <img src="assets/SquareHollowBullets.png" style="width:0.8rem;height:0.8rem;" > </span>""")
            tag =tag.replace("""<span class="SquareBullets"> n </span>""", """<span class="SquareBullets"> <img src="assets/SquareBullets.png" style="width:0.8rem;height:0.8rem;" > </span>""")
            
            return tag
        else:
            return tag
        
    def __indent_bullets(self,tag):
        """
        The function will correct the class of the bullet points with wrong indentation level

        Args:
        - tag: string of <p> tag element
        
        Output:
        - tag: Updated string that has the correct code for <p> tag element .
        """
        if bool(re.search("SquareHollowBullets", tag)) and bool(re.search("BulletPar2dInd", tag)):
            tag = tag.replace("BulletPar2dInd","Bullet2dPar")
            return tag
        else:
            return tag
        
    def __bold_example_caption(self,tag):
        """
        The function will correct the font-style of Example captions

        Args:
        - tag: string of <p> tag element
        
        Output:
        - tag: Updated string that has the correct code for <p> tag element .
        """
        r ="Example.*:"
        repl = "<strong>" + re.findall(r, tag)[0] +"</strong>"
        tag = re.sub(r, repl, tag)
        return tag
    
    def beautify(self,content,indent_level=4): 

        """ The .HTML file generated by Framemaker has un formatted code. This function uses HTMLBeautifier to
        beautify the code.

        Args:
        - content: unformatted HTML code string. 
        - indent_level: Indentation level of the beatufied code. 

        Output:
        - beautified_code: A string that has the beatufied HTML code.
        """
        assert indent_level > 0, f"Indent level must be greater than zero. Got: {indent_level}"

        from html5print import HTMLBeautifier
        beautified_code= (HTMLBeautifier.beautify(content, indent_level))
        return beautified_code

    
    
    def correctify_code(self):
        """ The function will perform all the operations needed to correct the format of the
        HTML code.
        
        Output:
        - corrected_code: A string that has the code after applying the 
                          pre-processing operations.
        """
        file = self.__read_file() ## Reading the file
        b_code = self.beautify(file) ## Beautifying it
        pair_check=False 
        soup = BeautifulSoup(b_code)
        all_elements = soup.findAll('p')
        self.header = str(soup.findAll('head')[0])
        
        
        updated_elements=[]
        for i in range(len(all_elements)):

            element = str(all_elements[i]) ## Converting div type to string
            temp = "".join([s for s in element.strip().splitlines(True) if s.strip()]) ## Remving lines with no text
            temp = self.__indent_bullets(temp) ## Correcting indentation of bullets
            temp = self.__change_bullet_image(temp) ## Correcting images of bullets
            temp = self.__change_finger_image(temp) ## Correcting images of fingers
            ## Patterns for detection of caption and Text
            regex_caption = r'_[A-Za-z0-9]+Capt'
            regex_textI = r'_[A-Za-z0-9]+Text'
            regex_textII = r'_[A-Za-z0-9]+SecondPar'
            regex_example ="Example.*:"
            
            
            btemp =self.beautify(temp)     
            if bool(re.search(regex_caption, temp)) or bool(re.search("_Q1", temp)) or bool(re.search("_R1", temp)):
                updated_elements.append(self.__format_caption(temp))
                pair_check=True

            elif ( bool(re.search(regex_textI, temp)) or bool(re.search(regex_textII, temp)) or bool(re.search("_R2", temp)) or bool(re.search("_Q2", temp)) ) and pair_check:
                updated_elements.append(self.__format_text(temp))
                pair_check=False
            
            elif bool(re.search(regex_example, btemp)):
                updated_elements.append(self.__bold_example_caption(btemp))
            else:
                updated_elements.append(temp)
        corrected_code = self.beautify(''.join(updated_elements), 4)
        ## Removing gap between caption and text
        corrected_code = corrected_code.replace('&nbsp;','')
        return corrected_code
    
    
    
    def generate_correct_file(self,filename):
        """
        The function will correct the class of the bullet points with wrong indentation level

        Args:
        - filename: Name of the updated file 
        
        Output:
        - An output file with name as "filename" (string) in the present working directory
        """
        result = self.correctify_code()
        f_result = self.__add_header(result)
        f_result = self.__add_padding(f_result)
        
        
        with open(filename, "w") as text_file:
            text_file.write(f_result)
        
        

# main func
if __name__ == '__main__':
    Filename = './Ch14.htm' ## Path of the file generated by Frame maker
    Updated_File = "Formatted_Ch14.html" ## Name of the output file 
    HTMLCorrector(Filename).generate_correct_file(Updated_File)         
    
        


In [127]:
sentence = """<span class="SquareHollowBullets">
                    o   
                </span>"""

In [128]:
import re
sentence = " ".join(re.split("\s+", sentence, flags=re.UNICODE))
sentence

'<span class="SquareHollowBullets"> o </span>'

In [8]:
from html5print import HTMLBeautifier

In [99]:
with open("Formatted1_Ch14.html", "w") as text_file:
    text_file.write(x)
    

In [11]:
from bs4 import BeautifulSoup

In [13]:
import urllib
url = urllib.urlopen('http://meinparlament.diepresse.com/')
content = url.read()
soup = BeautifulSoup(content, 'lxml')

AttributeError: module 'urllib' has no attribute 'urlopen'

In [41]:
import re

data = '<div class="media-story">content and content</div>'
match = re.findall(r'<p*>*</p>', x)
match

[]

In [37]:
import re
clean = re.compile('<p.*?/p>')
re.findall(clean, x)

[]

In [None]:
text=""
with open("Ch14.htm", "r") as f:
    with open("Formatted_Ch14.html", "w") as text_file:
        for line in f.readlines(): 
            if bool(re.search('<p ', line)):
                
                

In [4]:
from bs4 import BeautifulSoup

In [67]:
soup = BeautifulSoup(x)
soup.findAll('head')[0].append('hgfhg')

In [68]:
soup.findAll('head')[0]

<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<title>
            Ch14
        </title>
<link href="../rhstylemapping.css" rel="stylesheet" type="text/css"/>
<link href="Ch14.css" rel="stylesheet" type="text/css"/>
<link href="../editstyle.css" rel="stylesheet" type="text/css"/>
<script language="JavaScript" type="text/javascript">
            function reDo() {
                if (innerWidth != origWidth || innerHeight != origHeight) location.reload();
            }
            if ((parseInt(navigator.appVersion) == 4) && (navigator.appName == "Netscape")) {
                origWidth = innerWidth;
                origHeight = innerHeight;
                onresize = reDo;
            }
            onerror = null;
        </script>
<style type="text/css">
            <!--
            div.WebHelpPopupMenu { position:absolute;
            left:0px;
            top:0px;
            z-index:4;
            visibility:hidden; }
            p.WebHelpNavBar { text-a

In [44]:
import re
upda=[]
for i in range(len(arr)):
    
    t= str(arr[i])
    temp = "".join([s for s in t.strip().splitlines(True) if s.strip()])
    regex_caption = r'_[A-Za-z0-9]+Capt'
    regex_textI = r'_[A-Za-z0-9]+Text'
    regex_textII = r'_[A-Za-z0-9]+SecondPar'
    
   
    if bool(re.search(regex_caption, temp)):
        temp = (temp.replace('<br/>',''))
        temp = (temp.replace('</p>',''))
        splits = temp.split('\n')
        splits.insert(1, '<Strong>') 
        splits.append('</Strong>')
        upda.append((''.join(splits)))
        pair_check=True
    
    elif ( bool(re.search(regex_textI, temp)) or bool(re.search(regex_textII, temp)) ) and pair_check:
        splits = temp.split('\n')
        upda.append((''.join(splits[1:])))
        pair_check=False
        
    else:
        upda.append(temp)
        
        
        
    

In [51]:
t="""
  <p class="Finger3dParInd" id="32712" style="text-indent : 0.00pt;">
            <span style="font-weight:Bold;">
                Example:
            </span>
            University, a public university, passes a speech code that says &ldquo;All students are forbidden to harass or threaten other students on the basis of race, gender, or sexual orientation. Violations are punishable by suspension.&rdquo; Since the code bans just certain types of harassment or threats (those based on race, gender and sexual orientation) and not others (e.g., those based on political affiliation), the code is a content-based regulation, and must be strictly scrutinized. It will probably fail that scrutiny, since there are content-neutral alternatives that are less restrictive of speech and will do the job almost as well (e.g., a code banning
            <span style="font-style:Italic;">
                all
            </span>
            harassments and threats, regardless of the speaker&rsquo;s motive).
            <br>
        </p>
            """
r ="Example.*[\s\S]*</"
re.findall(r, t)

# repl = "<strong>" + re.findall(r, t) +"</strong>"
# print(re.sub(r, repl, t, count=0, flags=0))

['Example:\n            </span>\n            University, a public university, passes a speech code that says &ldquo;All students are forbidden to harass or threaten other students on the basis of race, gender, or sexual orientation. Violations are punishable by suspension.&rdquo; Since the code bans just certain types of harassment or threats (those based on race, gender and sexual orientation) and not others (e.g., those based on political affiliation), the code is a content-based regulation, and must be strictly scrutinized. It will probably fail that scrutiny, since there are content-neutral alternatives that are less restrictive of speech and will do the job almost as well (e.g., a code banning\n            <span style="font-style:Italic;">\n                all\n            </span>\n            harassments and threats, regardless of the speaker&rsquo;s motive).\n            <br>\n        </']

In [37]:
print(re.sub(r, repl, t, count=0, flags=0))


  <p class="Finger3dParInd" id="32712" style="text-indent : 0.00pt;">
            <span style="font-weight:Bold;">
                <strong>Example:</strong>
            </span>
            University, a public university, passes a speech code that says &ldquo;All students are forbidden to harass or threaten other students on the basis of race, gender, or sexual orientation. Violations are punishable by suspension.&rdquo; Since the code bans just certain types of harassment or threats (those based on race, gender and sexual orientation) and not others (e.g., those based on political affiliation), the code is a content-based regulation, and must be strictly scrutinized. It will probably fail that scrutiny, since there are content-neutral alternatives that are less restrictive of speech and will do the job almost as well (e.g., a code banning
            <span style="font-style:Italic;">
                all
            </span>
            harassments and threats, regardless of the speake

In [34]:
print(upda)

['<p class="ChapNo" id="8528">\n                Chapter 14\n                <br/>\n</p>', '<p class="_L0" id="13178">\n<a id="index_15298" name="index_15298">\n</a>\n<a data-indexterm="%3C$nopage%3EFIRST%20AMENDMENT:%3CIndexItal%3ESee%3CDefault%20Para%20Font%3E%20FREEDOM%20OF%20SPEECH,%20FREEDOM%20OF%20ASSOCIATION,%20FREEDOM%20OF%20EXPRESSION,%20FREEDOM%20OF%20RELIGION" id="%3C$nopage%3EFIRST_AMENDMENT_%3CIndexItal%3ESee%3CDefault_Para_Font%3E_FREEDOM_OF_SPEECH,_FREEDOM_OF_ASSOCIATION,_FREEDOM_OF_EXPRESSION,_FREEDOM_OF_RELIGION" name="%3C$nopage%3EFIRST_AMENDMENT_%3CIndexItal%3ESee%3CDefault_Para_Font%3E_FREEDOM_OF_SPEECH,_FREEDOM_OF_ASSOCIATION,_FREEDOM_OF_EXPRESSION,_FREEDOM_OF_RELIGION">\n</a>\n                FREE\n                <a id="index_15299" name="index_15299">\n</a>\n<a data-indexterm="%3C$nopage%3EFREEDOM%20OF%20EXPRESSION:%3CIndexItal%3ESee%3CDefault%20Para%20Font%3E%20FREEDOM%20OF%20SPEECH,%20FREEDOM%20OF%20ASSOCIATION" id="%3C$nopage%3EFREEDOM_OF_EXPRESSION_%3CIndexIt

In [49]:
out = out.replace('&nbsp;','')

In [50]:
with open("chunk.html", "w") as text_file:
    text_file.write(out)

In [111]:
import re
 
# Make a regular expression
# for validating an Email
regex = r'_[A-Za-z0-9]+[Capt|Text]'
 
# Define a function for
# for validating an Email
 
 
def check(email):
 
    # pass the regular expression
    # and the string into the fullmatch() method
    if(re.search(regex, email)):
        print("Valid Email")
 
    else:
        print("Invalid Email")
 
 
# Driver Code
if __name__ == '__main__':
 
    # Enter the email
    email = "_L8Capit"
 
    # calling run function
    check(email)
 
    email = "my.ownsite@our-earth.org"
    check(email)
 
    email = "ankitrai326.com"
    check(email)

Valid Email
Invalid Email
Invalid Email


In [22]:
import re

def useRegex(input):
    pattern = re.compile(r"\\.&nbsp;&nbsp;&nbsp;")
    return pattern.search(input, re.IGNORECASE)

k ="""<p class="_L6Capt" id="8560">
            <strong>
                2.&nbsp;&nbsp;&nbsp; Some examples:
                <span style="font-weight:normal;">
                </span>
            </strong>
            Following are some examples of governmental actions punishing or restricting speech, and the track into which each falls:
            <br>
        </p>"""

In [24]:
HTMLBeautifier.beautify(k.replace('&nbsp;',''))

'<html>\n  <head>\n  </head>\n  <body>\n    <p class="_L6Capt" id="8560">\n      <strong>\n        2. Some examples:\n        <span style="font-weight:normal;">\n        </span>\n      </strong>\n      Following are some examples of governmental actions punishing or restricting speech, and the track into which each falls:\n      <br>\n    </p>\n  </body>\n</html>\n'

In [29]:
upda2=[]
for i in range(len(upda)):
    
    t= str(arr[i])
    upda2.append(t.replace('&nbsp;',''))