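# Simple example of reading a web page and converting it to plain text
How the code works:
* The **requests** package is used to load the web page from the URL given in the variable *documentURL*.
* The **BeautifulSoup4 (bs4)** package is used to parse the content of the loaded web page.
* The call to *soup.get_text()* in the last line returns the content of the page as plain text.

**Note:** the code cells below do not use *requests* or *BeautifulSoup4*. They load `SampleData.csv` with **pandas**, translate each e-mail body with **googletrans**, cut the text at common closing greetings, and save the result to `Sample_Data_2.csv`.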

In [208]:
import pandas as pd

In [209]:
# Load the sample e-mails produced by the previous step (XML parsing).
# NOTE(review): the 'Unnamed: 0' column in the displayed frame suggests the CSV was
# saved together with its index; index_col=0 would avoid it - confirm before changing.
df_data = pd.read_csv('SampleData.csv')

In [210]:
df_data  # Display the loaded DataFrame (columns: from, to, subject, body).

Unnamed: 0,from,to,subject,body
0,dummyfrom1@rabobank.nl,dummyto1@rabobank.nl,Dummy Subject for this email 1,\nNieuw-Vennep 24-09-2018.\n\nGoedemorgen ik h...
1,dummyfrom2@rabobank.nl,dummyto2@rabobank.nl,Dummy Subject for this email 2,"\nGoedemorgen heer, mevrouw, \n\nConform het t..."
2,dummyfrom7@rabobank.nl,dummyto7@rabobank.nl,Dummy Subject for this email 7,"\nGeachte heer, mevrouw,\n\nNaar aanleiding va..."
3,dummyfrom0@rabobank.nl,dummyto0@rabobank.nl,Dummy Subject for this email 0,"\nHello, \nMy mother, Frans MARLIER want to tr..."
4,dummyfrom9@rabobank.nl,dummyto9@rabobank.nl,Dummy Subject for this email 9,\nHi again! \nI want to confirm that you can c...
5,dummyfrom4@rabobank.nl,dummyto4@rabobank.nl,Dummy Subject for this email 4,"\nGeachte heer, mevrouw, \nIn verband met de a..."
6,dummyfrom3@rabobank.nl,dummyto3@rabobank.nl,Dummy Subject for this email 3,"\nGoedemorgen, \n\nIk ben de wettelijk vertege..."
7,dummyfrom6@rabobank.nl,dummyto6@rabobank.nl,Dummy Subject for this email 6,\nBeste \n\nIk heb enige tijd geleden een schr...


In [211]:
from googletrans import Translator  # googletrans is used to translate the English text to Dutch.

In [212]:
# Translator instance used by the translation loop further down.
translator = Translator()

In [213]:
df_data["input_data"] = ""  # New column, initialised to empty strings, that will hold the pre-processed text.

In [214]:
df_data  # Display the frame to check that the (still empty) input_data column was added.

Unnamed: 0,from,to,subject,body,input_data
0,dummyfrom1@rabobank.nl,dummyto1@rabobank.nl,Dummy Subject for this email 1,\nNieuw-Vennep 24-09-2018.\n\nGoedemorgen ik h...,
1,dummyfrom2@rabobank.nl,dummyto2@rabobank.nl,Dummy Subject for this email 2,"\nGoedemorgen heer, mevrouw, \n\nConform het t...",
2,dummyfrom7@rabobank.nl,dummyto7@rabobank.nl,Dummy Subject for this email 7,"\nGeachte heer, mevrouw,\n\nNaar aanleiding va...",
3,dummyfrom0@rabobank.nl,dummyto0@rabobank.nl,Dummy Subject for this email 0,"\nHello, \nMy mother, Frans MARLIER want to tr...",
4,dummyfrom9@rabobank.nl,dummyto9@rabobank.nl,Dummy Subject for this email 9,\nHi again! \nI want to confirm that you can c...,
5,dummyfrom4@rabobank.nl,dummyto4@rabobank.nl,Dummy Subject for this email 4,"\nGeachte heer, mevrouw, \nIn verband met de a...",
6,dummyfrom3@rabobank.nl,dummyto3@rabobank.nl,Dummy Subject for this email 3,"\nGoedemorgen, \n\nIk ben de wettelijk vertege...",
7,dummyfrom6@rabobank.nl,dummyto6@rabobank.nl,Dummy Subject for this email 6,\nBeste \n\nIk heb enige tijd geleden een schr...,


In [215]:
# For every e-mail: replace the '\r' and '\n' characters in the body with spaces
# (this enables the translate function to work properly), translate the flattened
# text (src='en', dest='nl') and store the result in the 'input_data' column.
for index, row in df_data.iterrows():
    # `index` is the row *label* yielded by iterrows(), so read the body from `row`
    # instead of feeding the label to the positional .iloc indexer.
    text_to_translate = row['body'].replace('\r', ' ').replace('\n', ' ')
    text_converted = translator.translate(text_to_translate, dest='nl', src='en')
    # Write with a single .loc[row, column] call. The chained form
    # df_data.iloc[index]['input_data'] = ... assigns into a temporary row object
    # and is not guaranteed to update df_data itself.
    df_data.loc[index, 'input_data'] = text_converted.text
    print ("Done!")
    

Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!


In [216]:
df_data  # input_data now contains the translated texts.

Unnamed: 0,from,to,subject,body,input_data
0,dummyfrom1@rabobank.nl,dummyto1@rabobank.nl,Dummy Subject for this email 1,\nNieuw-Vennep 24-09-2018.\n\nGoedemorgen ik h...,Nieuw-Vennep 24-09-2018. Goedemorgen ik heb 18...
1,dummyfrom2@rabobank.nl,dummyto2@rabobank.nl,Dummy Subject for this email 2,"\nGoedemorgen heer, mevrouw, \n\nConform het t...","Goedemorgen heer, mevrouw, Conform het telefon..."
2,dummyfrom7@rabobank.nl,dummyto7@rabobank.nl,Dummy Subject for this email 7,"\nGeachte heer, mevrouw,\n\nNaar aanleiding va...","Geachte heer, mevrouw, Naar aanleiding van tel..."
3,dummyfrom0@rabobank.nl,dummyto0@rabobank.nl,Dummy Subject for this email 0,"\nHello, \nMy mother, Frans MARLIER want to tr...","Hallo, mijn moeder, Frans MARLIER wil alle bes..."
4,dummyfrom9@rabobank.nl,dummyto9@rabobank.nl,Dummy Subject for this email 9,\nHi again! \nI want to confirm that you can c...,Hallo opnieuw! Ik wil bevestigen dat je mijn a...
5,dummyfrom4@rabobank.nl,dummyto4@rabobank.nl,Dummy Subject for this email 4,"\nGeachte heer, mevrouw, \nIn verband met de a...","Geachte heer, mevrouw, In verband met de afwik..."
6,dummyfrom3@rabobank.nl,dummyto3@rabobank.nl,Dummy Subject for this email 3,"\nGoedemorgen, \n\nIk ben de wettelijk vertege...","Goedemorgen, Ik ben de wettelijke vertegenwoor..."
7,dummyfrom6@rabobank.nl,dummyto6@rabobank.nl,Dummy Subject for this email 6,\nBeste \n\nIk heb enige tijd geleden een schr...,Beste Ik heb enige tijd geleden at not least. ...


In [217]:
#Testing to remove some common greetings from the email. Need to build on it.
# Closing phrases searched for in the translated text; everything from the phrase
# onwards is treated as signature and cut off by the loop in the next cell.
# NOTE(review): this line was corrupted by pasted cell output in the middle of the
# last string; the fourth entry is reconstructed and reads as a duplicate of the
# first - confirm whether another variant was intended.
greetings = ['Met vriendelijke groeten','Met vriendelijke groet','Bij voorbaat dank','Met vriendelijke groeten']

In [218]:
#This code replaces the existing input_data column with the updated emails (removed the text after the greetings + the greetings itself)
#Need to build more to this.

for index, row in df_data.iterrows():
    x = row['input_data']
    # Position of the earliest greeting found so far; len(x) means "nothing to cut".
    cut_at = len(x)
    for i in greetings:
        pos = x.find(i)
        if pos == -1:
            print ("Not Found")
        else:
            print("Found")
            # Keep the earliest match so the result does not depend on the order
            # of `greetings` (previously the last matching entry decided the cut).
            cut_at = min(cut_at, pos)
    if cut_at < len(x):
        # Single .loc[row, column] write; the chained form
        # df_data.iloc[index]['input_data'] = ... is not guaranteed to update df_data.
        df_data.loc[index, 'input_data'] = x[:cut_at]
    print ("Done!")
    print ("--------")
    




Not Found
Found
Not Found
Not Found
Done!
--------
Not Found
Found
Not Found
Not Found
Done!
--------
Found
Found
Not Found
Found
Done!
--------
Not Found
Not Found
Found
Not Found
Done!
--------
Found
Found
Not Found
Found
Done!
--------
Not Found
Found
Not Found
Not Found
Done!
--------
Not Found
Found
Not Found
Not Found
Done!
--------
Not Found
Not Found
Not Found
Not Found
Done!
--------


In [220]:
df_data  # DataFrame after removing the signatures (input_data cut at the closing greeting).

Unnamed: 0,from,to,subject,body,input_data
0,dummyfrom1@rabobank.nl,dummyto1@rabobank.nl,Dummy Subject for this email 1,\nNieuw-Vennep 24-09-2018.\n\nGoedemorgen ik h...,Nieuw-Vennep 24-09-2018. Goedemorgen ik heb 18...
1,dummyfrom2@rabobank.nl,dummyto2@rabobank.nl,Dummy Subject for this email 2,"\nGoedemorgen heer, mevrouw, \n\nConform het t...","Goedemorgen heer, mevrouw, Conform het telefon..."
2,dummyfrom7@rabobank.nl,dummyto7@rabobank.nl,Dummy Subject for this email 7,"\nGeachte heer, mevrouw,\n\nNaar aanleiding va...","Geachte heer, mevrouw, Naar aanleiding van tel..."
3,dummyfrom0@rabobank.nl,dummyto0@rabobank.nl,Dummy Subject for this email 0,"\nHello, \nMy mother, Frans MARLIER want to tr...","Hallo, mijn moeder, Frans MARLIER wil alle bes..."
4,dummyfrom9@rabobank.nl,dummyto9@rabobank.nl,Dummy Subject for this email 9,\nHi again! \nI want to confirm that you can c...,Hallo opnieuw! Ik wil bevestigen dat je mijn a...
5,dummyfrom4@rabobank.nl,dummyto4@rabobank.nl,Dummy Subject for this email 4,"\nGeachte heer, mevrouw, \nIn verband met de a...","Geachte heer, mevrouw, In verband met de afwik..."
6,dummyfrom3@rabobank.nl,dummyto3@rabobank.nl,Dummy Subject for this email 3,"\nGoedemorgen, \n\nIk ben de wettelijk vertege...","Goedemorgen, Ik ben de wettelijke vertegenwoor..."
7,dummyfrom6@rabobank.nl,dummyto6@rabobank.nl,Dummy Subject for this email 6,\nBeste \n\nIk heb enige tijd geleden een schr...,Beste Ik heb enige tijd geleden at not least. ...


In [219]:
# Save the frame, including the new input_data column, as the new sample data file.
# NOTE(review): to_csv writes the index by default, which is presumably where the
# 'Unnamed: 0' column seen when loading SampleData.csv comes from - consider index=False.
df_data.to_csv("Sample_Data_2.csv")