In [None]:
from deep_translator import GoogleTranslator
from langdetect import detect
import re
from nltk.tokenize import sent_tokenize

import os
from bs4 import BeautifulSoup
import PyPDF2
import json

# Shared translator instance: the source language is auto-detected
# (source='auto') and the output language is English (target='en').
translator = GoogleTranslator(source='auto', target='en')

def translate(text, max_chars=4999):
    """Translate *text* to English using the module-level ``translator``.

    The text is sent in consecutive slices of at most *max_chars* characters
    and the translated slices are joined with a single space. Every slice is
    translated, so long inputs are no longer cut off after the second slice.

    Args:
        text: The string to translate.
        max_chars: Maximum number of characters sent per request. The default
            of 4999 is presumably chosen to stay below the translator's
            per-request size limit -- confirm against the deep_translator
            documentation before raising it.

    Returns:
        The translated text as a single string.

    Note:
        Slices are cut at fixed character offsets, so a cut may fall in the
        middle of a word or sentence.
    """
    # max(len(text), 1) guarantees one request even for empty input, which
    # keeps the result for "" identical to a plain translator.translate("").
    pieces = [
        text[start:start + max_chars]
        for start in range(0, max(len(text), 1), max_chars)
    ]
    return " ".join(translator.translate(piece) for piece in pieces)


def is_english(text):
    """Return True when ``langdetect.detect`` classifies *text* as English.

    This is a best-effort check: if detection raises (for instance because
    the text cannot be analysed), the text is reported as not English.

    Args:
        text: The string to classify.

    Returns:
        True if the detected language code is ``'en'``, otherwise False.
    """
    try:
        return detect(text) == 'en'
    except Exception:
        # Deliberately broad so any detection failure means "not English",
        # but no longer a bare ``except`` -- KeyboardInterrupt and SystemExit
        # must still be able to stop a long-running batch.
        return False


def split_text_into_chunks(text, max_length):
    """Group the sentences of *text* into chunks shorter than *max_length*.

    Sentences are split after ``.``, ``!`` or ``?`` followed by one or more
    spaces. Consecutive sentences are packed into the same chunk (separated
    by a single space) as long as the chunk stays strictly shorter than
    *max_length* characters.

    Args:
        text: The text to split.
        max_length: Exclusive upper bound on the length of a packed chunk.

    Returns:
        A list of non-empty, stripped chunks in original order. A single
        sentence that is itself *max_length* characters or longer is not cut;
        it is returned whole as its own oversized chunk. Empty or
        whitespace-only input yields an empty list.
    """
    sentences = re.split(r'(?<=[.!?]) +', text)  # Split text into sentences
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) < max_length:
            current_chunk += sentence + " "
            continue

        # The sentence does not fit: close the running chunk. It is still
        # empty when the very first sentence is already too long, so only
        # keep it when it actually holds text.
        finished = current_chunk.strip()
        if finished:
            chunks.append(finished)
        current_chunk = sentence + " "

    # Test the stripped text: the running chunk always carries a trailing
    # space, so it is truthy even when it contains nothing useful.
    remainder = current_chunk.strip()
    if remainder:
        chunks.append(remainder)

    return chunks

def remove_extra_spaces_and_newlines(text):
    """Collapse every run of whitespace in *text* into one space.

    Newlines, tabs and repeated spaces are all replaced by a single space,
    and leading/trailing whitespace is removed from the result.
    """
    return re.sub(r'\s+', ' ', text).strip()



# Root folders of the collected privacy content; each one is expected to hold
# one sub-folder per domain (see get_privacy_content).
DIR_MAPS_PRIVACY_HTML = '../../MAPS Output/HTML Content/'
DIR_MAPS_PRIVACY_PDF = '../../MAPS Output/PDF Content/'
# Chunk size passed to split_text_into_chunks; equal to the 4999-character
# slice size that translate() uses per request.
MAX_LENGTH = 4999

def _chunks_to_english(text):
    """Split *text* into chunks and translate the non-English ones."""
    return [
        chunk if is_english(chunk) else translate(chunk)
        for chunk in split_text_into_chunks(text, MAX_LENGTH)
    ]


def get_privacy_content(domain):
    """Collect the privacy text stored for *domain* as one English string.

    Every file in the domain's HTML folder is parsed with BeautifulSoup and
    reduced to plain text; every file in the domain's PDF folder is read as
    UTF-8 text. Each text is split into chunks, non-English chunks are
    translated, and all chunks are joined with single spaces before the
    whitespace is normalised.

    Args:
        domain: Name of the per-domain sub-folder under both content roots.

    Returns:
        A tuple ``(num_html, num_pdf, content)``: the number of entries
        listed in the HTML folder, the number listed in the PDF folder, and
        the cleaned, combined text.

    Raises:
        OSError: If either domain folder is missing (``FileNotFoundError``
            from ``os.listdir``) or a listed entry cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8 text.
    """
    html_folder_path = os.path.join(DIR_MAPS_PRIVACY_HTML, domain)
    pdf_folder_path = os.path.join(DIR_MAPS_PRIVACY_PDF, domain)

    html_files_in_folder = os.listdir(html_folder_path)
    pdf_files_in_folder = os.listdir(pdf_folder_path)

    # Chunks come back stripped, so they are collected in a list and joined
    # with a space at the end; plain concatenation would glue the last word
    # of one chunk (or file) to the first word of the next.
    pieces = []

    # HTML files: strip the markup and keep only the visible text.
    for file in html_files_in_folder:
        html_file_path = os.path.join(html_folder_path, file)
        with open(html_file_path, 'r', encoding='utf-8') as html_file:
            html_content = html_file.read()
        plain_text = BeautifulSoup(html_content, 'html.parser').get_text()
        pieces.extend(_chunks_to_english(plain_text))

    # NOTE(review): entries of the PDF folder are read as UTF-8 text, not
    # parsed as PDF -- presumably they already contain extracted text;
    # confirm, since a binary PDF would fail to decode here.
    for file in pdf_files_in_folder:
        pdf_file_path = os.path.join(pdf_folder_path, file)
        with open(pdf_file_path, 'r', encoding='utf-8') as pdf_file:
            pdf_content = pdf_file.read()
        pieces.extend(_chunks_to_english(pdf_content))

    content = " ".join(pieces)
    return len(html_files_in_folder), len(pdf_files_in_folder), remove_extra_spaces_and_newlines(content)


# JSON file in which update_summery() stores one entry per domain.
FILE_SUMMERY = '../../MAPS Output/summery.json'

def update_summery(domain, data, summary_path=None):
    """Store *data* under *domain* in the JSON summary file.

    The file is read if it exists, the entry for *domain* is added or
    replaced, and the whole mapping is written back with 4-space indentation.

    Args:
        domain: Key under which the entry is stored.
        data: JSON-serialisable value to store for the domain.
        summary_path: Path of the summary file. Defaults to ``FILE_SUMMERY``
            when omitted.

    Raises:
        json.JSONDecodeError: If the existing file does not contain valid
            JSON.
    """
    path = FILE_SUMMERY if summary_path is None else summary_path

    # Entries are stored by domain key, so the container must be a dict;
    # starting from a list would make the first write (no file yet) fail
    # with TypeError.
    existing_data = {}
    try:
        with open(path, 'r', encoding='utf-8') as json_file:
            existing_data = json.load(json_file)
    except FileNotFoundError:
        # First run: there is nothing to merge with yet.
        pass

    existing_data[domain] = data

    # Write the combined data back to the file
    with open(path, 'w', encoding='utf-8') as json_file:
        json.dump(existing_data, json_file, indent=4)

In [None]:
import pandas as pd

# Input: CSV that must contain a 'domain' column.
# Output: despite its name, output_file is a directory; the (currently
# disabled) code below writes one '<domain>.txt' file into it.
input_file = '../../Endpoint Mapping Data/Domain Data/v5_unique_domains.csv'
output_file = '../../MAPS Output/Cleaned Content/'


remote_hostname = pd.read_csv(input_file)
# TODO: testing only -- process a random sample of 5 rows instead of the
# whole list (the CSV therefore needs at least 5 rows). Remove for a full run.
remote_hostname=remote_hostname.sample(5)

# Collect the English privacy text for every selected domain and report how
# many HTML and PDF files were found for it.
for domain in remote_hostname['domain']:
    num_html, num_pdf, privacy_content = get_privacy_content(domain)
    # TODO: uncomment to record the file counts in the summary and to write
    # the cleaned text to '<output_file>/<domain>.txt'.
    # update_summery(domain, {'num_html': num_html, 'num_pdf':num_pdf})
    # file_path = os.path.join(output_file, domain+'.txt')
    # with open(file_path, 'w', encoding='utf-8') as file:
    #     file.write(privacy_content)
    print(num_html, num_pdf)
