### SAP Machine Learning Embedding in OpenAI - step 04
##### Author: Sergiu Iatco. May, 2023
https://people.sap.com/iatco.sergiu <br>
https://www.linkedin.com/in/sergiuiatco/ <br>

In [1]:
import os
import re
import pathlib
from bs4 import BeautifulSoup

class collect_text():
    """Convert HTML files into plain-text files.

    The text is extracted with BeautifulSoup and runs of newlines are
    collapsed to a single newline. Output files keep the source file name
    and receive the extension stored in ``self.mask_ext``.
    """

    def __init__(self, mask_ext = None):
        """Store the extension (dot included) used for generated files; defaults to '.txt'."""
        self.mask_ext = '.txt' if mask_ext is None else mask_ext

    def open_html(self, html_file, encoding_read = None):
        """Read ``html_file`` and return its content as a string.

        ``encoding_read`` defaults to 'utf-8' when it is None.
        """
        if encoding_read is None:
            encoding_read = 'utf-8'

        with open(html_file, encoding=encoding_read) as f:
            return f.read()

    def html_to_text(self, html_content):
        """Return the visible text of ``html_content`` with consecutive newlines collapsed."""
        # Name the parser explicitly: without it bs4 picks whichever parser
        # happens to be installed (and warns), so results vary per machine.
        soup = BeautifulSoup(html_content, 'html.parser')
        text = soup.get_text()
        return re.sub(r'\n+', '\n', text)

    def html_to_text_file(self, html_file, path_save = None, content = False, verbose = 0, encoding_read=None,
                          encoding_write = None):
        """Convert one HTML file to a text file.

        Parameters:
            html_file: path (str or path-like) of the HTML file to convert.
            path_save: optional root folder for the output. The source folder
                structure is recreated below it. When None the text file is
                written next to the HTML file.
            content: when True, the path of the written text file is returned.
            verbose: 1 prints the source/destination paths (and the output
                folders when ``path_save`` is used).
            encoding_read: encoding of the HTML file ('utf-8' when None).
            encoding_write: encoding of the text file ('utf-8' when None).

        Returns:
            The text file path when ``content`` is True, otherwise None.
        """
        html_content = self.open_html(html_file, encoding_read = encoding_read)
        text_content = self.html_to_text(html_content)

        if encoding_write is None:
            encoding_write = 'utf-8'

        filename_path, filename_with_ext = os.path.split(html_file)
        filename_ext_txt = os.path.splitext(filename_with_ext)[0] + self.mask_ext

        if path_save is not None:
            # os.path.join drops every component that precedes an absolute one,
            # so an absolute source folder would silently discard path_save.
            # Strip the drive/root to keep the output below path_save.
            separators = os.sep + (os.altsep or '')
            relative_dir = os.path.splitdrive(filename_path)[1].lstrip(separators)
            txt_file = os.path.join(path_save, relative_dir, filename_ext_txt)

            path_save_subdir_source = os.path.split(txt_file)[0]
            if path_save_subdir_source:
                # exist_ok avoids the check-then-create race of exists() + makedirs().
                os.makedirs(path_save_subdir_source, exist_ok=True)

            if verbose == 1:
                print(f"path_save: {path_save}")
                print(f"path_save_subdir_source: {path_save_subdir_source}")
        else:
            txt_file = os.path.join(filename_path, filename_ext_txt)

        with open(txt_file, 'w', encoding=encoding_write) as f:
            f.write(text_content)

        if verbose == 1:
            print(f"Source     : {html_file}")
            print(f"Destination: {txt_file}")

        if os.path.exists(txt_file):
            print("File conversion complete and save!")
        else:
            print("File was not saved!")

        if content == True:
            return txt_file

    def html_path_to_text(self, repo_path = None, path_save = None, encoding_read = None, encoding_write = None, verbose = 0):
        """Convert every ``*.html`` file found recursively under ``repo_path``.

        Parameters:
            repo_path: folder to search; the current directory when None.
            path_save: forwarded to ``html_to_text_file``.
            encoding_read / encoding_write: forwarded to ``html_to_text_file``.
            verbose: 0 - completion messages only | 1 - also source and saved file.
        """
        name_filter = "**/*.html"
        if repo_path is None:
            repo_path = ''

        # Materialise the list first so the final count is known.
        document_files = list(pathlib.Path(repo_path).glob(name_filter))

        for html_file in document_files:
            # Forward the caller's encodings (they used to be replaced by None).
            self.html_to_text_file(html_file, path_save = path_save, content = False, verbose = verbose,
                                   encoding_read = encoding_read, encoding_write = encoding_write)

        print(f"List conversion complete! Files: {len(document_files)}")



In [18]:
import pathlib

class clean_data():
    """Clean-up helpers for text files stored on disk."""

    def __init__(self):
        pass

    def tail_trimm(self, path,  tail_del_n = 3, file_extension = '.txt', verbose = 0, encoding = 'utf-8'):
        """Remove the last ``tail_del_n`` lines of every matching file under ``path``.

        Files are found recursively and rewritten in place, so calling this
        again on the same folder removes another ``tail_del_n`` lines.

        Parameters:
            path: folder to search.
            tail_del_n: number of trailing lines to drop; 0 keeps every line.
            file_extension: suffix of the files to process (dot included).
            verbose: 1 prints each processed file.
            encoding: encoding used to read and rewrite the files.

        Raises:
            ValueError: if ``tail_del_n`` is negative.
        """
        if tail_del_n < 0:
            raise ValueError(f"tail_del_n must be >= 0, got {tail_del_n}")

        path_lib_files = list(pathlib.Path(path).glob(f"**/*{file_extension}"))

        for file in path_lib_files:
            with open(file, 'r', encoding = encoding) as f:
                lines = f.readlines()

            # lines[:-0] equals lines[:0] (an empty list) and would wipe the
            # file, so compute the number of lines to keep explicitly.
            keep_n = max(len(lines) - tail_del_n, 0)
            modified_lines = lines[:keep_n]

            with open(file, 'w', encoding = encoding) as f:
                f.writelines(modified_lines)
            if verbose == 1:
                print(f"File processed: {file}")

        print(f"Files processed: {len(path_lib_files)}")

In [3]:
# # Example 1: convert a single html_file to text and save it under path_save
# ct = collect_text(mask_ext = '.txt')
# html_file = "html_files_test/ipynb_source_test/example notebook v1.html"
# path_save = 'html_files_test_txt'
# ct.html_to_text_file(html_file = html_file, path_save = path_save, verbose = 1)

In [4]:
# # Example 2: convert every html file under repo_path to text, saved next to each source file
# repo_path = 'html_files_test/'
# cl_to_text = collect_text(mask_ext = '.txt')
# cl_to_text.html_path_to_text(repo_path = repo_path, verbose = 1)

In [5]:
# # Example 3: convert every html file under repo_path to text and save the results under path_save
# repo_path = 'html_files_test/'
# path_save = 'html_files_test_path_txt'
# cl_to_text = collect_text(mask_ext = '.txt')
# cl_to_text.html_path_to_text(repo_path = repo_path, path_save = path_save, verbose = 1)

In [6]:
# Convert every .html file under repo_path1 (searched recursively) to .txt.
# No path_save is passed, so each .txt is written next to its source .html.
repo_path1 = 'llama_challenge/html_challenge/'

cl_to_text = collect_text(mask_ext = '.txt')
cl_to_text.html_path_to_text(repo_path = repo_path1, verbose = 1)

Source     : llama_challenge\html_challenge\understanding_metrics_blog.html
Destination: llama_challenge\html_challenge\understanding_metrics_blog.txt
File conversion complete and save!
Source     : llama_challenge\html_challenge\challenge_20221107.html
Destination: llama_challenge\html_challenge\challenge_20221107.txt
File conversion complete and save!
Source     : llama_challenge\html_challenge\challenge_20221128.html
Destination: llama_challenge\html_challenge\challenge_20221128.txt
File conversion complete and save!
Source     : llama_challenge\html_challenge\challenge_20221222.html
Destination: llama_challenge\html_challenge\challenge_20221222.txt
File conversion complete and save!
Source     : llama_challenge\html_challenge\hana_ml.dataframe.html
Destination: llama_challenge\html_challenge\hana_ml.dataframe.txt
File conversion complete and save!
Source     : llama_challenge\html_challenge\hana_ml.algorithms.pal.trees.HybridGradientBoostingClassifier.html
Destination: llama_challe

In [7]:
# Convert every .html file under repo_path2 (searched recursively) to .txt.
# No path_save is passed, so each .txt is written next to its source .html.
repo_path2 = "llama_challenge//ipynb_hana_ml_samples//Python-API//usecase-examples//sapcommunity-hanaml-challenge"
cl_to_text = collect_text(mask_ext = '.txt')
cl_to_text.html_path_to_text(repo_path = repo_path2, verbose = 1)

Source     : llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\10 Connectivity Check.html
Destination: llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\10 Connectivity Check.txt
File conversion complete and save!
Source     : llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\20 Data upload.html
Destination: llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\20 Data upload.txt
File conversion complete and save!
Source     : llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\PAL Tutorial - Unified Classification Hybrid Gradient Boosting - PredictiveQuality Example.html
Destination: llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\PAL Tutorial - Unified Classification Hybrid Gradient Boosting - PredictiveQuality Examp

In [8]:
# Convert every .html file under repo_path3 (searched recursively) to .txt.
# No path_save is passed, so each .txt is written next to its source .html.
repo_path3 = 'llama_challenge/ipynb_blog/'
cl_to_text = collect_text(mask_ext = '.txt')
# NOTE(review): repo_path is assigned here but the call below uses repo_path3 and
# no visible cell reads repo_path - looks like a leftover; confirm before removing.
repo_path = 'llama_challenge/html_challenge/'
cl_to_text.html_path_to_text(repo_path = repo_path3, verbose = 1)

Source     : llama_challenge\ipynb_blog\SAP HANA ML challendge - CHURN  v2.3 max.html
Destination: llama_challenge\ipynb_blog\SAP HANA ML challendge - CHURN  v2.3 max.txt
File conversion complete and save!
List conversion complete! Files: 1


In [9]:
import pathlib

def list_ipynb(repo_path, extension):
    """Print the path of every file under ``repo_path`` that ends in ``.extension``.

    The search is recursive. ``extension`` is given without the leading dot.
    Nothing is printed when no file matches, and nothing is returned.
    """
    root = pathlib.Path(repo_path)
    # Collect all matches before printing anything.
    matches = [*root.glob("**/*.{}".format(extension))]
    if matches:
        print("\n".join(str(match) for match in matches))

In [10]:
# Print the folder, a blank line, then every .txt file found under it (recursive search).
print(repo_path1)
print()
list_ipynb(repo_path1, "txt")

llama_challenge/html_challenge/

llama_challenge\html_challenge\understanding_metrics_blog.txt
llama_challenge\html_challenge\challenge_20221107.txt
llama_challenge\html_challenge\challenge_20221128.txt
llama_challenge\html_challenge\challenge_20221222.txt
llama_challenge\html_challenge\hana_ml.dataframe.txt
llama_challenge\html_challenge\hana_ml.algorithms.pal.trees.HybridGradientBoostingClassifier.txt


In [11]:
# Print the folder, a blank line, then every .txt file found under it (recursive search).
print(repo_path2)
print()
list_ipynb(repo_path2, "txt")

llama_challenge//ipynb_hana_ml_samples//Python-API//usecase-examples//sapcommunity-hanaml-challenge

llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\readme.txt
llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\10 Connectivity Check.txt
llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\20 Data upload.txt
llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\PAL Tutorial - Unified Classification Hybrid Gradient Boosting - PredictiveQuality Example.txt
llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\Upload and explore Employee Churn data.txt


In [12]:
# Print the folder, a blank line, then every .txt file found under it (recursive search).
print(repo_path3)
print()
list_ipynb(repo_path3, "txt")

llama_challenge/ipynb_blog/

llama_challenge\ipynb_blog\SAP HANA ML challendge - CHURN  v2.3 max.txt


In [19]:
# clean_data.__init__ stores nothing, so this one instance is reused for every folder.
clnd = clean_data()

In [20]:
# Remove the last 3 lines (the default tail_del_n) from every .txt under repo_path1.
# Files are rewritten in place: running this cell again trims 3 more lines.
clnd.tail_trimm(path = repo_path1, verbose = 1)

File processed: llama_challenge\html_challenge\understanding_metrics_blog.txt
File processed: llama_challenge\html_challenge\challenge_20221107.txt
File processed: llama_challenge\html_challenge\challenge_20221128.txt
File processed: llama_challenge\html_challenge\challenge_20221222.txt
File processed: llama_challenge\html_challenge\hana_ml.dataframe.txt
File processed: llama_challenge\html_challenge\hana_ml.algorithms.pal.trees.HybridGradientBoostingClassifier.txt
Files processed: 6


In [21]:
# Remove the last 3 lines (the default tail_del_n) from every .txt under repo_path2.
# Files are rewritten in place: running this cell again trims 3 more lines.
# NOTE(review): every .txt in the folder is trimmed, including readme.txt, which
# may not be a converted HTML file - confirm this is intended.
clnd.tail_trimm(path = repo_path2, verbose = 1)

File processed: llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\readme.txt
File processed: llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\10 Connectivity Check.txt
File processed: llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\20 Data upload.txt
File processed: llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\PAL Tutorial - Unified Classification Hybrid Gradient Boosting - PredictiveQuality Example.txt
File processed: llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\Upload and explore Employee Churn data.txt
Files processed: 5


In [22]:
# Remove the last 3 lines (the default tail_del_n) from every .txt under repo_path3.
# Files are rewritten in place: running this cell again trims 3 more lines.
clnd.tail_trimm(path = repo_path3, verbose = 1)

File processed: llama_challenge\ipynb_blog\SAP HANA ML challendge - CHURN  v2.3 max.txt
Files processed: 1
