In [15]:
import pandas as pd
import re
import os


def parse_text_to_df(file_path):
    # Regular expression to match the tagged sentences
    tag_regex = r'<([a-z]{1,3})(\d)>(.*?)</\1\2>'
    
    # Define the tag columns
    tag_columns = ['a', 'ch', 'cr', 'j', 'law', 'ltd', 'ter', 'use']

    # List to hold row data
    rows = []
    document_name = os.path.splitext(os.path.basename(file_path))[0]

    with open(file_path, 'r') as file:
        for line in file:
            # Find all tagged sentences in the line
            matches = re.findall(tag_regex, line)

            if matches:
                for match in matches:
                    tag, severity, text = match

                    # Create a row dictionary with all tags set to 0
                    row = {t: 0 for t in tag_columns}
                    row['clause'] = text.strip()
                    row['severity'] = int(severity)

                    # Set the tag column to 1 if the tag is present
                    if tag in tag_columns:
                        row[tag] = 1

                    row['document'] = document_name
                    rows.append(row)
                
                # Replace tagged sentences with empty string in the line
                line = re.sub(tag_regex, '', line)

            # Split the line into sentences and add those without tags
            sentences = re.split(r'(?<=[.!?])\s+', line)
            for sentence in sentences:
                if sentence.strip():
                    row = {t: 0 for t in tag_columns}
                    row['clause'] = sentence.strip()
                    row['severity'] = 0
                    row['document'] = document_name
                    rows.append(row)

    # Create a DataFrame
    df = pd.DataFrame(rows, columns=['document', 'clause', 'severity'] + tag_columns)
    return df

# Use the function to parse the file and create a DataFrame
file_path = 'ToS/OriginalTaggedDocuments/9gag.xml'  # Replace with your file path
df = parse_text_to_df(file_path)

# Show the DataFrame
df


Unnamed: 0,document,clause,severity,a,ch,cr,j,law,ltd,ter,use
0,9gag,Terms of services,0,0,0,0,0,0,0,0,0
1,9gag,* Accepting the Terms of Service,0,0,0,0,0,0,0,0,0
2,9gag,"By using or accessing the Services, you agree ...",2,0,0,0,0,0,0,0,1
3,9gag,"The purpose of this website, 9gag.com (the “Si...",0,0,0,0,0,0,0,0,0
4,9gag,Please read these terms of service (“Agreement...,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
178,9gag,17 U.S.C.,0,0,0,0,0,0,0,0,0
179,9gag,§ 512(f).,0,0,0,0,0,0,0,0,0
180,9gag,9GAG reserves the right to seek damages from a...,0,0,0,0,0,0,0,0,0
181,9gag,"For the avoidance of doubt, only notices submi...",0,0,0,0,0,0,0,0,0


In [22]:
def parse_and_combine_files(file_paths):
    # Initialize an empty DataFrame to store all data
    combined_df = pd.DataFrame()

    for file_path in file_paths:
        # Parse each file and create a DataFrame
        df = parse_text_to_df(file_path)

        # Concatenate this DataFrame with the combined DataFrame
        combined_df = pd.concat([combined_df, df], ignore_index=True)

    return combined_df

# get all file names from folder ToS/OriginalTaggedDocuments
import glob

def get_file_names():
    files = glob.glob('ToS/OriginalTaggedDocuments/**/*.xml', recursive=True)
    return files



files = get_file_names()

all_df = parse_and_combine_files(files)
all_df

Unnamed: 0,document,clause,severity,a,ch,cr,j,law,ltd,ter,use
0,Spotify,Spotify,0,0,0,0,0,0,0,0,0
1,Spotify,•\tPremium,0,0,0,0,0,0,0,0,0
2,Spotify,•\tHelp,0,0,0,0,0,0,0,0,0
3,Spotify,•\tDownload,0,0,0,0,0,0,0,0,0
4,Spotify,•,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
12312,Vivino,Where Vivino has provided you with a translati...,0,0,0,0,0,0,0,0,0
12313,Vivino,If there is any contradiction between what the...,0,0,0,0,0,0,0,0,0
12314,Vivino,Contact,0,0,0,0,0,0,0,0,0
12315,Vivino,You may contact Vivino at the following addres...,0,0,0,0,0,0,0,0,0


In [24]:
all_df.to_csv('combined.csv')