In [1]:
import html as ihtml
import json
import os
import re
from pathlib import Path

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
# NOTE: tqdm warns that this private module path is deprecated in favour of `tqdm.notebook`.
from tqdm._tqdm_notebook import tqdm_notebook

# Registers `progress_apply` on pandas objects.
tqdm_notebook.pandas()

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


In [2]:
def read_jsons_into_dataframe(directory):
    """Load every ``*.json`` file under ``directory`` (recursively) into one DataFrame.

    Each file is parsed as JSON and flattened with ``pd.json_normalize``, so
    nested objects become dotted column names. The per-file frames are then
    concatenated with a fresh 0..n-1 index.

    Parameters
    ----------
    directory : str or os.PathLike
        Root directory that is searched recursively for ``*.json`` files.

    Returns
    -------
    pandas.DataFrame
        The combined rows of all files, or an empty DataFrame when no JSON
        file is found (``pd.concat`` would otherwise raise ``ValueError``).
    """
    # Sort the paths: rglob order depends on the filesystem, and a stable
    # order keeps the resulting row order reproducible between runs.
    json_paths = sorted(Path(directory).rglob('*.json'))
    frames = []
    for path in json_paths:
        # Read as UTF-8 explicitly rather than relying on the locale default.
        with open(path, encoding='utf-8') as f:
            frames.append(pd.json_normalize(json.load(f)))
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [3]:
# Load one DataFrame per century directory, in chronological order.
df_1700s, df_1800s, df_1900s, df_2000s = (
    read_jsons_into_dataframe(century_dir)
    for century_dir in ('1700s', '1800s', '1900s', '2000s')
)

In [4]:
# Tag every row with the century of the directory it was loaded from.
for century_df, century in (
    (df_1700s, 1700),
    (df_1800s, 1800),
    (df_1900s, 1900),
    (df_2000s, 2000),
):
    century_df['label'] = century

In [5]:
# Stack the four century frames. ignore_index=True gives the result a unique
# 0..n-1 index; without it each frame's own 0..k index is repeated, which makes
# label-based lookups on merged_df ambiguous.
merged_df = pd.concat([df_1700s, df_1800s, df_1900s, df_2000s], ignore_index=True)
print(merged_df.columns)

Index(['date_blocked', 'id', 'blocked', 'judges', 'court', 'date_filed',
       'download_url', 'source', 'local_path', 'html_lawbox', 'time_retrieved',
       'nature_of_suit', 'plain_text', 'html_with_citations', 'sha1',
       'date_modified', 'precedential_status', 'extracted_by_ocr',
       'citation_count', 'absolute_url', 'docket', 'html', 'resource_uri',
       'citation.state_cite_three', 'citation.federal_cite_two',
       'citation.resource_uri', 'citation.federal_cite_three',
       'citation.lexis_cite', 'citation.document_uris',
       'citation.scotus_early_cite', 'citation.federal_cite_one',
       'citation.case_name', 'citation.westlaw_cite',
       'citation.state_cite_one', 'citation.specialty_cite_one',
       'citation.state_cite_regional', 'citation.id', 'citation.docket_number',
       'citation.state_cite_two', 'citation.neutral_cite',
       'supreme_court_db_id', 'label'],
      dtype='object')


In [6]:
# Keep only the filing date, the three text representations and the century label.
merged_df_of_interest = merged_df[['date_filed', 'plain_text', 'html', 'html_with_citations', 'label']]
# Treat empty or whitespace-only strings as missing values.
merged_df_of_interest = merged_df_of_interest.replace(r'^\s*$', np.nan, regex=True)
print(merged_df_of_interest.shape)
# Drop rows where all three text columns are missing; rows with at least one are kept.
merged_df_of_interest = merged_df_of_interest.dropna(subset=['plain_text', 'html', 'html_with_citations'], how='all')
print(merged_df_of_interest.shape)

(63374, 5)
(61314, 5)


In [7]:
# Count the rows whose `html` column is missing.
missing_html_rows = merged_df_of_interest['html'].isna().sum()
print(missing_html_rows)

1427


In [8]:
def clean_html_text(html_text):
    """Return the visible text of an HTML string, parsed with the lxml parser.

    Parameters
    ----------
    html_text : str
        Raw HTML markup. ``''``, ``None`` and NaN are treated as "no content".

    Returns
    -------
    str
        The text extracted by ``BeautifulSoup(...).get_text()``, or ``''`` when
        there is no content. The original raised ``UnboundLocalError`` for
        ``''`` because the result variable was never assigned.
    """
    # `x != x` is true only for NaN, which is how pandas marks missing cells.
    if html_text is None or html_text != html_text or html_text == '':
        return ''
    return BeautifulSoup(html_text, 'lxml').get_text()

In [9]:
def clean_text(html_text):
    """Convert HTML to plain text, drop URLs and collapse whitespace.

    Steps: unescape HTML entities, extract the text with BeautifulSoup (lxml),
    remove ``http://`` / ``https://`` URLs, then collapse every whitespace run
    into a single space.

    Parameters
    ----------
    html_text : str
        Raw HTML markup. ``''``, ``None`` and NaN are treated as "no content".

    Returns
    -------
    str
        The cleaned text, or ``''`` when there is no content.
    """
    # `x != x` is true only for NaN, which is how pandas marks missing cells.
    if html_text is None or html_text != html_text or html_text == '':
        return ''
    text = BeautifulSoup(ihtml.unescape(html_text), "lxml").text
    # Each step must work on `text`: applying the substitutions to `html_text`
    # would throw away the parsed result and the URL removal.
    text = re.sub(r"http[s]?://\S+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text

In [10]:
def strip_html(html):
    """Return the text of an HTML string without style/script/code/anchor content.

    The markup is parsed with ``html.parser``; every ``<style>``, ``<script>``,
    ``<code>`` and ``<a>`` element is removed together with its contents, and
    the remaining stripped strings are joined with single spaces.

    Parameters
    ----------
    html : str
        Raw HTML markup. ``''``, ``None`` and NaN are treated as "no content".

    Returns
    -------
    str
        The remaining text, or ``''`` when there is no content. The original
        raised ``UnboundLocalError`` for ``''`` because ``soup`` was never
        created on that path.
    """
    # `x != x` is true only for NaN, which is how pandas marks missing cells.
    if html is None or html != html or html == '':
        return ''
    soup = BeautifulSoup(html, "html.parser")
    for element in soup(['style', 'script', 'code', 'a']):
        element.decompose()
    return ' '.join(soup.stripped_strings)

In [11]:
# Work on an independent (deep) copy so columns added to m_df do not touch
# merged_df_of_interest.
m_df = merged_df_of_interest.copy(deep=True)

In [12]:
#m_df.head()

In [13]:
def html_parser(raw_html):
    """Extract the text of an HTML fragment and blank out leftover tag-like spans.

    Parameters
    ----------
    raw_html : object
        HTML markup; non-string values are converted with ``str()``.
        ``None`` and NaN are treated as "no content".

    Returns
    -------
    str
        The text from ``BeautifulSoup(..., 'html.parser').get_text()`` with any
        remaining ``<...>`` span replaced by a single space, or ``''`` when
        there is no content.
    """
    # Without this guard a missing cell becomes the literal text 'nan' / 'None'
    # via str(). `x != x` is true only for NaN.
    if raw_html is None or raw_html != raw_html:
        return ''
    soup = BeautifulSoup(str(raw_html), 'html.parser')
    soup_string = soup.get_text()
    # Match one tag-like span at a time. The greedy '<.*>' removed everything
    # between the first '<' and the last '>' on a line, including real text.
    soup_string = re.sub(r'<[^<>]*>', ' ', soup_string)
    return soup_string

In [14]:
# Blank cells were converted to NaN earlier, so comparing against '' never
# filters anything. Test for missing values instead and pass them through
# unchanged, so they are not stringified into the text 'nan'.
m_df['cleaned_text'] = m_df['html_with_citations'].progress_apply(
    lambda x: html_parser(x) if pd.notna(x) else x
)


  0%|          | 0/61314 [00:00<?, ?it/s]

In [15]:
# Replace each newline in the cleaned text with a space.
newline_free_text = m_df['cleaned_text'].replace(to_replace=r'\n', value=' ', regex=True)
m_df['cleaned_text'] = newline_free_text

In [16]:
# Write the frame as comma-separated values with a header row and no index column
# (comma separator and header are the to_csv defaults).
m_df.to_csv('cleaned_text_file.csv', index=False)