In [None]:
"""
This notebook implements the baseline preprocessing routine. This uses spacy's 'en_core_web_sm' 
model to tokenize documents, remove common English stopwords (not law-specific), and keep only
NOUN, ADJ, ADV, and VERB parts of speech.
"""

import pandas as pd

import os
import time

import spacy

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)


# Directory containing the raw input files; the processing loop reads
# '<dataset>_raw.csv' from here for every entry in `datasets`.
raw_data_header = '/Users/jhamer90811/Documents/Insight/legal_topic_modeling/data_uncompressed/raw_data_12k'

# Base names of the datasets to preprocess, one CSV per name.
datasets = [
    'random_cases2',
    'cases_after1950_12k',
    'cases_IL_12k',
    'cases_IL_after1950_12k',
]

# Directory that receives the '<dataset>_processed.csv' outputs.
processed_data_header = '/Users/jhamer90811/Documents/Insight/legal_topic_modeling/data_uncompressed/baseline_12k'




In [None]:
# Coarse part-of-speech tags kept by the baseline. A frozenset is built once
# so the per-token membership test does not rebuild a list every time.
KEPT_POS = frozenset({'NOUN', 'ADJ', 'VERB', 'ADV'})


def _lemmatize_content_words(nlp, text):
    """Return the lemmas of the non-stopword NOUN/ADJ/VERB/ADV tokens in `text`.

    Parameters
    ----------
    nlp : callable
        A loaded spaCy pipeline; called as ``nlp(text)`` to tokenize and tag.
    text : object
        The opinion text. Anything that is not a ``str`` (e.g. the float NaN
        pandas uses for an empty CSV cell) is treated as an empty document.

    Returns
    -------
    list of str
        Lemmas in document order; empty when `text` is not a string.
    """
    if not isinstance(text, str):
        # Keep the row (with no tokens) instead of aborting a long run
        # because a single opinion is missing.
        return []
    return [
        token.lemma_
        for token in nlp(text)
        if not token.is_stop and token.pos_ in KEPT_POS
    ]


# The parser and NER components are disabled; only token attributes
# (stopword flag, POS tag, lemma) are used below.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Create the output directory before any work is done, so a missing folder
# cannot cause to_csv to fail after a whole dataset has been processed.
os.makedirs(processed_data_header, exist_ok=True)

for dataset in datasets:
    data_raw = pd.read_csv(os.path.join(raw_data_header, dataset + '_raw.csv'))

    # 'random_cases2' is read via a `decision_date` column from which the year
    # is derived; the other datasets are read via a `decision_year` column.
    has_full_date = dataset == 'random_cases2'

    start = time.time()
    opinions = []
    print(f'Beginning preprocessing of opinions for {dataset}...')

    # itertuples avoids building a Series per row; enumerate gives a progress
    # count that does not depend on the DataFrame's index labels.
    for n_done, row in enumerate(data_raw.itertuples(index=False), start=1):
        if has_full_date:
            decision_year = pd.to_datetime(row.decision_date).year
        else:
            decision_year = row.decision_year

        tokens = _lemmatize_content_words(nlp, row.opinion)
        opinions.append(
            (row.case_id, row.jurisdiction, row.court_name, decision_year, tokens)
        )

        if n_done % 1000 == 0:
            print(f'Done with {n_done} opinions.')
            print(f'Time elapsed: {round(time.time()-start)}')
            print('#######################################')
    print(f'Done preprocessing. Took {round(time.time()-start)} seconds.')

    # Drop the raw frame before building the processed one to limit peak memory.
    data_raw = None

    df = pd.DataFrame(opinions, columns=['case_id', 'jurisdiction', 'court', 'decision_year', 'opinion'])
    opinions = None
    df.to_csv(os.path.join(processed_data_header, dataset + '_processed.csv'), index=False)

    df = None

# Release the spaCy model once every dataset has been written.
nlp = None