In [1]:
"""
This notebook implements the preprocessing routine which, in addition to the baseline
preprocessing tasks, removes every token whose lemma appears in the list of basic legal
stop words; moreover, only nouns are kept.
"""

import pandas as pd

import os
import time

import spacy

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)


# Root directory of the uncompressed project data. It can be overridden with the
# LEGAL_TM_DATA_DIR environment variable so the notebook is runnable on other machines;
# when the variable is unset the original author's local path is used.
data_root = os.environ.get(
    'LEGAL_TM_DATA_DIR',
    '/Users/jhamer90811/Documents/Insight/legal_topic_modeling/data_uncompressed',
)

# Directory holding the '<dataset>_raw.csv' input files.
raw_data_header = os.path.join(data_root, 'raw_data_12k')

# Note: due to time constraints, I am only preprocessing the 'cases_IL_after1950_12k' dataset

# datasets = ['random_cases2', 'cases_after1950_12k', 'cases_IL_12k', 'cases_IL_after1950_12k']

datasets = ['cases_IL_after1950_12k']

# Lower-case lemmas (generic legal vocabulary and case-reporter abbreviations) that are
# dropped from every opinion in addition to spaCy's built-in stop words.
basic_legal_stopwords = {
    'a.',
    'a.2d',
    'a.3d',
    'appeal',
    'appellant',
    'appellee',
    'case',
    'cir',
    'court',
    'defendant',
    # NOTE(review): this entry contains a space; spaCy tokens normally do not, so it
    # presumably never matches a single lemma. Kept for parity with the original list.
    'f. supp.',
    'f.supp.',
    'f.supp.2d',
    'f.supp.3d',
    'fact',
    'find',
    'hold',
    'judgment',
    'n.e.',
    'n.e.2d',
    'opinion',
    'order',
    'p.',
    'p.2d',
    'p.3d',
    'plaintiff',
    'question',
    's.e.',
    's.e.2d',
    's.e.3d',
    's.w.',
    's.w.2d',
    # The original list only had 's.w.3d.' (stray trailing period), unlike every other
    # "<reporter>.<N>d" entry. The intended spelling is added; the old one is kept so the
    # set remains a superset of the original.
    's.w.3d',
    's.w.3d.',
    'see',
    'so.',
    'so.2d',
    'state',
    'time',
    'trial',
}

# Directory that receives the '<dataset>_processed.csv' output files.
processed_data_header = os.path.join(data_root, 'basic_stopwords_nouns_12k')




In [2]:
def _noun_lemmas(doc, stopwords):
    """Return the lemmas of the tokens in ``doc`` that survive the noun/stop-word filter.

    A token is kept when its coarse part-of-speech tag is ``'NOUN'``, spaCy does not flag
    it as a stop word, and its lemma is not in ``stopwords``.

    Parameters
    ----------
    doc : iterable of tokens exposing ``pos_``, ``is_stop`` and ``lemma_``
        Typically a processed spaCy ``Doc``.
    stopwords : container of str
        Lower-case lemmas to discard.

    Returns
    -------
    list of str
        The lemmas of the retained tokens, in document order.
    """
    return [
        tok.lemma_
        for tok in doc
        if tok.pos_ == 'NOUN'
        and not tok.is_stop
        # The stop-word set is all lower case, so compare case-insensitively.
        and tok.lemma_.lower() not in stopwords
    ]


# Only POS tags and lemmas are used below, so the parser and NER components are disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Create the output directory up front: otherwise a missing directory would only surface
# in to_csv(), after the whole (long-running) preprocessing pass has already finished.
os.makedirs(processed_data_header, exist_ok=True)

for dataset in datasets:
    data_raw = pd.read_csv(os.path.join(raw_data_header, dataset + '_raw.csv'))
    start = time.time()
    opinions = []
    print(f'Beginning preprocessing of opinions for {dataset}...')

    # pandas reads an empty opinion field as NaN (a float), which spaCy cannot process;
    # such rows are treated as empty text and therefore yield an empty lemma list.
    texts = [text if isinstance(text, str) else '' for text in data_raw.opinion]

    # nlp.pipe processes the texts as a stream in batches instead of one call per opinion.
    # A small batch size keeps only a few (potentially long) documents in memory at once.
    docs = nlp.pipe(texts, batch_size=32)

    # n_done counts processed rows directly, so the progress report does not depend on
    # the DataFrame's index labels.
    for n_done, (row, doc) in enumerate(zip(data_raw.itertuples(index=False), docs), start=1):
        if dataset == 'random_cases2':
            # This dataset stores a full decision date rather than a year column.
            decision_year = pd.to_datetime(row.decision_date).year
        else:
            decision_year = row.decision_year
        opinions.append((row.case_id,
                         row.jurisdiction,
                         row.court_name,
                         decision_year,
                         _noun_lemmas(doc, basic_legal_stopwords)))
        if n_done % 1000 == 0:
            print(f'Done with {n_done} opinions.')
            print(f'Time elapsed: {round(time.time()-start)}')
            print('#######################################')
    print(f'Done preprocessing. Took {round(time.time()-start)} seconds.')

    # Drop references to large intermediates so their memory can be reclaimed.
    data_raw = None
    texts = None

    # The 'opinion' column holds one list of lemmas per case.
    df = pd.DataFrame(opinions, columns=['case_id', 'jurisdiction', 'court', 'decision_year', 'opinion'])
    opinions = None
    df.to_csv(os.path.join(processed_data_header, dataset + '_processed.csv'), index=False)

    df = None

# Release the language model once all datasets have been processed.
nlp = None

Beginning preprocessing of opinions for cases_IL_after1950_12k...
Done with 1000 opinions.
Time elapsed: 188
#######################################
Done with 2000 opinions.
Time elapsed: 379
#######################################
Done with 3000 opinions.
Time elapsed: 567
#######################################
Done with 4000 opinions.
Time elapsed: 754
#######################################
Done with 5000 opinions.
Time elapsed: 948
#######################################
Done with 6000 opinions.
Time elapsed: 1144
#######################################
Done with 7000 opinions.
Time elapsed: 1342
#######################################
Done with 8000 opinions.
Time elapsed: 1547
#######################################
Done with 9000 opinions.
Time elapsed: 1745
#######################################
Done with 10000 opinions.
Time elapsed: 1941
#######################################
Done with 11000 opinions.
Time elapsed: 2130
#######################################
Done with 120