In [1]:
"""
This notebook implements a preprocessing routine that lemmatizes the raw opinions, removes
basic legal stopwords, and then applies bigram phrasing using a phraser trained on the tokens
produced by the baseline preprocessing routine.
"""

import pandas as pd

import os
import time

import spacy

from gensim.models.phrases import Phrases, Phraser

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)


# Root folder that contains every data directory used by this notebook.
# It defaults to the original author's local checkout; set the LEGAL_TM_DATA_ROOT
# environment variable to run the notebook on another machine without editing code.
data_root = os.environ.get(
    'LEGAL_TM_DATA_ROOT',
    '/Users/jhamer90811/Documents/Insight/legal_topic_modeling/data_uncompressed',
)

# Directory holding the raw '<dataset>_raw.csv' files.
raw_data_header = os.path.join(data_root, 'raw_data_12k')

# Note: due to time constraints, I am only preprocessing the 'cases_IL_after1950_12k' dataset

# datasets = ['random_cases2', 'cases_after1950_12k', 'cases_IL_12k', 'cases_IL_after1950_12k']

datasets = ['cases_IL_after1950_12k']

# Directory holding the baseline '<dataset>_processed.csv' files (used to train the bigram phraser).
baseline_data_header = os.path.join(data_root, 'baseline_12k')

# Directory that receives the '<dataset>_processed.csv' files written by this notebook.
processed_data_header = os.path.join(data_root, 'phrasing_basic_stopwords_12k')

def _parse_token_list(cell):
    """Convert one stringified token list (e.g. "['a', 'b']") into a list of strings.

    Values that are already lists are returned unchanged, so re-running the
    parsing step on an already-parsed column is harmless.
    """
    import ast  # local import: only this helper needs it

    if isinstance(cell, list):
        return cell
    try:
        # Handles the cases naive comma-splitting gets wrong: empty lists,
        # double-quoted tokens (those containing an apostrophe) and tokens
        # that themselves contain a comma.
        parsed = ast.literal_eval(cell)
    except (ValueError, TypeError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(token) for token in parsed]
    # Fallback for strings that are not valid Python literals: the original
    # strip-and-split behaviour, except that an empty body yields [] not [''].
    inner = cell.strip('[]')
    if not inner.strip():
        return []
    return [token.strip().strip("'") for token in inner.split(',')]


def parse_list_col(df, col_to_parse):
    """Parse a column of stringified token lists into real Python lists, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column is modified; nothing is returned.
    col_to_parse : str
        Name of the column holding strings such as "['tok1', 'tok2']".
    """
    # Replace the column rather than writing into it with .loc, so the result
    # does not depend on the dtype of the original string column.
    df[col_to_parse] = df[col_to_parse].apply(_parse_token_list)
    
# Lower-case lemmas that are removed from every opinion: generic legal vocabulary
# plus abbreviations that look like case-reporter citations.
basic_legal_stopwords = {
    # generic legal vocabulary
    'appeal',
    'appellant',
    'appellee',
    'case',
    'cir',
    'court',
    'defendant',
    'fact',
    'find',
    'hold',
    'judgment',
    'opinion',
    'order',
    'plaintiff',
    'question',
    'see',
    'state',
    'time',
    'trial',
    # reporter-style abbreviations
    'a.',
    'a.2d',
    'a.3d',
    'f. supp.',
    'f.supp.',
    'f.supp.2d',
    'f.supp.3d',
    'n.e.',
    'n.e.2d',
    'p.',
    'p.2d',
    'p.3d',
    's.e.',
    's.e.2d',
    's.e.3d',
    's.w.',
    's.w.2d',
    # 's.w.3d.' was the only numbered entry with a trailing period; the
    # period-free form is added so it matches the other numbered entries.
    # The original spelling is kept for backward compatibility.
    's.w.3d',
    's.w.3d.',
    'so.',
    'so.2d',
}
    
    


In [2]:
# Load the spaCy pipeline without the parser and NER components; only
# tagging/lemmatization attributes (lemma_, pos_, is_stop) are used below.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Flag the legal stopwords in the vocabulary so `is_stop` is true for them.
for w in basic_legal_stopwords:
    nlp.vocab[w].is_stop = True

# Parts of speech that are kept; everything else is dropped.
content_pos = {'NOUN', 'ADJ', 'VERB', 'ADV'}

# Create the output directory up front: otherwise a missing folder only
# surfaces at the final to_csv call, after the whole (long) run has finished.
os.makedirs(processed_data_header, exist_ok=True)

for dataset in datasets:
    # --- Train the bigram phraser on the baseline-preprocessed tokens ---
    baseline_data = pd.read_csv(os.path.join(baseline_data_header, dataset + '_processed.csv'))
    parse_list_col(baseline_data, 'opinion')
    print('Training bigram phraser...')
    bigram = Phrases(baseline_data['opinion'].to_list(), min_count=5, threshold=100)
    bigram_mod = Phraser(bigram)
    print('Done.')
    baseline_data = None  # release the baseline frame before loading the raw data

    # --- Re-process the raw opinions ---
    data_raw = pd.read_csv(os.path.join(raw_data_header, dataset + '_raw.csv'))

    # 'random_cases2' stores a full decision date; the other datasets already
    # carry a decision_year column.
    if dataset == 'random_cases2':
        decision_years = pd.to_datetime(data_raw['decision_date']).dt.year
    else:
        decision_years = data_raw['decision_year']

    # A missing opinion would make spaCy raise mid-run; treat it as empty text,
    # which simply yields an empty token list for that case.
    texts = data_raw['opinion'].fillna('').astype(str)

    start = time.time()
    opinions = []
    print(f'Beginning preprocessing of opinions for {dataset}...')

    metadata = zip(data_raw['case_id'], data_raw['jurisdiction'],
                   data_raw['court_name'], decision_years)
    # nlp.pipe processes the texts in batches and yields the Docs in input
    # order, so pairing them with the metadata by position is safe.
    parsed_docs = nlp.pipe(texts, batch_size=50)

    for i, ((case_id, juris, court, decision_year), doc) in enumerate(zip(metadata, parsed_docs)):
        # Keep lemmas of content words that are neither spaCy stopwords nor
        # legal stopwords.
        doc = [t.lemma_ for t in doc
               if (not t.is_stop)
               and (t.lemma_ not in basic_legal_stopwords)
               and (t.pos_ in content_pos)]
        # Merge frequent token pairs into single phrase tokens.
        doc = bigram_mod[doc]
        opinions.append((case_id, juris, court, decision_year, doc))
        if (i+1)%1000==0:
            print(f'Done with {i+1} opinions.')
            print(f'Time elapsed: {round(time.time()-start)}')
            print('#######################################')
    print(f'Done preprocessing. Took {round(time.time()-start)} seconds.')

    data_raw = None

    df = pd.DataFrame(opinions, columns=['case_id', 'jurisdiction', 'court', 'decision_year', 'opinion'])
    opinions = None
    df.to_csv(os.path.join(processed_data_header, dataset + '_processed.csv'), index=False)

    df = None

# Drop the reference to the spaCy pipeline once every dataset is written.
nlp = None



Training bigram phraser...
Done.
Beginning preprocessing of opinions for cases_IL_after1950_12k...
Done with 1000 opinions.
Time elapsed: 236
#######################################
Done with 2000 opinions.
Time elapsed: 479
#######################################
Done with 3000 opinions.
Time elapsed: 719
#######################################
Done with 4000 opinions.
Time elapsed: 955
#######################################
Done with 5000 opinions.
Time elapsed: 1198
#######################################
Done with 6000 opinions.
Time elapsed: 1445
#######################################
Done with 7000 opinions.
Time elapsed: 1686
#######################################
Done with 8000 opinions.
Time elapsed: 1934
#######################################
Done with 9000 opinions.
Time elapsed: 2172
#######################################
Done with 10000 opinions.
Time elapsed: 2414
#######################################
Done with 11000 opinions.
Time elapsed: 2647
###################