In [None]:
"""
This notebook implements the preprocessing routine which, in addition to the baseline
preprocessing tasks, removes the basic legal stop words and keeps only nouns and verbs. The
chosen dataset consists of all cases appearing in the citation graph whose jurisdiction is
Illinois and whose decision dates occurred after 1950.
"""

import pandas as pd

import os
import time

import spacy

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)


# Identifier of the dataset; the preprocessing cell appends '_raw.csv' and
# '_processed.csv' to it to form the input and output file names.
dataset = 'cases_IL_after1950_42k'

# Directory holding the raw CSV, and directory receiving the processed CSV.
raw_data_header = '/Users/jhamer90811/Documents/Insight/legal_topic_modeling/data_uncompressed'
processed_data_header = '/Users/jhamer90811/Documents/Insight/legal_topic_modeling/data_uncompressed/basic_stopwords_nouns_verbs_42k'

# Citation-style abbreviations (presumably case-reporter names -- confirm).
# NOTE(review): 's.w.3d.' ends with a period unlike its siblings ('s.e.3d',
# 'p.3d'), and 'f. supp.' contains a space -- confirm both are intentional.
_citation_abbreviations = {
    'a.', 'a.2d', 'a.3d',
    'cir',
    'f. supp.', 'f.supp.', 'f.supp.2d', 'f.supp.3d',
    'n.e.', 'n.e.2d',
    'p.', 'p.2d', 'p.3d',
    's.e.', 's.e.2d', 's.e.3d',
    's.w.', 's.w.2d', 's.w.3d.',
    'so.', 'so.2d',
}

# Ordinary words treated as stop words for this corpus.
_common_legal_terms = {
    'appeal', 'appellant', 'appellee',
    'case', 'court',
    'defendant',
    'fact', 'find',
    'hold',
    'judgment',
    'opinion', 'order',
    'plaintiff',
    'question',
    'see', 'state',
    'time', 'trial',
}

# Lemmas found in this set are dropped by the preprocessing cell.
basic_legal_stopwords = _citation_abbreviations | _common_legal_terms




In [None]:
# Preprocess the raw opinions chunk by chunk and write them to one CSV.
#
# For every opinion the text is run through spaCy and reduced to the list of
# lemmas of tokens that are nouns or verbs, are not spaCy stop words, and are
# not in `basic_legal_stopwords`. The first chunk creates the output file
# (with header); later chunks are appended without a header.

# Only `pos_`, `lemma_` and `is_stop` are read below, so the parser and NER
# pipeline components are disabled at load time.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Coarse part-of-speech tags to keep. The notebook description and the output
# directory name both say "nouns and verbs"; the previous filter kept NOUN only.
kept_pos = {'NOUN', 'VERB'}

# Number of CSV rows read (and written) per iteration.
chunk_size = 1000

raw_path = os.path.join(raw_data_header, dataset + '_raw.csv')
processed_path = os.path.join(processed_data_header, dataset + '_processed.csv')

# to_csv does not create missing directories.
os.makedirs(processed_data_header, exist_ok=True)

output_columns = ['case_id', 'jurisdiction', 'court', 'decision_year', 'opinion']

overall_start = time.time()  # timer for the whole run, never reset
n_done = 0                   # actual number of opinions processed so far

print(f'Beginning preprocessing of opinions for {dataset}...')
reader = pd.read_csv(raw_path, chunksize=chunk_size)
for chunk_number, data_raw in enumerate(reader, start=1):
    start = time.time()  # per-chunk timer

    # A missing opinion is read by pandas as NaN (a float), which spaCy
    # cannot process; treat it as empty text so the row is still emitted.
    texts = [text if isinstance(text, str) else '' for text in data_raw['opinion']]

    # nlp.pipe processes the texts as a stream and yields one Doc per text,
    # in order.
    token_lists = [
        [t.lemma_ for t in doc
         if t.pos_ in kept_pos
         and not t.is_stop
         and t.lemma_ not in basic_legal_stopwords]
        for doc in nlp.pipe(texts)
    ]

    records = list(zip(data_raw['case_id'],
                       data_raw['jurisdiction'],
                       data_raw['court_name'],
                       data_raw['decision_year'],
                       token_lists))
    df = pd.DataFrame(records, columns=output_columns)

    # First chunk overwrites/creates the file with a header; later chunks append.
    first_chunk = chunk_number == 1
    df.to_csv(processed_path, index=False,
              mode='w' if first_chunk else 'a', header=first_chunk)

    n_done += len(df)

    # Drop references to the chunk's data before the next chunk is read.
    records = token_lists = texts = df = None

    print(f'Done with {n_done} opinions.')
    print(f'Time elapsed: {round(time.time()-start)}')
    print('#######################################')

print(f'Done preprocessing. Took {round(time.time()-overall_start)} seconds.')

# Release the loaded spaCy model.
nlp = None