# Label and Feature Creation

In this notebook, I will import the single-column dataframe of appellate opinions and create columns with labels and features. 

In [48]:
import io
import re
import pandas as pd
import pickle
import operator

In [2]:
# Open the dataframe
# NOTE(review): pickle.load can execute arbitrary code stored in the file --
# only load pickles that come from a trusted source.
# The with-block closes the file even if unpickling raises.
with open('ProjectData/df_clean.data', 'rb') as infile:
    df = pickle.load(infile)

In [3]:
# Replace the index that came with the pickled frame by a fresh 0..n-1 range;
# later cells look rows up as df.Opinion[i] for i in range(len(df.Opinion)).
df.reset_index(inplace=True, drop=True)

In [4]:
# info() prints its summary and returns None, so it gets its own statement
# rather than being bundled into a tuple that displays as "(None, ...)".
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3922 entries, 0 to 3921
Data columns (total 1 columns):
Opinion    3922 non-null object
dtypes: object(1)
memory usage: 30.8+ KB


(None,                                              Opinion
 0   an unpublished opinion of the north carolina ...
 1  no. coa11-246 north carolina court of appeals ...
 2  no. coa08-347 north carolina court of appeals ...
 3  michael harrison gregory and wife, vivian greg...
 4  atlantic contracting and material company, inc...)

### 1. Creating a New Column for the File Numbers
This is more experimental than functional.

In [5]:
# capture file number into new column
def coa(string_):
    """
    Extract the Court of Appeals file number from the text of an opinion.

    Searches for the first occurrence of "no" + "coa" (with an optional
    character and an optional space after each) followed by a number such
    as "11-246". The match is case-sensitive, so the text is expected to be
    lower-case.

    Parameters
    ----------
    string_ : str
        Full text of a single opinion.

    Returns
    -------
    str
        The file number (two digits, a hyphen, then one to five digits), or
        the placeholder '00-000' when no file number is found or when
        string_ is not a string.
    """
    # Non-text cells (e.g. None/NaN) cannot be searched; use the placeholder.
    if not isinstance(string_, str):
        return '00-000'

    # Raw string so that \d reaches the regex engine as written instead of
    # relying on Python passing an unknown string escape through.
    pat_coa_number = re.search(r"no.? ?coa.? ?(\d{2}-\d{1,5})", string_)
    if pat_coa_number is None:
        return '00-000'
    return pat_coa_number.group(1)

In [6]:
# Spot-check the extractor on the row with index label 3921
coa(df.Opinion[3921])

'20-112'

In [7]:
# Run the extractor over every opinion, collecting one file number per row
coa_numbers = [coa(df.Opinion[row]) for row in range(len(df.Opinion))]

In [8]:
# Attach the extracted numbers as a new column; handing over the raw .values
# array assigns by position instead of aligning on the index.
df["File_Numbers"] = pd.Series(coa_numbers).values

In [9]:
# Preview the first ten rows to eyeball the new File_Numbers column
df.head(10)

Unnamed: 0,Opinion,File_Numbers
0,an unpublished opinion of the north carolina ...,19-563
1,no. coa11-246 north carolina court of appeals ...,11-246
2,no. coa08-347 north carolina court of appeals ...,08-347
3,"michael harrison gregory and wife, vivian greg...",05-885
4,"atlantic contracting and material company, inc...",02-1087
5,an unpublished opinion of the north carolina c...,13-222
6,in the court of appeals of north carolina no....,17-112
7,in the court of appeals of north carolina no....,15-862
8,no. coa11-1447 north carolina court of appeal...,11-1447
9,an unpublished opinion of the north carolina c...,13-248


### 2. Creating the Labels (Affirmed, Reversed, etc.)
The labels were created using the regex patterns below. The expressions in the following function are the product of many iterations. The number of unlabeled ("error") opinions started at approximately 300; the patterns were then broadened to capture more opinions while maintaining reliability. Ultimately, XX of the "error" rows were dropped, because the cases were not beneficial to the model (e.g., they did not include a relevant summary judgment decision, or the opinion on summary judgment was entwined with other components). 

In [10]:
# # During review of errors, drop rows wrongly included in the data set
# drop_list = [201, 393, 755, 780, 822, 1100, 1139, 1541, 1597, 1716,1751,1907,2059, 2092]
# df_clean.drop(drop_list, axis=0, inplace=True)
# df = df_clean.reset_index(drop=True)
# df.info()

In [11]:
def _first_captured(match, group_numbers):
    """Return the first non-None capture among group_numbers, or None if there is no match/capture."""
    if match is None:
        return None
    for number in group_numbers:
        captured = match.group(number)
        if captured is not None:
            return captured
    return None


# Capture the outcome of an opinion (affirmed, reversed, ...) as a label
def labels(string_):
    """
    Derive an outcome label from the text of an opinion.

    Four regex strategies are tried in order; the first one that produces a
    label wins. The patterns are lower-case and case-sensitive.

    1. An outcome word next to sentence punctuation ("affirmed.",
       "reversed."), or the phrase "affirmed in part" anywhere in the text.
    2. An outcome word within 40 words of "concur"/"concurs" (either order).
    3. An outcome word anywhere in the last 150 characters of the opinion.
    4. An outcome word shortly before or after "reasons set forth".

    Parameters
    ----------
    string_ : str
        Full text of a single opinion.

    Returns
    -------
    str
        The matched text, e.g. 'affirmed', 'reversed', 'affirmed in part',
        'dismissed', 'no error', 'vacated', 'affirm', 'reverse', 'dismiss',
        'improvidently allowed' -- or 'error' when no strategy matches or
        string_ is not a string. Spelling variants are returned as matched
        and are not normalized here.
    """
    # Non-text cells (e.g. None/NaN) cannot be searched.
    if not isinstance(string_, str):
        return 'error'

    # 1. Highest confidence of getting an accurate label, based upon review
    #    of opinions (a single-word sentence).
    # NOTE(review): group 4 ("dismissed.") is part of the pattern but is
    # deliberately NOT accepted here, matching the original behavior: when
    # the first hit in the text is a bare "dismissed." the function falls
    # through to strategy 2. Confirm whether that exclusion is intended.
    hit = re.search(r"\.. ?(affirmed?)\.|\.?(reversed?)\.|(affirmed in part)|\.?(dismissed)\.", string_)
    label = _first_captured(hit, (1, 2, 3))
    if label is not None:
        return label

    # 2. Slightly less confidence: outcome word within 40 words of "concur",
    #    which frequently is at the end of a unanimous opinion.
    hit = re.search(r"(?:concurs?\W+(?:\w+\W+){0,40}?((affirmed in part)|reversed|affirmed|dismissed|no error|vacated)|((affirmed in part)|affirmed|reversed|dismissed|no error|vacated)\W+(?:\w+\W+){0,40}?concurs?)", string_)
    label = _first_captured(hit, (1, 2, 3, 4))
    if label is not None:
        return label

    # 3. Slightly less confidence again: scan only the last 150 characters
    #    of the opinion for any of the outcome words.
    # NOTE(review): the two "in part" alternatives carry literal apostrophes
    # inside the pattern, so they only match text that is itself wrapped in
    # single quotes; a plain "affirm in part" is reported as 'affirm'. Left
    # unchanged because the downstream label normalization does not handle
    # an 'affirm in part' value.
    clip = string_[-150:]
    hit = re.search(r"('affirmed in part'|reversed|affirmed|dismissed|'affirm in part'|affirm|reverse|dismiss|improvidently allowed)", clip)
    if hit is not None:
        return hit.group(0)

    # 4. Last resort: outcome word up to 5 words after, or up to 10 words
    #    before, the phrase "reasons set forth".
    hit = re.search(r"(?:reasons set forth?\W+(?:\w+\W+){0,5}?((affirm in part)|reverse|affirm|dismiss|no error|vacated?)|((affirm in part)|affirm|reversed?|dismiss|no error|vacated?)\W+(?:\w+\W+){0,10}?reasons set forth?)", string_)
    label = _first_captured(hit, (1, 2, 3, 4))
    if label is not None:
        return label

    return 'error'

In [12]:
# Test cell: spot-check the labeler on a single opinion (index label 2092)
labels(df.Opinion[2092])

'affirmed'

In [13]:
# Label every opinion, then store the outcomes in a new "Result" column
labels_list = [labels(df.Opinion[row]) for row in range(len(df.Opinion))]

df["Result"] = pd.Series(labels_list).values

In [15]:
# Tally how many opinions received each label
df.Result.value_counts()

affirmed                 2070
reversed                  691
affirmed in part          454
dismissed                 392
reverse                   122
no error                   70
vacated                    64
error                      25
affirm                     18
improvidently allowed      14
dismiss                     2
Name: Result, dtype: int64

In [16]:
# Collapse spelling variants and near-synonyms into the canonical labels.
# The model will treat 'no error' as 'affirmed' and 'vacated' as 'reversed';
# 'improvidently allowed' is folded into 'dismissed'.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on df['Result'] may operate on a temporary object and leave df untouched
# (this is always the case when pandas Copy-on-Write is enabled).
df['Result'] = df['Result'].replace({
    'reverse': 'reversed',
    'affirm': 'affirmed',
    'dismiss': 'dismissed',
    'no error': 'affirmed',
    'vacated': 'reversed',
    'improvidently allowed': 'dismissed',
})

In [17]:
# Re-tally the labels to see the distribution at this point
df.Result.value_counts()

affirmed            2158
reversed             877
affirmed in part     454
dismissed            408
error                 25
Name: Result, dtype: int64

# DROP ERRORS AND CHECK

In [18]:
# Select the rows labelled 'error', 'affirmed in part', and 'dismissed'
# (despite the names, each of these is a DataFrame of the matching rows)
drop_list1 = df[df['Result'].eq('error')]
drop_list2 = df[df['Result'].eq('affirmed in part')]
drop_list3 = df[df['Result'].eq('dismissed')]

In [19]:
# Pool the index labels of all three selections into a single list
drop_list = [*drop_list1.index, *drop_list2.index, *drop_list3.index]

In [20]:
# Keep only rows whose label is not 'error', 'affirmed in part', or
# 'dismissed', then renumber the index. Filtering on the label values (instead
# of dropping the index labels saved in drop_list) keeps this cell safe to
# re-run: once the index has been reset, those saved labels would refer to
# different rows.
df = df[~df['Result'].isin(['error', 'affirmed in part', 'dismissed'])].reset_index(drop=True)

In [21]:
# Check which labels remain after the drop
df.Result.value_counts()

affirmed    2158
reversed     877
Name: Result, dtype: int64

In [27]:
# Re-check the row count and columns after filtering
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3035 entries, 0 to 3034
Data columns (total 3 columns):
Opinion         3035 non-null object
File_Numbers    3035 non-null object
Result          3035 non-null object
dtypes: object(3)
memory usage: 71.3+ KB


### 3. Create Case-Type Feature By Sorting With Keywords

I created a simple sorting function which takes a dictionary of case types with associated keywords, and then it generates a popularity count of the various keywords, returning the highest-ranking case type for a given opinion. 

In [24]:
# Case types mapped to the keywords whose occurrences count toward them.
# Key order matters: when two case types tie, the one listed first wins.
case_type_dict = {
    'premises': ['premises', 'attractive nuisance', 'dangerous condition', 'slip and fall',
                 'defective condition', 'dog bite'],
    'dram_shop': ['dram shop'],
    'car_crash': ['collision', 'vehicle'],
    'med_mal': ['medical malpractice'],
    # 'recission' is a misspelling kept for backward compatibility; the
    # correctly spelled 'rescission' is added so the term is actually counted.
    'contract': ['contract', 'formation', 'recission', 'consideration', 'rescission'],
    'family_law': ['divorce', 'custody', 'maintenance', 'child support'],
    'wills': ['intestate', 'probate', 'revocable trust', 'irrevocable trust'],
    'landlord_tenant': ['lease', 'landlord', 'security deposit', 'rent', 'chapter 42'],
    'construction': ['building defect', 'water intrusion', 'construction defect'],
    'property': ['easement', 'fee simple', 'tenants in common', 'joint tenants', 'nuisance',
                 'eminent domain', 'escheat', 'replevin'],
    'unfair_deceptive': ['unfair and deceptive', 'chapter 75'],
}

In [106]:
def case_type_sorter(dict_of_keywords, string, whole_words=False):
    """
    Pick the most likely case type for an opinion by counting keywords.

    Each case type scores one point per occurrence of any of its keywords
    in the text; the case type with the most points is returned together
    with its share of all points.

    Parameters
    ----------
    dict_of_keywords : dict
        Maps each case type (key) to a list of keyword strings (value).
    string : str
        Text of a single opinion.
    whole_words : bool, default False
        If False, keywords are counted as plain substrings, so 'rent' is
        also counted inside 'current' or 'parent'. If True, a keyword only
        counts where it is delimited by word boundaries.

    Returns
    -------
    tuple of (str, str)
        The winning case type and its confidence as a percentage string
        such as '64.71%'. When no keyword occurs at all, the first case
        type in the dictionary is returned with confidence 'n/a'. Ties go
        to the case type that appears first in the dictionary.

    Raises
    ------
    ValueError
        If dict_of_keywords is empty.
    """
    counter_dict = {}

    # Score every case type by the total frequency of its keywords
    for case_type, keywords in dict_of_keywords.items():
        if whole_words:
            counter_dict[case_type] = sum(
                len(re.findall(r"\b" + re.escape(keyword) + r"\b", string))
                for keyword in keywords
            )
        else:
            counter_dict[case_type] = sum(string.count(keyword) for keyword in keywords)

    # Total points across all case types (denominator of the confidence)
    total_count = sum(counter_dict.values())

    likely_case_type = max(counter_dict, key=counter_dict.get)

    # With zero total points there is no meaningful share to report
    if total_count:
        confidence = str(round((counter_dict[likely_case_type]/total_count)*100,2))+'%'
    else:
        confidence = 'n/a'

    return likely_case_type, confidence


In [107]:
# Spot-check the sorter on the first opinion
case_type_sorter(case_type_dict, df.Opinion[0])

('property', '64.71%')

In [109]:
# Apply case_type and confidence level to DataFrame

# Classify each opinion once, then split the (case type, confidence) pairs
sorted_cases = [case_type_sorter(case_type_dict, df.Opinion[row])
                for row in range(len(df.Opinion))]
case_type_list = [pair[0] for pair in sorted_cases]
case_type_confidence = [pair[1] for pair in sorted_cases]

df["Case_Type"] = pd.Series(case_type_list).values
df["Case Type Confidence"] = pd.Series(case_type_confidence).values

In [111]:
# Fixed seed so the same 20 rows are shown on every run
df.sample(20, random_state=42)

Unnamed: 0,Opinion,File_Numbers,Result,Case_Type,Case Type Confidence
1158,"joyce otto, plaintiff, v. daniel and kimberly ...",10-172,reversed,landlord_tenant,55.56%
2326,an unpublished opinion of the north carolina c...,01-723,reversed,contract,91.67%
794,an unpublished opinion of the north carolina c...,06-780,affirmed,contract,100.0%
2535,no. coa12-1037 north carolina court of appeal...,12-1037,affirmed,contract,52.94%
110,"john p. reidy, and wife, terri l. reidy, plain...",06-1310,affirmed,contract,94.74%
2418,no. coa12-1378 north carolina court of appeals...,12-1378,affirmed,landlord_tenant,57.14%
1072,an unpublished opinion of the north carolina c...,05-797,affirmed,property,100.0%
1595,an unpublished opinion of the north carolina ...,13-1273,affirmed,premises,
1330,"wendell justin williamson, plaintiff v. myron ...",99-813,reversed,landlord_tenant,50.0%
974,"haley amanda lail, a minor by and through her ...",06-1244,affirmed,contract,91.67%
