# Authorship Attribution

We have etracted 1796 slip opinions by SCOTUS, each labeled with their 
author, the opinion type (concurrence vs. dissent vs. statement, etc.),
the date, case number, and text.

In [41]:
import pickle
import pandas as pd
import numpy as np
import spacy
import re
import matplotlib.pyplot as plt
import seaborn as sns

with open('./data/all_slip_opinions.pkl', 'rb') as f:
    data = pickle.load(f)



### Prepare the data

In [42]:
author_col, case_col, text_col, date_col, release_type_col, opinion_type_col = [], [], [], [], [], []

for term, releases in data.items():
    for release in releases:
        for opinion in release.opinions:
            author_col.append(opinion.author.name)
            case_col.append(release.case_number)
            text_col.append(opinion.text)
            date_col.append(release.date)
            release_type_col.append(release.document_type)
            opinion_type_col.append(opinion.type.name)
            

In [43]:
from omg_scotus.justice import Justice, create_court
def get_ideology(name:str):
    court = create_court(current=False)
    for j in court:
        if j.last_name.upper() == name:
            return j.ideology.name
    return np.nan
    

In [44]:
from uuid import UUID


df = pd.DataFrame({'text': text_col, 'case': case_col, 'date': date_col, 'opinion_type': opinion_type_col, 'release_type': release_type_col, 'label': author_col})
df['date'] = pd.to_datetime(df['date'])
df['term'] = df['date'].dt.year
df['ideology'] = df['label'].apply(get_ideology)
df['uid'] = df[['case', 'date', 'opinion_type', 'label']].sum(axis=1).map(hash)
df.head()

  df['uid'] = df[['case', 'date', 'opinion_type', 'label']].sum(axis=1).map(hash)


Unnamed: 0,text,case,date,opinion_type,release_type,label,term,ideology,uid
0,SUPREME COURT OF THE UNITED STATES \nNATALIE E...,No JSON case data. Case too old.,2012-09-25,PER_CURIAM,DocumentType.SLIP_OPINION,PER_CURIAM,2012,,0
1,SUPREME COURT OF THE UNITED STATES \n_________...,No JSON case data. Case too old.,2012-06-28,PER_CURIAM,DocumentType.SLIP_OPINION,PER_CURIAM,2012,,0
2,SUPREME COURT OF THE UNITED STATES \n_________...,No JSON case data. Case too old.,2012-06-28,PLURALITY,DocumentType.SLIP_OPINION,KENNEDY,2012,CONSERVATIVE,0
3,SUPREME COURT OF THE UNITED STATES \n_________...,No JSON case data. Case too old.,2012-06-28,CONCURRENCE,DocumentType.SLIP_OPINION,BREYER,2012,LIBERAL,0
4,SUPREME COURT OF THE UNITED STATES \n_________...,No JSON case data. Case too old.,2012-06-28,DISSENT,DocumentType.SLIP_OPINION,ALITO,2012,CONSERVATIVE,0


In [45]:
df.label.value_counts()

THOMAS        300
ALITO         223
SOTOMAYOR     211
BREYER        187
GINSBURG      144
ROBERTS       131
PER_CURIAM    122
KAGAN         119
SCALIA        102
GORSUCH        97
KENNEDY        82
KAVANAUGH      58
BARRETT        20
Name: label, dtype: int64

In [138]:
term_counts_by_author = df.pivot_table(index='label', columns='term', values='uid', aggfunc='count', fill_value=0, margins=True)
term_counts_by_author

term,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,All
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ALITO,0,18,25,19,30,20,17,15,20,27,16,16,223
BARRETT,0,0,0,0,0,0,0,0,0,0,8,12,20
BREYER,0,22,18,14,17,16,17,19,19,17,12,16,187
GINSBURG,1,20,18,15,13,17,18,14,15,13,0,0,144
GORSUCH,0,0,0,0,0,0,5,17,22,15,18,20,97
KAGAN,1,10,12,10,10,12,8,9,12,10,11,14,119
KAVANAUGH,0,0,0,0,0,0,0,0,14,16,13,15,58
KENNEDY,0,10,14,10,13,13,9,13,0,0,0,0,82
PER_CURIAM,4,12,8,9,8,17,11,14,8,14,12,5,122
ROBERTS,0,12,17,13,14,10,10,12,11,10,12,10,131


# NAIVE AF - Authorship

In [46]:
from omg_scotus.justice import JusticeTag
from omg_scotus.helpers import remove_hyphenation
def remove_justice_names_from_str(s: str):
    s = re.sub(r'|'.join([j.name for j in JusticeTag] + ['PER CURIAM']), repl='', string=s)
    return s

In [54]:
# Remove hyphenation from text
df['text'] = df['text'].apply(remove_hyphenation)

# Remove justice names from text (don't want the algorithm to know)
df['text'] = df['text'].apply(remove_justice_names_from_str)


In [48]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [49]:
pipe = Pipeline([('tfidf', TfidfVectorizer()), ('clf', RandomForestClassifier())])

In [55]:
X = df[df['label'].ne('PER_CURIAM')]['text']
y = df[df['label'].ne('PER_CURIAM')]['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, X_test.shape

((1171,), (503,))

In [56]:
pipe.fit(X_train, y_train)

In [57]:
y_pred = pipe.predict(X_test)

In [58]:
print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

       ALITO       0.48      0.49      0.48        74
     BARRETT       0.00      0.00      0.00        10
      BREYER       0.72      0.73      0.72        52
    GINSBURG       0.76      0.40      0.52        40
     GORSUCH       1.00      0.76      0.87        34
       KAGAN       0.93      0.57      0.70        46
   KAVANAUGH       0.00      0.00      0.00        14
     KENNEDY       0.00      0.00      0.00        23
     ROBERTS       0.54      0.21      0.30        34
      SCALIA       0.50      0.06      0.10        18
   SOTOMAYOR       0.38      0.23      0.29        69
      THOMAS       0.33      0.89      0.48        89

    accuracy                           0.49       503
   macro avg       0.47      0.36      0.37       503
weighted avg       0.52      0.49      0.46       503



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# NAIVE AF - Ideology

In [33]:
X = df[df['label'].ne('PER_CURIAM')]['text']
y = df[df['label'].ne('PER_CURIAM')]['ideology']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [35]:
pipe.fit(X_train, y_train)

In [36]:
y_pred = pipe.predict(X_test)

In [37]:
print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

CONSERVATIVE       0.67      0.99      0.80       296
     LIBERAL       0.94      0.31      0.47       207

    accuracy                           0.71       503
   macro avg       0.81      0.65      0.64       503
weighted avg       0.78      0.71      0.66       503



In [39]:
pipe = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LogisticRegression())])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

CONSERVATIVE       0.63      0.95      0.76       296
     LIBERAL       0.75      0.21      0.33       207

    accuracy                           0.65       503
   macro avg       0.69      0.58      0.54       503
weighted avg       0.68      0.65      0.58       503



In [40]:
df.ideology.value_counts()

CONSERVATIVE    1013
LIBERAL          661
Name: ideology, dtype: int64