In [None]:
import requests

def getOpinion(opinion_url):
  response = requests.get(opinion_url)
  if response.status_code == 200:
    data = response.json()
    return data['casebody']['opinions'][0]['text']
  return

In [None]:
def getVolumeOpinions(volume):
  url = f"https://static.case.law/ad3d/{volume}/cases/"
  opinions = []
  response = requests.get(url)

  if response.status_code == 200:
      html_content = response.text
      soup = BeautifulSoup(html_content, 'html.parser')
      links = soup.find_all('a')
      for link in links[3:]:
        href = link.get('href')
        if href:  # Some links might not have the href attribute
          print(href)
          opinions.append(getOpinion(href))
  strings = [string.replace('\n', ' ') for string in opinions]
  with open(f'{volume}.txt', 'w') as file:
    for opinion in strings:
      file.write(opinion+'\n')

In [None]:
from bs4 import BeautifulSoup
import threading
ids = [1,2,3,4,5,6,7,8,9,10,150,151,152,153,154,155,156,157]
def process_volume_ids(ids):
    threads = []
    for volume in ids:
        thread = threading.Thread(target=getVolumeOpinions, args=(volume,))
        thread.start()
        threads.append(thread)

    # Wait for all threads to finish
    for thread in threads:
        thread.join()

process_volume_ids(ids)

In [2]:
##PREPROCESSING

from gensim.models import Word2Vec, FastText
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import nltk
import re
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

MIN_FREQ = 25

# COLLECT CORPUS
def get_corpus(corpus_file):
  with open(corpus_file, "r", encoding="utf-8") as file:
    corpus = file.readlines()
  return corpus
# REMOVE PUNCTUATION CHARACTERS
def remove_punctuation(corpus):
    punctuation_pattern = re.compile(r'[^\w\s<>_]')
    # Iterate through each sentence in the corpus and remove punctuation using regex
    cleaned_corpus = []
    for sentence in corpus:
        cleaned_sentence = punctuation_pattern.sub('', sentence)
        cleaned_sentence = re.compile('\sv\s').sub('',cleaned_sentence)
        cleaned_corpus.append(cleaned_sentence)
    return cleaned_corpus
def replaceDate(corpus_file):
    full_date_pattern = re.compile('\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}\b')
    year_pattern = re.compile('\d{4}')
    case_id = re.compile('\d+\s+(?:AD|NY)2d\s+\d+(?:,\s+\d+)?\s+\[\d+\]')
    cleaned_corpus = []
    for sentence in corpus:
        cleaned_sentence = case_id.sub('full_date', sentence)
        cleaned_sentence = full_date_pattern.sub('full_date', cleaned_sentence)
        cleaned_sentence = year_pattern.sub('year', cleaned_sentence)

        cleaned_corpus.append(cleaned_sentence)
    return cleaned_corpus
#TOKENIZATION
def get_tokens(corpus):
  return [word_tokenize(sentence.lower()) for sentence in corpus]

#STEMMING
def stem_tokens(corpus):
  stemmer = PorterStemmer()
  stemmed_corpus = [[stemmer.stem(word) for word in tokens] for tokens in corpus]
  return stemmed_corpus

def remove_stopwords(corpus):
  stop_words = set(stopwords.words('english'))
  unstopped_corpus = [[word for word in tokens if word not in stop_words] for tokens in corpus]
  return unstopped_corpus

def remove_below_freq(corpus, min_freq):
  all_words = [word for sentence in corpus for word in sentence]
  word_freq = FreqDist(all_words)
  filtered_corpus = [[word for word in sentence if word_freq[word] > min_freq] for sentence in corpus]
  return filtered_corpus


corpus = get_corpus('1.txt')
corpus = replaceDate(corpus)
corpus = remove_punctuation(corpus)
corpus = get_tokens(corpus)
corpus = remove_stopwords(corpus)
corpus = stem_tokens(corpus)
corpus = remove_below_freq(corpus,MIN_FREQ)

w2v = Word2Vec(sentences=corpus , vector_size=100, window=5, min_count=1, workers=4)
w2v.save("word2vec.model")

ft = FastText(sentences=corpus , vector_size=100, window=5, min_count=1, workers=4)
ft.save("ft.model")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import pandas as pd
vocab = w2v.wv.key_to_index

vocab_list = []
most_sim_list = []

for word in w2v.wv.key_to_index:
  vocab_list.append(word)
  most_similar_words = w2v.wv.most_similar(word, topn=5)
  most_sim_list.append([w for w, _ in most_similar_words])
df = pd.DataFrame({'Vocabulary': vocab_list, 'Most Similar': most_sim_list})
display(df)

Unnamed: 0,Vocabulary,Most Similar
0,full_dat,"[affd, lv, see, cf, also]"
1,year,"[9, 3, nycrr, 14, 28]"
2,defend,"[contend, asid, respect, suppress, indict]"
3,court,"[suprem, er, remit, westchest, counti]"
4,see,"[ny2d, cf, full_dat, affd, ad2d]"
...,...,...
1184,primarili,"[opportun, unless, ration, clear, possibl]"
1185,supervisor,"[ultim, site, undisput, client, earli]"
1186,larceni,"[weapon, rape, burglari, contempt, aggrav]"
1187,financ,"[east, estat, ad, hold, kraumlap]"




In [None]:
import random
random_sentence = random.choice(corpus)
random_word = random.choice(random_sentence)

similar_words = ft.wv.most_similar(random_word)
print(f"Similar words to '{random_word}':", similar_words)

Similar words to 'order': [('judg', 0.9572159051895142), ('appeal', 0.9413971304893494), ('bring', 0.91146320104599), ('adjudg', 0.9093562960624695), ('much', 0.8965600728988647), ('monro', 0.8913715481758118), ('memorandum', 0.8893234133720398), ('modifi', 0.8814156651496887), ('herebi', 0.8631449937820435), ('modif', 0.8573394417762756)]


In [None]:
def getConceptVector(word_a,word_b, model):
  vector_a = model.wv[word_a]
  vector_b = model.wv[word_b]
  return vector_b - vector_a

def getClosestConceptMatch(concept, word, model):
  result_vector = concept + model.wv[word]
  most_sim_list = []
  most_similar_words = model.wv.most_similar(result_vector, topn=3)
  most_sim_list= [w for w, _ in most_similar_words]
  return most_sim_list

def getSum(word_a,word_b):
  vector_a = model.wv[word_a]
  vector_b = model.wv[word_b]
  return vector_b + vector_a


def pickRandomWord():
  a = random.choice(list(model.wv.key_to_index.keys()))
  return a

def getRandomConceptMatch():
  a = picked_word
  b = pickRandomWord()
  c = pickRandomWord()
  concept = getConceptVector(a,b, w2v)
  d = getClosestConceptMatch(concept,c, w2v)
  concept = getConceptVector(a,b, ft)
  e = getClosestConceptMatch(concept,c, ft)
  return [a,b,c,d,e]

def getConceptMatch(a,b,c):
  stemmer = PorterStemmer()
  a = stemmer.stem(a)
  b = stemmer.stem(b)
  c = stemmer.stem(c)
  concept = getConceptVector(a,b, w2v)
  d = ",".join(getClosestConceptMatch(concept,c, w2v))
  concept = getConceptVector(a,b, ft)
  e = ",".join(getClosestConceptMatch(concept,c, ft))
  return [a,b,c,d,e]

In [None]:
combinations = [
    ['complaint','hospital','cause'],
    ['defend','plaintiff','judgment'],
    ['motion','appeal','supreme'],
    ['term','operation','murder'],
    ['file','divorce','Statements'],
    ['violation','products','warning'],
    ['reversed','sentence','recommendation'],
    ['criminal', 'guilty','victim'],
    ['verdict','prison','guilt'],
    ['prove','offense','term'],
    ['recommend', 'warning', 'damages']
]
model = w2v
test_df = pd.DataFrame(columns=['A', 'B', 'C', 'W2V', 'FastText'])
for combo in combinations:
  test_df.loc[len(test_df)] = getConceptMatch(combo[0],combo[1],combo[2])
display(test_df)

Unnamed: 0,A,B,C,W2V,FastText
0,complaint,hospit,caus,"victim,use,condit","accid,use,obtain"
1,defend,plaintiff,judgment,"counterclaim,reinstat,judgment","insofar,ensu,reinstat"
2,motion,appeal,suprem,"westchest,queen,king","queen,king,peter"
3,term,oper,murder,"condit,defect,knowledg","oper,plan,futur"
4,file,divorc,statement,"evid,view,suffici","favor,identif,suffici"
5,violat,product,warn,"nation,sidewalk,opposit","liabl,opposit,demonstr"
6,revers,sentenc,recommend,"grand,charg,plea","guilti,plea,jurisdict"
7,crimin,guilti,victim,"charg,admit,allocut","asid,trial,evict"
8,verdict,prison,guilt,"area,care,particip","five,offens,crime"
9,prove,offens,term,"term,feloni,offend","term,feloni,imprison"


In [21]:
from gensim.test.utils import datapath
from gensim import utils
import matplotlib.pyplot as plt
import pandas as pd

word_pairs = w2v.wv.evaluate_word_pairs(datapath('wordsim353.tsv'))
print(word_pairs)
# Extract similarity scores
pearson_result = word_pairs[0]
significance_result = word_pairs[1]
confidence_interval = (word_pairs[2])

# Plotting
labels = ['Pearson Correlation', 'P-Value','Confidence (%)']
values = [pearson_result[0], significance_result[0], confidence_interval]
errors = [pearson_result[1], significance_result[1],""]


# Create DataFrame
df = pd.DataFrame({'Word2Vec': labels, 'Values': values, 'Errors': errors})
display(df)


(PearsonRResult(statistic=0.3210742943310341, pvalue=0.24326824659185078), SignificanceResult(statistic=0.35094015854120253, pvalue=0.19965492823843248), 95.75070821529745)


Unnamed: 0,Word2Vec,Values,Errors
0,Pearson Correlation,0.321074,0.243268
1,P-Value,0.35094,0.199655
2,Confidence (%),95.750708,


In [24]:
word_pairs = ft.wv.evaluate_word_pairs(datapath('wordsim353.tsv'))
print(word_pairs)
# Extract similarity scores
pearson_result = word_pairs[0]
significance_result = word_pairs[1]
confidence_interval = (word_pairs[2])

# Plotting
labels = ['Pearson Correlation', 'P-Value','Confidence (%)']
values = [pearson_result[0], significance_result[0], confidence_interval]
errors = [pearson_result[1], significance_result[1],""]


# Create DataFrame
df = pd.DataFrame({'FastText': labels, 'Values': values, 'Errors': errors})
display(df)

(PearsonRResult(statistic=0.3663114313382591, pvalue=0.17930991172895408), SignificanceResult(statistic=0.24709051978921398, pvalue=0.37462820977214173), 95.75070821529745)


Unnamed: 0,FastText,Values,Errors
0,Pearson Correlation,0.366311,0.17931
1,P-Value,0.247091,0.374628
2,Confidence (%),95.750708,
