# Uninstalling and reinstalling transformers because of a key error when trying to use the Mistral model

In [15]:
!pip uninstall transformers -y

Found existing installation: transformers 4.35.2
Uninstalling transformers-4.35.2:
  Successfully uninstalled transformers-4.35.2
[0m

In [22]:
!pip install git+https://github.com/huggingface/transformers

[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
# Show the visible GPU, driver/CUDA versions and current GPU memory usage.
!nvidia-smi

Sat Nov 18 18:44:07 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.07             Driver Version: 537.34       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090 Ti     On  | 00000000:06:00.0  On |                  Off |
|  0%   40C    P8              36W / 450W |    924MiB / 24564MiB |     14%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import os

from dotenv import load_dotenv
# Load variables from a local .env file, then read the Hugging Face token
# from the HF_AUTH_TOKEN environment variable (None when it is not set).
load_dotenv()
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HF_AUTH_TOKEN")

In [3]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub using the token read from the environment.
login(token=HUGGINGFACEHUB_API_TOKEN)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
# Make CUDA the default device for tensors created from this point on.
torch.set_default_device('cuda')

In [5]:
# model = AutoModelForCausalLM.from_pretrained("Open-Orca/Mistral-7B-OpenOrca",
#                                              torch_dtype="auto")

# tokenizer = AutoTokenizer.from_pretrained("Open-Orca/Mistral-7B-OpenOrca",
#                                           torch_dtype="auto")

In [6]:
# Save models and tokenizer locally
# model.save_pretrained('./Mistral-7B-OpenOrca')
# tokenizer.save_pretrained('./Mistral-7B-OpenOrca')

# Loading the model locally
## - uses around 16 GB of GPU memory

In [7]:
# Directory that holds the locally saved copy of the model and tokenizer.
model_directory = "./Mistral-7B-OpenOrca"

# Tokenizer first, then the weights; torch_dtype="auto" lets transformers pick
# the dtype instead of passing one explicitly.
tokenizer = AutoTokenizer.from_pretrained(model_directory)
model = AutoModelForCausalLM.from_pretrained(model_directory, torch_dtype="auto")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

# defining some functions and variables for the model to use

In [15]:
# Sample prompt using the <|im_start|> / <|im_end|> chat markers.
# Each line ends with an escaped "\n" AND a literal line break, so the
# resulting string contains a blank line between consecutive segments.
text = """<|im_start|>system\n
You are MistralOrca, a large language model trained by Alignment Lab AI. Write out your reasoning step-by-step to be sure you get the right answers!\n
<|im_end|>\n
<|im_start|>user\n
what is the meaning of life?\n
<|im_end|>"""

# Tokenize the sample prompt into PyTorch tensors without adding special tokens.
encodeds = tokenizer(text, return_tensors="pt", add_special_tokens=False)

# Target device name; generate() reads this module-level variable.
device = 'cuda'

In [33]:
import textwrap

def wrap_text(text, width=90): #preserve_newlines
    """Wrap ``text`` to ``width`` columns while keeping its existing line breaks.

    The input is split on newline characters, every resulting line is wrapped
    on its own with ``textwrap.fill``, and the pieces are joined back together
    with newlines.

    Args:
        text: The string to wrap.
        width: Maximum line width passed to ``textwrap.fill``.

    Returns:
        The wrapped string.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

def generate(input_text, system_prompt="", max_length=1024):
    """Generate a completion for one user message and return it line-wrapped.

    Builds a prompt from an optional system message and the user message,
    samples a continuation from the module-level ``model``, decodes only the
    newly generated tokens and wraps the result with ``wrap_text``.

    Args:
        input_text: The user message placed in the ``user`` turn.
        system_prompt: Optional system message; no system turn is added when
            it is the empty string.
        max_length: Passed to ``model.generate`` as ``max_length`` (the limit
            covers prompt tokens plus generated tokens).

    Returns:
        The generated text (special tokens such as ``<|im_end|>`` are kept),
        wrapped by ``wrap_text``.
    """
    # NOTE(review): the turn markers below have a space after <|im_start|> and
    # no trailing assistant turn; left unchanged so model outputs stay
    # comparable — confirm against the model card's prompt template.
    if system_prompt != "":
        system_prompt = f"""<|im_start|> system\n{system_prompt}<|im_end|>"""
    prompt = f"""<|im_start|> user\n{input_text}<|im_end|>"""
    final_prompt = system_prompt + prompt

    # Tokenize THIS prompt and move it to the target device explicitly, instead
    # of touching the unrelated module-level sample encoding.
    inputs = tokenizer(final_prompt, return_tensors="pt", add_special_tokens=False).to(device)
    model.to(device)

    outputs = model.generate(**inputs,
                             max_length=max_length,
                             temperature=0.1,
                             # NOTE(review): 3200 is kept from the original; it may be a
                             # typo for another token id — verify against the tokenizer.
                             pad_token_id=3200,
                             do_sample=True)

    # The returned sequence starts with the prompt tokens; slice them off by
    # length rather than string-replacing the prompt, which only works when the
    # decoded prompt round-trips character-for-character.
    prompt_token_count = inputs["input_ids"].shape[1]
    text = tokenizer.decode(outputs[0][prompt_token_count:])

    return wrap_text(text)

In [48]:
import pandas as pd

csv_file_path = 'extracted_toc_with_conclusion.csv'

df = pd.read_csv(csv_file_path)

# Column 3 (0-based) is the one used as the conclusion text throughout this notebook.
# `first_row` is defined here explicitly; it previously only existed as
# leftover kernel state, so this cell failed on a fresh kernel.
first_row = df.iloc[0, 3]
conclusion = first_row

# Work on a copy so renaming the Series never touches `df` itself.
conclusion_category_column = df.iloc[:, 3].copy()

conclusion_category_column_name = 'Conclusion Category'
conclusion_category_column.name = conclusion_category_column_name

print(first_row)

CONCLUSION 
 
The issue of student-athlete compensation is com-
plex, and it deserves a full, unfettered national discus-
sion to ensure the fair treatment of student-athletes 
while also protecting the matchless beneﬁts that col-
lege athletics provide to institutions of higher educa-
tion, their students, and the public. The injunction below cuts off that important discussion based on an
untenable view of federal antitrust law. This Court
should reverse.



# testing to see if it works

In [49]:
%%time
instruction = "Classify the conclusion into one of those following categories: affirm, deny, reverse, remand, other, or incomplete. Return only the classification. {text}"
instruction = instruction.format(text = conclusion)
generate(instruction,
         system_prompt="You are MistralOrca, a legal expert who specializes in classifying the conclusion types of legal briefs. Write out your short and succinct!",
         max_length=256)

CPU times: user 616 ms, sys: 0 ns, total: 616 ms
Wall time: 652 ms


'\nClassification: Reverse<|im_end|>'

In [193]:
# Release cached GPU memory that PyTorch is holding but not currently using.
torch.cuda.empty_cache()

In [194]:
import gc
# Run a full garbage-collection pass; the value displayed is what gc.collect() returns.
gc.collect()

376

# Classifying the conclusions in the following cells
## the "complex" conclusion categories are as follows:
## -affirm, deny, reverse, remand, part-affirm part-deny, part-affirm part-reverse, part-affirm part-remand, part-deny part-reverse, part-deny part-remand, part-reverse part-remand, other, or incomplete

## There is a "simple" conclusion category as well, but that will be noted in a different section below

In [195]:
import pandas as pd

csv_file_path = 'extracted_toc_with_conclusion.csv'

df = pd.read_csv(csv_file_path)

# Column 3 (0-based) is the one used as the conclusion text throughout this notebook.
# `first_row` is defined here explicitly; it previously only existed as
# leftover kernel state, so this cell failed on a fresh kernel.
first_row = df.iloc[0, 3]
conclusion = first_row

# Copy of the conclusion column; the classification loop overwrites its
# values row by row with the model's output.
conclusion_category_column = df.iloc[:, 3].copy()

conclusion_category_column_name = 'Complex Conclusion Category'
conclusion_category_column.name = conclusion_category_column_name

print(first_row)

CONCLUSION 
 
The issue of student-athlete compensation is com-
plex, and it deserves a full, unfettered national discus-
sion to ensure the fair treatment of student-athletes 
while also protecting the matchless beneﬁts that col-
lege athletics provide to institutions of higher educa-
tion, their students, and the public. The injunction below cuts off that important discussion based on an
untenable view of federal antitrust law. This Court
should reverse.



In [196]:
# Preview the cleaning applied before classification: re-join words that were
# hyphenated across line breaks, then flatten the remaining newlines.
# Uses `conclusion` (defined in the preceding cell) rather than `first_row`,
# which no cell in this notebook defines.
formatted_conclusion = conclusion.replace('-\n', '').replace('\n', ' ').strip()

print(formatted_conclusion)

CONCLUSION    The issue of student-athlete compensation is complex, and it deserves a full, unfettered national discussion to ensure the fair treatment of student-athletes  while also protecting the matchless beneﬁts that college athletics provide to institutions of higher education, their students, and the public. The injunction below cuts off that important discussion based on an untenable view of federal antitrust law. This Court should reverse.


In [197]:
import re

In [198]:
# for index, conclusion in conclusion_category_column.items():
#     # if index > 10:
#     #     break
#     print(index)
#     try:
#         conclusion = conclusion.replace('-\n', '').replace('\n', ' ').strip()
#         conclusion = re.sub(' +', ' ', conclusion)
    
#         # print(conclusion)
    
#         instruction = "Classify the conclusion into one of those following categories: affirm, deny, reverse, remand, part-affirm part-deny, part-affirm part-reverse, part-affirm part-remand, part-deny part-reverse, part-deny part-remand, part-reverse part-remand, other, or incomplete. Return only the classification. {text}"
#         instruction = instruction.format(text = conclusion)
#         classification = generate(instruction,
#              system_prompt="You are MistralOrca, a legal expert who specializes in classifying the conclusion types of legal briefs. Write out your short and succinct!",
#              max_length=1024)
        
#         classification = classification.replace('<|im_end|>', '').strip()
#     except:
#         print('error')
#     conclusion_category_column.at[index] = classification

import re

# Maximum number of generate() calls per row before giving up
# (the original retry counter allowed 11 attempts).
max_attempts = 11

complex_instruction_template = "Classify the conclusion into one of those following categories: affirm, deny, reverse, remand, part-affirm part-deny, part-affirm part-reverse, part-affirm part-remand, part-deny part-reverse, part-deny part-remand, part-reverse part-remand, other, or incomplete. Return only the classification. {text}"
classifier_system_prompt = "You are MistralOrca, a legal expert who specializes in classifying the conclusion types of legal briefs. Write out your short and succinct!"

for index, conclusion in conclusion_category_column.items():
    print(index)

    # read_csv represents a missing conclusion as NaN (a float), which the old
    # `is None` / `== ""` checks never matched; those rows were turned into the
    # string "nan" and sent to the model. Skip them instead.
    if pd.isna(conclusion) or conclusion == "":
        continue

    # Clean the text once per row (not once per retry): re-join words
    # hyphenated across line breaks, flatten newlines, collapse repeated spaces.
    conclusion = str(conclusion).replace('-\n', '').replace('\n', ' ').strip()
    conclusion = re.sub(' +', ' ', conclusion)
    instruction = complex_instruction_template.format(text=conclusion)

    # Start from an empty result so a row whose attempts all fail can never
    # inherit the previous row's classification (or hit an undefined name).
    classification = ""
    for _ in range(max_attempts):
        try:
            candidate = generate(instruction,
                                 system_prompt=classifier_system_prompt,
                                 max_length=1024)
        except Exception as e:
            print('error:', e)
            continue

        candidate = candidate.replace('<|im_end|>', '').strip()
        if candidate:
            # Non-empty answer: keep it and stop retrying.
            classification = candidate
            break

    # Assign the classification
    conclusion_category_column.at[index] = classification

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56




57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317

In [199]:
# Peek at the first 11 rows of the complex classification column.
conclusion_category_column.head(11)

0     The classification for this conclusion is: rev...
1                       The classification is: reverse.
2                        The conclusion type is: affirm
3                      The conclusion type is "affirm".
4                          The conclusion type is: deny
5                          The conclusion type is: deny
6     The classification for this conclusion is: aff...
7                        The conclusion type is: affirm
8                        The conclusion type is: affirm
9     The classification for this conclusion is: affirm
10                       The conclusion type is: affirm
Name: Complex Conclusion Category, dtype: object

In [200]:
# Save the complex classifications as a CSV without the index column.
conclusion_category_column.to_csv('conclusion_classifications_expanded.csv', index=False)

# This is where we had the model classify the conclusions into simpler categories
## - affirm, deny, reverse, remand, other, or incomplete

In [187]:
import pandas as pd

csv_file_path = 'extracted_toc_with_conclusion.csv'

df = pd.read_csv(csv_file_path)

# Column 3 (0-based) is the one used as the conclusion text throughout this notebook.
# `first_row` is defined here explicitly; it previously only existed as
# leftover kernel state, so this cell failed on a fresh kernel.
first_row = df.iloc[0, 3]
conclusion = first_row

# Copy of the conclusion column; the classification loop overwrites its
# values row by row with the model's output.
conclusion_category_column = df.iloc[:, 3].copy()

conclusion_category_column_name = 'Simple Conclusion Category'
conclusion_category_column.name = conclusion_category_column_name

print(first_row)

CONCLUSION 
 
The issue of student-athlete compensation is com-
plex, and it deserves a full, unfettered national discus-
sion to ensure the fair treatment of student-athletes 
while also protecting the matchless beneﬁts that col-
lege athletics provide to institutions of higher educa-
tion, their students, and the public. The injunction below cuts off that important discussion based on an
untenable view of federal antitrust law. This Court
should reverse.



In [188]:
# Preview the cleaning applied before classification: re-join words that were
# hyphenated across line breaks, then flatten the remaining newlines.
# Uses `conclusion` (defined in the preceding cell) rather than `first_row`,
# which no cell in this notebook defines.
formatted_conclusion = conclusion.replace('-\n', '').replace('\n', ' ').strip()

print(formatted_conclusion)

CONCLUSION    The issue of student-athlete compensation is complex, and it deserves a full, unfettered national discussion to ensure the fair treatment of student-athletes  while also protecting the matchless beneﬁts that college athletics provide to institutions of higher education, their students, and the public. The injunction below cuts off that important discussion based on an untenable view of federal antitrust law. This Court should reverse.


In [190]:
import re

# Maximum number of generate() calls per row before giving up
# (the original retry counter allowed 11 attempts).
max_attempts = 11

simple_instruction_template = "Classify the conclusion into one of the following categories: affirm, deny, reverse, remand, other, or incomplete. Return only the classification. {text}"
classifier_system_prompt = "You are MistralOrca, a legal expert who specializes in classifying the conclusion types of legal briefs. Write out your short and succinct!"

for index, conclusion in conclusion_category_column.items():
    print(index)

    # read_csv represents a missing conclusion as NaN (a float), which the old
    # `is None` / `== ""` checks never matched; those rows were turned into the
    # string "nan" and sent to the model. Skip them instead.
    if pd.isna(conclusion) or conclusion == "":
        continue

    # Clean the text once per row (not once per retry): re-join words
    # hyphenated across line breaks, flatten newlines, collapse repeated spaces.
    conclusion = str(conclusion).replace('-\n', '').replace('\n', ' ').strip()
    conclusion = re.sub(' +', ' ', conclusion)
    instruction = simple_instruction_template.format(text=conclusion)

    # Start from an empty result so a row whose attempts all fail can never
    # inherit the previous row's classification (or hit an undefined name).
    classification = ""
    for _ in range(max_attempts):
        try:
            candidate = generate(instruction,
                                 system_prompt=classifier_system_prompt,
                                 max_length=1024)
        except Exception as e:
            print('error:', e)
            continue

        candidate = candidate.replace('<|im_end|>', '').strip()
        if candidate:
            # Non-empty answer: keep it and stop retrying.
            classification = candidate
            break

    # Assign the classification
    conclusion_category_column.at[index] = classification


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56




57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317

In [191]:
# Peek at the first 50 rows of the simple classification column.
conclusion_category_column.head(50)

0     The classification for this conclusion is: rev...
1                               Classification: Reverse
2                                Classification: Affirm
3                      The conclusion type is "affirm".
4                        The conclusion type is "deny".
5                          The conclusion type is: deny
6                        The conclusion type is: affirm
7                        The conclusion type is: affirm
8                                Classification: Affirm
9                                Classification: Affirm
10                               Classification: Affirm
11                              Classification: Reverse
12                              Classification: Reverse
13                      The conclusion type is: reverse
14                               Classification: Affirm
15                       The conclusion type is: affirm
16                          The classification is: deny
17                         The conclusion is to 

In [192]:
# Save the simple classifications as a CSV without the index column.
conclusion_category_column.to_csv('conclusion_classifications_simple.csv', index=False)

In [202]:
import pandas as pd

df1 = pd.read_csv('extracted_toc_with_conclusion.csv').reset_index(drop=True)

# The classification files were written with a header row (the Series name).
# Reading them with header=None turned that header into the first data row and
# named the column 0, shifting every value down by one row. Use the default
# header handling so names and rows line up with df1.
df2 = pd.read_csv('conclusion_classifications_simple.csv')
# The complex classifications are saved by this notebook as
# 'conclusion_classifications_expanded.csv'; no cell writes a '..._complex.csv'.
df3 = pd.read_csv('conclusion_classifications_expanded.csv')

# Column-wise concat aligns on the row index, so the frames must have the same
# number of rows for each classification to sit next to its source row.
assert len(df1) == len(df2) == len(df3), (
    f"row count mismatch: source={len(df1)}, simple={len(df2)}, complex={len(df3)}"
)

# Combine the DataFrames
combined_df = pd.concat([df1, df2, df3], axis=1)

# Save the combined DataFrame to a new CSV file
combined_df.to_csv('extracted_toc_with_conclusion_classifications.csv', index=False)


In [207]:
# Release PyTorch's cached, unused GPU memory and run a garbage-collection pass.
# NOTE(review): calling gc.collect() before empty_cache() would presumably let
# memory held by just-collected objects be released as well — confirm.
torch.cuda.empty_cache()
gc.collect()

0

# Model Assisted Argument Extraction

In [208]:
# Re-read the source data and take a named copy of column 1 (0-based), which
# the extraction loop treats as the table-of-contents text and overwrites
# with the model's output.
arguments_column_name = 'Arguments'
df1 = pd.read_csv('extracted_toc_with_conclusion.csv')
arguments_column = df1.iloc[:, 1].copy().rename(arguments_column_name)

In [209]:
import re

# Maximum number of generate() calls per row before giving up
# (the original retry counter allowed 11 attempts).
max_attempts = 11

arguments_instruction_template = "Extract the explicitly stated arguments from the following table of contents. Return only the arguments or 'None' if there are none. {text}"
arguments_system_prompt = "You are MistralOrca, a legal expert who specializes in extracting arguments of legal briefs. Write your answer short and succinct!"

for index, toc in arguments_column.items():
    print(index)

    # read_csv represents a missing table of contents as NaN (a float), which
    # the old `is None` / `== ""` checks never matched; those rows were turned
    # into the string "nan" and sent to the model. Skip them instead.
    if pd.isna(toc) or toc == "":
        continue

    # Clean the text once per row (not once per retry): re-join words
    # hyphenated across line breaks, flatten newlines, collapse repeated spaces.
    toc = str(toc).replace('-\n', '').replace('\n', ' ').strip()
    toc = re.sub(' +', ' ', toc)
    instruction = arguments_instruction_template.format(text=toc)

    # Start from an empty result so a row whose attempts all fail can never
    # inherit the previous row's arguments (or hit an undefined name).
    arguments = ""
    for _ in range(max_attempts):
        try:
            candidate = generate(instruction,
                                 system_prompt=arguments_system_prompt,
                                 max_length=2048)
        except Exception as e:
            print('error:', e)
            continue

        candidate = candidate.replace('<|im_end|>', '').strip()
        if candidate:
            # Non-empty answer: keep it and stop retrying.
            arguments = candidate
            break

    # Assign the argument
    arguments_column.at[index] = arguments

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167




168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317


In [210]:
# Peek at the first 10 rows of extracted arguments.
arguments_column.head(10)

0    The explicitly stated arguments are:\n\n1. The...
1    The explicitly stated arguments are:\n\n1. The...
2    From the provided table of contents, the expli...
3    The explicitly stated arguments are:\n\n1. The...
4    The explicitly stated arguments are:\n\n1. Thi...
5    From the table of contents, the explicitly sta...
6    The explicitly stated arguments are:\n\n1. The...
7                                                     
8    The explicitly stated arguments are:\n\n1. The...
9    The explicitly stated arguments are:\n1. The N...
Name: Arguments, dtype: object

In [211]:
# Save the extracted arguments as a CSV without the index column.
arguments_column.to_csv('extracted_arguments.csv', index=False)

In [212]:
import pandas as pd

df1 = pd.read_csv('extracted_toc_with_conclusion_classifications.csv').reset_index(drop=True)

# 'extracted_arguments.csv' was written with a header row (the Series name).
# Reading it with header=None turned that header into the first data row and
# named the column 0, shifting every value down by one row. Use the default
# header handling so the column keeps its name and lines up with df1.
df2 = pd.read_csv('extracted_arguments.csv')

# Combine the DataFrames
combined_df = pd.concat([df1, df2], axis=1)

# Save the combined DataFrame to a new CSV file
combined_df.to_csv('extracted_toc_with_classifications_and_arguments.csv', index=False)


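# Note:
## - Even after adding all of these columns to the saved CSV, I manually removed a couple of cells to shift the added columns up.
## - This was needed because each added DataFrame came in with '0' as its column name and the actual column name as its first value,
## - so the added columns were off by one row.
## - This happens when the single-column CSVs (which are written with a header row) are read back with `header=None`; reading them with the default header avoids the shift. At the time, I had not gone into the code to fix that bug and just manually removed the '0'.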