In [18]:
import polars as pl
from pathlib import Path
import re
import json
import random

In [22]:
# Reading raw data
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs where that exact location exists — consider a path relative to a
# configurable data directory.
data_path = Path(r"C:\Users\User\Projects\RAG\data\patent_samples.csv")
df = pl.read_csv(data_path)
# Show the loaded frame as the cell output.
df


id,app_doc_number,invention_title,abstract,description,claims
i64,i64,str,str,str,str
415065,13538195,"""Programmable low power multi-m…","""  Disclosed is a method and ap…","""  BACKGROUND 1. Field of th…","""  1. A multi-modulus divider…"
550748,13248165,"""Hearing aid device for frequen…","""  With a hearing aid device su…","""  CROSS-REFERENCE TO RELATED …","""  1. A method for frequency …"
110416,12569987,"""Battery cooling apparatus for …","""  A battery pack apparatus has…","""  BACKGROUND The present di…","""  1. A battery pack apparatu…"
619075,13656105,"""Methods of inhibiting cyanobac…","""  This invention offers an eff…","""  BACKGROUND OF THE INVENTION…","""  1. A method of inhibiting …"
597454,14104237,"""Method for welding gold-silico…","""  Relating to electronic compo…","""  CROSS-REFERENCE TO RELATED …","""  1. A method for welding a …"
…,…,…,…,…,…
468352,13335140,"""Continuous recovery system for…","""  A continuous recovery system…","""  FEDERALLY SPONSORED RESEARC…","""  1. A continuous recovery s…"
374985,13762177,"""Cement compositions with a hig…","""  A method of cementing in a s…","""  CROSS-REFERENCE TO RELATED …","""  1. A cement composition co…"
316450,13135745,"""Use of titanium-based material…","""  Compositions containing meta…","""  RELATED APPLICATIONS This…","""  1. A process for treating …"
604579,13778244,"""Lamp""","""  A lamp includes a housing ha…","""  BACKGROUND OF THE INVENTION…","""  1. A lamp comprising:  a h…"


In [10]:
# Peek at the raw 'description' value of the row at index 1 before any cleaning.
df[1]['description']

description
str
"""  CROSS-REFERENCE TO RELATED …"


In [23]:
# Preprocess raw data: normalise whitespace and remove non-printable characters only
def preprocess_str(text):
    """Return ``text`` with whitespace normalised and non-printable characters removed.

    Steps, in order:
      1. every run of whitespace (spaces, tabs, newlines, ...) becomes one space;
      2. characters for which ``str.isprintable()`` is false are dropped;
      3. runs of spaces produced by step 2 are collapsed again;
      4. leading/trailing whitespace is stripped.

    Args:
        text: The string to clean, or ``None`` for a missing value.

    Returns:
        The cleaned string, or ``None`` when ``text`` is ``None`` so that
        missing values stay missing instead of raising ``TypeError``.
    """
    if text is None:
        return None
    # Collapse whitespace first: tabs/newlines are not printable, so filtering
    # them directly would glue the neighbouring words together.
    collapsed = re.sub(r'\s+', ' ', text)
    printable_only = ''.join(c for c in collapsed if c.isprintable())
    # Dropping a character that sat between two spaces leaves "  " behind,
    # so squeeze space runs once more before trimming.
    return re.sub(r' {2,}', ' ', printable_only).strip()

In [24]:
# Build one cleaned record per input row. Only the long free-text fields
# (abstract, description, claims) go through preprocess_str; id, document
# number and title are copied over unchanged.
new_list = [
    {
        'id': row['id'],
        'doc_num': row['app_doc_number'],
        'title': row['invention_title'],
        'abstract': preprocess_str(row['abstract']),
        'description': preprocess_str(row['description']),
        'claims': preprocess_str(row['claims']),
    }
    for row in df.iter_rows(named=True)
]

In [14]:
# Sanity check: number of preprocessed records built above.
len(new_list)

5000

In [25]:
# Data after preprocessing
# Column names for the cleaned frame; they match the keys of the dicts in new_list.
new_df_cols = ['id', 'doc_num', 'title', 'abstract', 'description', 'claims']
new_df = pl.DataFrame(new_list, schema=new_df_cols)
# Show the cleaned frame as the cell output.
new_df

id,doc_num,title,abstract,description,claims
i64,i64,str,str,str,str
415065,13538195,"""Programmable low power multi-m…","""Disclosed is a method and appa…","""BACKGROUND 1. Field of the Dis…","""1. A multi-modulus divider (MM…"
550748,13248165,"""Hearing aid device for frequen…","""With a hearing aid device suit…","""CROSS-REFERENCE TO RELATED APP…","""1. A method for frequency comp…"
110416,12569987,"""Battery cooling apparatus for …","""A battery pack apparatus has a…","""BACKGROUND The present disclos…","""1. A battery pack apparatus co…"
619075,13656105,"""Methods of inhibiting cyanobac…","""This invention offers an effec…","""BACKGROUND OF THE INVENTION 1.…","""1. A method of inhibiting the …"
597454,14104237,"""Method for welding gold-silico…","""Relating to electronic compone…","""CROSS-REFERENCE TO RELATED APP…","""1. A method for welding a gold…"
…,…,…,…,…,…
468352,13335140,"""Continuous recovery system for…","""A continuous recovery system f…","""FEDERALLY SPONSORED RESEARCH O…","""1. A continuous recovery syste…"
374985,13762177,"""Cement compositions with a hig…","""A method of cementing in a sub…","""CROSS-REFERENCE TO RELATED APP…","""1. A cement composition compri…"
316450,13135745,"""Use of titanium-based material…","""Compositions containing metal …","""RELATED APPLICATIONS This appl…","""1. A process for treating oral…"
604579,13778244,"""Lamp""","""A lamp includes a housing havi…","""BACKGROUND OF THE INVENTION 1.…","""1. A lamp comprising: a housin…"


In [8]:
# Save the preprocessed records as JSON
write_path = r"./data/patent_samples.json"
# Open write_path itself rather than a second hard-coded copy of the location:
# a backslash-separated, non-raw literal contains invalid escape sequences
# ("\d", "\p") and only resolves correctly on Windows.
with open(write_path, "w", encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    json.dump(new_list, f, ensure_ascii=False, indent=4)

In [3]:
# Load the labelled patent records from the JSON file in the working directory.
label_file = "patent_labeled_v2.json"
with open(label_file, 'r', encoding='utf-8') as f:
    label_data = json.load(f)

# Preview only the first record: echoing the whole list floods the cell
# output and bloats the saved notebook.
label_data[:1]

[{'patent_id': 415065,
  'doc_num': 13538195,
  'title': 'Programmable low power multi-modulus divider with 50/50 duty cycle',
  'top_parameters': [{'id': 34,
    'name': 'Ease of operation',
    'score': 0.580027960548128,
    'confidence': 'medium'},
   {'id': 28,
    'name': 'Loss of information',
    'score': 0.5276137167421715,
    'confidence': 'medium'},
   {'id': 44,
    'name': 'Productivity',
    'score': 0.44802544541788014,
    'confidence': 'medium'},
   {'id': 32,
    'name': 'Adaptability or versatility',
    'score': 0.4327422784134627,
    'confidence': 'medium'},
   {'id': 45,
    'name': 'System complexity',
    'score': 0.4189319604667733,
    'confidence': 'medium'}],
  'all_scores': {'1': 0.07015414543972817,
   '2': -0.0003948583497087703,
   '3': 0.18328106572922262,
   '4': 0.12367735773647105,
   '5': 0.10080857290192385,
   '6': 0.03707240339953741,
   '7': 0.19500932229225684,
   '8': 0.135075585473167,
   '9': 0.15963419212022203,
   '10': 0.038148426269082

In [8]:
# Pick 5 distinct positions in label_data to spot-check. The range is sized
# from the list itself (range's upper bound is exclusive), so every sampled
# value is a valid index; a hard-coded bound one past the list length could
# produce an IndexError when used for lookup.
# NOTE(review): no seed is set here, so each run prints different indices.
random_numbers = random.sample(range(len(label_data)), 5)

print(random_numbers)


[2720, 3969, 548, 1637, 4886]


In [46]:
# Tabulate the 'top_parameters' entries of the record at index 4886
# (one of the randomly sampled positions printed earlier).
id_one = pl.DataFrame(label_data[4886]['top_parameters'])
id_one

id,name,score,confidence
i64,str,f64,str
33,"""Compatibility or connectabilit…",0.384119,"""low"""
31,"""Other harmful effects generate…",0.376292,"""low"""
40,"""Other Harmful Effects Acting O…",0.330081,"""low"""
11,"""Amount of information""",0.276608,"""low"""
36,"""Repairability""",0.273707,"""low"""


In [44]:
# Look up the cleaned abstract whose doc_num matches the labelled record at
# index 4886, so the scores can be read next to the text they describe.
id_one_ab = (
    new_df
    .filter(pl.col("doc_num") == label_data[4886]['doc_num'])
    .select('abstract')
)
id_one_ab

abstract
str
"""A paper conveying path extends…"


In [45]:
# Inspect the per-section breakdown stored under 'section_analysis' for the same record.
label_data[4886]['section_analysis']

{'abstract': {'top_parameter': [47, 1.0], 'avg_score': 0.2047665472843473},
 'claims': {'top_parameter': [11, 0.5], 'avg_score': 0.0922772106688503},
 'description': {'top_parameter': [33, 0.5], 'avg_score': 0.10855602844098339}}

In [31]:
# Inspect the full labelled record at sampled index 3969.
label_data[3969]

{'patent_id': 639510,
 'doc_num': 13570376,
 'title': 'Image instance mapping',
 'top_parameters': [{'id': 31,
   'name': 'Other harmful effects generated by system',
   'score': 0.6458242591476462,
   'confidence': 'medium'},
  {'id': 11,
   'name': 'Amount of information',
   'score': 0.6292141401334597,
   'confidence': 'medium'},
  {'id': 45,
   'name': 'System complexity',
   'score': 0.6079375199942538,
   'confidence': 'medium'},
  {'id': 47,
   'name': 'Ability to detect or measure',
   'score': 0.6025968810374552,
   'confidence': 'medium'},
  {'id': 28,
   'name': 'Loss of information',
   'score': 0.5131444453753239,
   'confidence': 'medium'}],
 'all_scores': {'1': 0.17915147497970138,
  '2': 0.14130397484947152,
  '3': -0.02353245119073505,
  '4': 0.029581467261759208,
  '5': 0.2221320144278695,
  '6': 0.1651245573641773,
  '7': 0.14152332172483875,
  '8': 0.16622694721801773,
  '9': -0.039506801329949486,
  '10': 0.11719608246858884,
  '11': 0.6292141401334597,
  '12': 0.

In [13]:
# Inspect the full labelled record at sampled index 548.
label_data[548]

{'patent_id': 679497,
 'doc_num': 13794125,
 'title': 'Spatially coherent nearest neighbor fields',
 'top_parameters': [{'id': 6,
   'name': 'Area of stationary object',
   'score': 0.5170345629917674,
   'confidence': 'medium'},
  {'id': 5,
   'name': 'Area of moving object',
   'score': 0.4267196944405796,
   'confidence': 'medium'},
  {'id': 36,
   'name': 'Repairability',
   'score': 0.3975996272317598,
   'confidence': 'low'},
  {'id': 11,
   'name': 'Amount of information',
   'score': 0.36048691551257217,
   'confidence': 'low'},
  {'id': 4,
   'name': 'Length or angle of stationary object',
   'score': 0.3478426742371781,
   'confidence': 'low'}],
 'all_scores': {'1': 0.10560795107437168,
  '2': 0.15076300291200376,
  '3': 0.19028247262483347,
  '4': 0.3478426742371781,
  '5': 0.4267196944405796,
  '6': 0.5170345629917674,
  '7': 0.05299981771089163,
  '8': 0.127571286659509,
  '9': 0.19258791863888064,
  '10': 0.08997404681645585,
  '11': 0.36048691551257217,
  '12': 0.0478270

In [14]:
# Inspect the full labelled record at sampled index 1637.
label_data[1637]

{'patent_id': 38383,
 'doc_num': 12393670,
 'title': 'Method and apparatus for channel encoding and decoding in a communication system using low-density-parity-check codes',
 'top_parameters': [{'id': 34,
   'name': 'Ease of operation',
   'score': 0.3603312128674281,
   'confidence': 'low'},
  {'id': 11,
   'name': 'Amount of information',
   'score': 0.3434122683992514,
   'confidence': 'low'},
  {'id': 31,
   'name': 'Other harmful effects generated by system',
   'score': 0.32144715714077654,
   'confidence': 'low'},
  {'id': 33,
   'name': 'Compatibility or connectability',
   'score': 0.31491728478459435,
   'confidence': 'low'},
  {'id': 45,
   'name': 'System complexity',
   'score': 0.2688617173690831,
   'confidence': 'low'}],
 'all_scores': {'1': 0.12643601761565618,
  '2': 0.11834572102741615,
  '3': 0.14482907180130744,
  '4': 0.09570725704657518,
  '5': 0.1686797000543188,
  '6': 0.08599655008947041,
  '7': 0.041762605840022324,
  '8': 0.0016598183159677662,
  '9': 0.0622

In [53]:
# Reduce every labelled record to its document number, title and the complete
# parameter-id -> score mapping.
rows = [
    {
        'patent_num': record['doc_num'],
        'title': record['title'],
        'params_score': record['all_scores'],
    }
    for record in label_data
]
# Preview the first ten reduced records.
rows[:10]

[{'patent_num': 13538195,
  'title': 'Programmable low power multi-modulus divider with 50/50 duty cycle',
  'params_score': {'1': 0.07015414543972817,
   '2': -0.0003948583497087703,
   '3': 0.18328106572922262,
   '4': 0.12367735773647105,
   '5': 0.10080857290192385,
   '6': 0.03707240339953741,
   '7': 0.19500932229225684,
   '8': 0.135075585473167,
   '9': 0.15963419212022203,
   '10': 0.03814842626908242,
   '11': 0.19107505717537823,
   '12': 0.12658206633449848,
   '13': 0.08854123918045446,
   '14': 0.15662854028479192,
   '15': -0.017165701065129883,
   '16': 0.30399655898576433,
   '17': 0.22993781328418578,
   '18': 0.3283402693745783,
   '19': 0.08281319926159031,
   '20': 0.07477030309868912,
   '21': -0.025335893231735792,
   '22': 0.06326991620452428,
   '23': 0.06834739468250095,
   '24': 0.31091315548650783,
   '25': -0.03733236533534577,
   '26': 0.18400498430049858,
   '27': 0.035581666908652665,
   '28': 0.5276137167421715,
   '29': 0.1716856113969889,
   '30': 0.0

In [55]:
# Turn the reduced records (patent_num, title, params_score) into a polars DataFrame.
label_df = pl.DataFrame(rows)

In [57]:
# Persist the score table as JSON; the relative path resolves against the working directory.
label_df.write_json('patent_score_table.json')