In [None]:
!pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.28.0


In [None]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [None]:
import ast
import json
import os
import re
import time

import openai
import pandas as pd
from fuzzywuzzy import fuzz
# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# written into the notebook; the literal placeholder remains the fallback value.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY")



## Processing Evaluation Dataset -- Cora Ref
Cora Ref consists of 500 citation strings from computer science conferences and journals.

In [None]:
def extract_text_from_tags(citation_string):
    """Collect the first occurrence of each known tagged field from one citation line.

    Args:
        citation_string: Text containing XML-like field tags such as
            ``<author>...</author>``.

    Returns:
        A dict with the keys 'author', 'title', 'editor', 'venue', 'volume' and
        'date' (in that order). Each value is the raw text between the first
        matching tag pair, or None when the tag does not occur. 'venue' is read
        from either a <journal> or a <booktitle> element.
    """
    field_regexes = (
        ('author', r'<author>(.*?)<\/author>'),
        ('title', r'<title>(.*?)<\/title>'),
        ('editor', r'<editor>(.*?)<\/editor>'),
        ('venue', r'<(?:journal|booktitle)>(.*?)<\/(?:journal|booktitle)>'),
        ('volume', r'<volume>(.*?)<\/volume>'),
        ('date', r'<date>(.*?)<\/date>'),
    )

    fields = {}
    for field_name, regex in field_regexes:
        # Only the first match is kept; later repetitions of a tag are ignored.
        hit = re.search(regex, citation_string)
        fields[field_name] = None if hit is None else hit.group(1)
    return fields

def main(input_path='/content/tagged_references.txt', output_path='CORA-for-eval.csv'):
    """Turn the tagged reference file into the ground-truth evaluation CSV.

    Each line of the input file is parsed with ``extract_text_from_tags`` and the
    same line with every ``<tag>`` / ``</tag>`` marker removed is stored under
    'CitationString'. The rows are written to ``output_path`` without an index.

    Args:
        input_path: Tagged reference file, one citation per line.
        output_path: Destination CSV file.
    """
    # Explicit encoding so the result does not depend on the platform default;
    # the CSV produced here is read back as UTF-8 later in the notebook.
    with open(input_path, 'r', encoding='utf-8') as file:
        citation_strings = file.read().splitlines()

    data = []
    for citation_string in citation_strings:
        extracted_data = extract_text_from_tags(citation_string)

        # Plain citation text: drop every opening and closing tag marker.
        citation_only = re.sub(r'<\/?[\w]+>', '', citation_string)
        extracted_data['CitationString'] = citation_only.strip()

        data.append(extracted_data)

    # extract_text_from_tags already folds <journal>/<booktitle> into 'venue', so
    # the frame never has separate 'journal'/'booktitle' columns to merge.
    df = pd.DataFrame(data)
    df.to_csv(output_path, index=False)

if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import re

# Read the tagged citation strings, one list item per line. The encoding is given
# explicitly so the result does not depend on the platform default.
with open("/content/tagged_references.txt", "r", encoding="utf-8") as file:
    citation_strings = file.readlines()

# Function to extract text between tags
def extract_text(tag, string):
    """Return the stripped text of the first ``<tag>...</tag>`` pair in ``string``.

    The match may span line breaks. An empty string is returned when the tag pair
    does not occur.
    """
    found = re.search("<" + tag + ">(.*?)</" + tag + ">", string, flags=re.DOTALL)
    if found is None:
        return ""
    return found.group(1).strip()

# Build one record per citation line, then write the table to disk.
citation_data = []
for raw_line in citation_strings:
    fields = {
        tag: extract_text(tag, raw_line)
        for tag in (
            "author", "title", "editor", "booktitle", "journal", "volume",
            "date", "publisher", "tech", "institution", "location",
        )
    }

    # The venue is the first non-empty value among journal, booktitle,
    # publisher and institution (in that priority order).
    venue = (
        fields["journal"]
        or fields["booktitle"]
        or fields["publisher"]
        or fields["institution"]
    )

    record = {"CitationString": raw_line.strip()}
    for key in ("author", "title", "editor"):
        record[key] = fields[key]
    record["venue"] = venue
    for key in ("volume", "date", "tech", "location"):
        record[key] = fields[key]
    citation_data.append(record)

# Assemble the records into a table and persist it.
citation_df = pd.DataFrame(citation_data)
citation_df.to_csv("citation_venue_mapped.csv", index=False)

print("Dataframe created and saved successfully!")



Dataframe created and saved successfully!


In [None]:
dataframe = pd.read_csv('CORA-for-eval.csv')
dataframe.head()

Unnamed: 0,author,title,editor,venue,volume,date,CitationString
0,"A. Cau, R. Kuiper, and W.-P. de Roever.",Formalising Dijkstra's development strategy w...,"In C. B. Jones, R. C. Shaw, and T. Denvir, ed...","Proc. 5th. BCS-FACS Refinement Workshop,",,1992.,"A. Cau, R. Kuiper, and W.-P. de Roever. Form..."
1,"M. Kitsuregawa, H. Tanaka, and T. Moto-oka.",Application of hash to data base machine and ...,,"New Generation Computing,","1(1),",1983.,"M. Kitsuregawa, H. Tanaka, and T. Moto-oka. ..."
2,Alexander Vrchoticky.,Modula/R language definition.,,,,May 1993.,Alexander Vrchoticky. Modula/R language defi...
3,Marc Shapiro and Susan Horwitz.,Fast and accurate flow-insensitive points-to ...,,In Proceedings of the 24th Annual ACM Symposi...,,January 1997.,Marc Shapiro and Susan Horwitz. Fast and acc...
4,W. Landi and B. G. Ryder.,Aliasing with and without pointers: A problem...,,,,September 1990.,W. Landi and B. G. Ryder. Aliasing with and ...


## Prompt Engineering using Few-Shot Learning:

We need:
a) context (i.e., the particular task (e.g., Citation Parsing AI assistant) we want to achieve)
b) examples (e.g., provide a variety of citation string examples)
c) task (e.g. provide any test examples)
d) output

In [None]:
# System message: frames the model as a citation-parsing / metadata-extraction system.
SYS_PROMPT = """You are a smart and intelligent Citation Parsing system (i.e., very similar to Named Entity Recognition (NER) tasks or information extraction or metadata extraction or knowledge extraction).
I will provide you the definition of the metadata fields from which you need to extract the metadata fields from a given citation string in different styles (e.g., IEEE, ACM, APA, MLA, Chicago, etc.).
I will also provide you with the output in given format with examples."""

# First user turn of the scripted warm-up exchange.
USER_PROMPT_1 = "Are you clear about your role?"

# Scripted assistant reply to the warm-up turn; sent as an 'assistant' message.
ASSIST_PROMPT_1 = "Sure, I am ready to help you with your citation parsing task where the citation strings may have adopted different citation styles (e.g., IEEE, ACM, APA, MLA, Chicago, etc.)"

# Few-shot user prompt: field definitions, the expected output format, five worked
# examples, and a sixth slot for the citation to parse.
# The literal '{}' on the "6. Citation String" line is the placeholder that the
# prediction loop fills with str.replace('{}', citation). str.format cannot be
# used on this text because the example outputs contain other braces.
PROMPT = (
    "Metadata Fields Definition\n"
    "1. title: The title of the paper/article.\n"
    "2. author: List of authors involved in the publication.\n"
    "3. venue: Information about the publication venue. Depending on the source, this could be a journal name, conference proceedings, book title, thesis, technical report, etc.\n"
    "4. date: The year, month, and day of publication.\n"
    "\n"
    "Output Format:\n"
    "{'title': [list of metadata fields present], 'author': [list of metadata fields present], 'venue': [list of metadata fields present], 'date': [list of metadata fields present]}\n"
    "If no metadata fields are presented in any categories, keep it None\n"
    "\n"
    "Examples:\n"
    "\n"
    "1. Citation String: K. A. Bowman and J. D. Meindl. 2001. Impact of within-die parameter fluctuations on future maximum clock frequency distributions, Proceedings of the IEEE Custom Integrated  Circuits Conference, 229-232.\n"
    "Output: {'title': ['Impact of within-die parameter fluctuations on future maximum clock frequency distributions'], 'author': ['K. A. Bowman', 'J. D. Meindl'], 'venue': ['Proceedings of the IEEE Custom Integrated  Circuits Conference'], 'date': ['2001']}\n"
    "\n"
    "2. Citation String: Elghadamsi, F. E., and Mohraz, B., “Inelastic Earthquake Spectra,” Earthquake Engineering and Structural Dynamics. Vol. 15, 1987.\n"
    "Output: {'title': ['Inelastic Earthquake Spectra'], 'author': ['Elghadamsi, F. E.', 'Mohraz, B.'], 'venue': ['Earthquake Engineering and Structural Dynamics'], 'date': ['1987']}\n"
    "\n"
    "3. Citation String: Mataric. M. J. (1990). A distributed model for mobile robot environment-learning and navigation. Master's thesis. MIT. Cambridge. MA.\n"
    "Output: {'title': ['A distributed model for mobile robot environment-learning and navigation'], 'author': ['Mataric. M. J.'], 'venue': ['MIT'], 'date': ['1990']}\n"
    "\n"
    "4. Citation String: Mettala. E. and Graham. M. (1992). The domain-specific software architecture program. Technical Report CMU/SEI-92-TR-22 ESD-92-TR-223. Carnegie Mellon Software Engineering Institute.\n"
    "Output: {'title': ['The domain-specific software architecture program'], 'author': ['Mettala. E.', 'Graham. M.'], 'venue': ['Carnegie Mellon Software Engineering Institute'], 'date': ['1992']}\n"
    "\n"
    # NOTE(review): this example keeps both authors in a single list item, whereas
    # examples 1, 2 and 4 give one item per author — confirm which form is intended.
    "5. Citation String: W. Landi and B. G. Ryder. Aliasing with and without pointers: A problem taxonomy. Center for Computer Aids for Industrial Productivity Technical Report CAIP-TR-125, Rutgers University, September 1990.\n"
    "Output: {'title': ['Aliasing with and without pointers: A problem taxonomy'], 'author': ['W. Landi and B. G. Ryder'], 'venue': ['Rutgers University'], 'date': ['September 1990']}\n"
    # NOTE(review): unlike the earlier examples there is no blank line before item 6,
    # and the prompt ends with an already filled-in all-None "Output:" line instead
    # of an open one — confirm the model is meant to overwrite that template.
    "6. Citation String: {}\n"
    "Output: {'title': None, 'author': None, 'venue': None, 'date': None}"
)


In [None]:
def openai_chat_completion_response(final_prompt, max_retries=5, initial_backoff=1.0):
  """Send the scripted conversation plus ``final_prompt`` to gpt-3.5-turbo.

  A rate-limit rejection is retried with exponential backoff instead of aborting
  the whole batch on the first rejected request.

  Args:
      final_prompt: Content of the final user message.
      max_retries: Number of retries after a rate-limit error before giving up.
      initial_backoff: Seconds to wait before the first retry; doubled each time.

  Returns:
      The text of the first choice, with leading and trailing newlines removed.

  Raises:
      openai.error.RateLimitError: If the request is still rate limited after
          ``max_retries`` retries.
  """
  messages = [
      {"role": "system", "content": SYS_PROMPT},
      {"role": "user", "content": USER_PROMPT_1},
      {"role": "assistant", "content": ASSIST_PROMPT_1},
      {"role": "user", "content": final_prompt}
  ]

  delay = initial_backoff
  retries_used = 0
  while True:
    try:
      response = openai.ChatCompletion.create(
          model = "gpt-3.5-turbo",
          messages = messages
      )
      break
    except openai.error.RateLimitError:
      retries_used += 1
      if retries_used > max_retries:
        raise
      time.sleep(delay)
      delay *= 2

  return response['choices'][0]['message']['content'].strip("\n")

In [None]:
# Load the evaluation set and gather one raw model reply per citation string.
citedata = pd.read_csv('/content/CORA-for-eval.csv', encoding='utf-8')
result = []

for citation in citedata['CitationString']:
    # PROMPT carries a literal '{}' marker where the citation text belongs.
    filled_prompt = PROMPT.replace('{}', citation)

    # Replies are appended one at a time, so `result` keeps everything received
    # so far if a later request raises.
    result.append(openai_chat_completion_response(filled_prompt))

RateLimitError: Rate limit reached for gpt-3.5-turbo in organization org-mnEsSk4cHZq32Vi9kBfq8iyf on tokens per min (TPM): Limit 60000, Used 59216, Requested 873. Please try again in 89ms. Visit https://platform.openai.com/account/rate-limits to learn more.

In [None]:
# Store the raw model replies as a single 'metadata' column and write them to CSV.
dataframe = pd.DataFrame(result, columns = ['metadata'])
dataframe.to_csv("predicted-gpt-cora.csv", index = False)

### Parsing Predicted Data (JSON output of GPT model)

In [None]:
import pandas as pd
import json

# Load the CSV file
df = pd.read_csv('/content/predicted-gpt-cora.csv')

# Function to fix JSON data
def fix_json(json_str):
    """Parse one raw model reply into a dict of metadata values.

    The replies follow the prompt's Python-dict style (single quotes, ``None``),
    so the text is parsed as a Python literal first and as JSON second. Blanket
    quote swapping is avoided because it corrupts apostrophes inside values.

    Args:
        json_str: Raw reply text. Non-string input (e.g. NaN from an empty CSV
            cell) is treated as unparseable.

    Returns:
        The parsed dict, with a ``None`` value replaced by ``["None"]`` and
        ``None`` items inside a list replaced by ``"None"``. An empty dict is
        returned when no dict can be recovered.
    """
    import ast  # local import keeps this cell runnable on its own

    if not isinstance(json_str, str):
        return {}

    # Keep only the outermost {...} span so text around the dict is ignored.
    start = json_str.find('{')
    end = json_str.rfind('}')
    if start == -1 or end < start:
        return {}
    candidate = json_str[start:end + 1]

    parsed = None
    for parser in (ast.literal_eval, json.loads):
        try:
            parsed = parser(candidate)
            break
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # json.JSONDecodeError is a ValueError subclass; try the next parser.
            continue

    if not isinstance(parsed, dict):
        return {}

    def _fill_missing(value):
        # Missing values are represented by the string "None".
        if value is None:
            return ["None"]
        if isinstance(value, (list, tuple)):
            return ["None" if item is None else item for item in value]
        return value

    return {key: _fill_missing(value) for key, value in parsed.items()}

# Parse every raw reply in the 'metadata' column into a dict.
parsed_metadata = df['metadata'].apply(fix_json)
df['metadata'] = parsed_metadata

# Spread the dict keys into columns; fields missing from a reply become ''.
normalized_data = pd.json_normalize(parsed_metadata)
normalized_data = normalized_data.fillna('')

# Write the flattened predictions to disk.
normalized_data.to_csv('predicted_result.csv', index=False)

### Evaluation

In [None]:
# NOTE(review): this loads 'predicted_result-updated.csv', while the parsing step
# writes 'predicted_result.csv'. No visible cell creates the '-updated' file, so it
# presumably comes from a manual edit — confirm and document where it comes from.
eval_data = pd.read_csv('/content/predicted_result-updated.csv')
eval_data

Unnamed: 0,title,author,venue,date
0,"[""Formalising Dijkstra's development strategy ...","['A. Cau', 'R. Kuiper', 'W.-P. de Roever']",['Proc. 5th. BCS-FACS Refinement Workshop'],['1992']
1,['Application of hash to data base machine and...,"['M. Kitsuregawa', 'H. Tanaka', 'T. Moto-oka']",['New Generation Computing'],['1983']
2,['Modula/R language definition'],['Alexander Vrchoticky'],"['Technical Report TU Wien rr-02-92, version 2...",['May 1993']
3,['Fast and accurate flow-insensitive points-to...,"['Marc Shapiro', 'Susan Horwitz']",['Proceedings of the 24th Annual ACM Symposium...,['January 1997']
4,['Aliasing with and without pointers: A proble...,"['W. Landi', 'B. G. Ryder']",['Center for Computer Aids for Industrial Prod...,['September 1990']
...,...,...,...,...
495,['Semantic parallels in natural language and c...,"['van Benthem, J. F. A. K.']","['Logic Colloquium'87, Granada']",['1989']
496,['Bias-driven revision of logical domain theor...,"['Koppel, M.', 'Feldman, R.', 'Segre, A. M.']",['Journal of Artificial Intelligence Research'],['1994b']
497,['Learning decision lists by prepending inferr...,"['Webb, G. I.', 'Brkic, N.']",['Proceedings of the Australian Workshop on Ma...,['1993']
498,['Integrating Reflection into SLD-Resolution'],"['Barklund, J.', 'Costantini, S.', ""Dell'Acqua...",['Proc. Post-Conf. Ws. on Proof-Theoretical Ex...,['1994']


## Title Field Evaluation

In [None]:
# Normalise each predicted title: trim whitespace, peel the list/quote characters
# off both ends, then lower-case.
pred_title = [
  raw.strip().strip("['").strip("']").strip('"').lower()
  for raw in eval_data['title']
]

pred_data_title = pd.DataFrame(pred_title, columns = ['pred-title'])

In [None]:
pred_data_title

Unnamed: 0,pred-title
0,formalising dijkstra's development strategy wi...
1,application of hash to data base machine and i...
2,modula/r language definition
3,fast and accurate flow-insensitive points-to a...
4,aliasing with and without pointers: a problem ...
...,...
495,semantic parallels in natural language and com...
496,bias-driven revision of logical domain theories
497,learning decision lists by prepending inferred...
498,integrating reflection into sld-resolution


In [None]:
test_data = pd.read_csv('CORA-for-eval.csv')
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form can act on a temporary copy and
# leave test_data unchanged under pandas Copy-on-Write.
test_data['title'] = test_data['title'].fillna('None')

In [None]:
# Normalise each ground-truth title: trim whitespace, strip the listed punctuation
# characters from both ends, then lower-case.
characters_to_remove = ['.', ',', '"', "''", '``']
title_edge_chars = ''.join(characters_to_remove)
test_title = [
  gold.strip().strip(title_edge_chars).lower()
  for gold in test_data['title']
]

test_data_title = pd.DataFrame(test_title, columns = ['test-title'])

In [None]:
test_data_title

Unnamed: 0,test-title
0,formalising dijkstra's development strategy wi...
1,application of hash to data base machine and i...
2,modula/r language definition
3,fast and accurate flow-insensitive points-to a...
4,aliasing with and without pointers: a problem ...
...,...
495,semantic parallels in natural language and com...
496,bias-driven revision of logical domain theories
497,learning decision lists by prepending inferred...
498,integrating reflection into sld-resolution


In [None]:
# Row-wise exact comparison of predicted and ground-truth titles (1 = equal).
# astype(int) converts the boolean result directly, without relying on dtype
# inference inside replace([True, False], [1, 0]).
matched = (
    pred_data_title['pred-title']
    .eq(test_data_title['test-title'])
    .astype(int)
    .to_frame('title_match')
)
result = pd.concat([pred_data_title, test_data_title, matched], axis = 1, sort = False)
result.to_csv("title_output.csv", index=False)

In [None]:
print(matched.value_counts())

title_match
1              480
0               20
Name: count, dtype: int64


## Fuzzy Matching For Title Field

In [None]:
from fuzzywuzzy import fuzz
import pandas as pd


# Function to perform fuzzy string matching
def fuzzy_match(str1, str2):
    """Return the fuzzywuzzy token-sort similarity of two strings, ignoring case."""
    return fuzz.token_sort_ratio(str1.lower(), str2.lower())

# Similarity score at or above which a predicted title counts as correct.
TITLE_FUZZY_THRESHOLD = 90

# Put predictions and ground truth side by side so they are compared row by row.
result = pd.concat([pred_data_title, test_data_title], axis=1)

# Compare each prediction with the ground truth of the SAME citation. Searching
# every ground-truth row for any match would credit a prediction that resembles
# a different citation's title and so overstate accuracy.
matched = []
for predicted, expected in zip(result['pred-title'].fillna(''), result['test-title'].fillna('')):
    is_match = (
        predicted.lower() == expected.lower()
        or fuzzy_match(predicted, expected) >= TITLE_FUZZY_THRESHOLD
    )
    matched.append(int(is_match))

# Print number of matches found
print("Number of matches:", sum(matched))

# Add matching result to dataframe
result['title_match'] = matched

# Save result to CSV
result.to_csv("title_output-fuzzy.csv", index=False)

Number of matches: 487


#### Considering the ground-truth (GT) discrepancies (i.e., 5 wrong annotations), the total should be 492, giving an F1 score of 98.4% (NeuralParsCit -- 97.45% & TransParsCit -- 90.7%). In these CORA-Ref entries, the title field is annotated as booktitle.

## Date Field Evaluation

In [None]:
# Normalise each predicted date: trim whitespace, peel the list/quote characters
# off both ends, then lower-case.
pred_date = [
  raw.strip().strip("['").strip("']").strip("'").lower()
  for raw in eval_data['date']
]

pred_date = pd.DataFrame(pred_date, columns = ['pred-date'])

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form can act on a temporary copy and
# leave test_data unchanged under pandas Copy-on-Write.
test_data['date'] = test_data['date'].fillna('None')

# Normalise each ground-truth date: trim whitespace, strip the listed punctuation
# characters from both ends, then lower-case.
characters_to_remove = ['.', ',', '(', ')', '[', ']', ':']
date_edge_chars = ''.join(characters_to_remove)
test_date = [
  gold.strip().strip(date_edge_chars).lower()
  for gold in test_data['date']
]

test_date = pd.DataFrame(test_date, columns = ['test-date'])

In [None]:
# Row-wise exact comparison of predicted and ground-truth dates (1 = equal).
# astype(int) converts the boolean result directly, without relying on dtype
# inference inside replace([True, False], [1, 0]).
matched_date = (
    pred_date['pred-date']
    .eq(test_date['test-date'])
    .astype(int)
    .to_frame('date_match')
)
result = pd.concat([pred_date, test_date, matched_date], axis = 1, sort = False)
result.to_csv("date_output.csv", index=False)

In [None]:
print(matched_date.value_counts())

date_match
1             485
0              15
Name: count, dtype: int64


### With fuzzy matching, the total should be 494, giving an F1 score of 98.80% (NeuralParsCit -- 98.93% & TransParsCit -- 87.7%)

## Author Field Evaluation

In [None]:
# Normalise each predicted author list: trim whitespace, peel the list/quote
# characters off both ends, drop the remaining single quotes, remove trailing
# periods, then lower-case.
characters_to_remove = ['.']
pred_author = [
  raw.strip()
     .strip("['")
     .strip("']")
     .replace("'", "")
     .rstrip(''.join(characters_to_remove))
     .lower()
  for raw in eval_data['author']
]

pred_author_data = pd.DataFrame(pred_author, columns = ['pred-author'])

In [None]:
pred_author_data

In [None]:
# Sanity check on the value types: report one count per type instead of printing
# a line for every row.
print(pred_author_data['pred-author'].map(type).value_counts())

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form can act on a temporary copy and
# leave test_data unchanged under pandas Copy-on-Write.
test_data['author'] = test_data['author'].fillna('None')

# Normalise each ground-truth author string: trim whitespace, remove trailing
# periods, then lower-case.
characters_to_remove = ['.']
author_trailing_chars = ''.join(characters_to_remove)
test_author = [
  gold.strip().rstrip(author_trailing_chars).lower()
  for gold in test_data['author']
]

test_author_data = pd.DataFrame(test_author, columns = ['test-author'])

In [None]:
# Sanity check on the value types: report one count per type instead of printing
# a line for every row.
print(test_author_data['test-author'].map(type).value_counts())

In [None]:
# Row-wise exact comparison of predicted and ground-truth authors (1 = equal).
# astype(int) converts the boolean result directly, without relying on dtype
# inference inside replace([True, False], [1, 0]).
matched_author = (
    pred_author_data['pred-author']
    .eq(test_author_data['test-author'])
    .astype(int)
    .to_frame('author_match')
)
result = pd.concat([pred_author_data, test_author_data, matched_author], axis = 1, sort = False)
result.to_csv("author_output.csv", index=False)

In [None]:
print(matched_author.value_counts())

author_match
0               369
1               131
Name: count, dtype: int64


## Fuzzy Matching For Author Field

In [None]:
from fuzzywuzzy import fuzz
import pandas as pd


# Function to perform fuzzy string matching
def fuzzy_match(str1, str2):
    """Return the fuzzywuzzy token-sort similarity of two strings, ignoring case."""
    return fuzz.token_sort_ratio(str1.lower(), str2.lower())

# Similarity score at or above which a predicted author string counts as correct.
AUTHOR_FUZZY_THRESHOLD = 82

# Put predictions and ground truth side by side so they are compared row by row.
result = pd.concat([pred_author_data, test_author_data], axis=1)

# Compare each prediction with the ground truth of the SAME citation. Searching
# every ground-truth row for any match would credit a prediction that resembles
# a different citation's authors and so overstate accuracy.
matched = []
for predicted, expected in zip(result['pred-author'].fillna(''), result['test-author'].fillna('')):
    is_match = (
        predicted.lower() == expected.lower()
        or fuzzy_match(predicted, expected) >= AUTHOR_FUZZY_THRESHOLD
    )
    matched.append(int(is_match))

# Print number of matches found
print("Number of matches:", sum(matched))

# Add matching result to dataframe
result['author_match'] = matched

# Save result to CSV
result.to_csv("author_output-fuzzy.csv", index=False)

Number of matches: 488


#### Considering the ground-truth (GT) discrepancies (author names annotated as editors, i.e., 9 wrong annotations), the total should be 497, giving an F1 score of 99.4% (NeuralParsCit -- 99.1% & TransParsCit -- 98.8%)

## Venue Field Evaluation

In [None]:
# Normalise each predicted venue: trim whitespace, peel the list/quote characters
# and periods off both ends, then lower-case.
pred_venue = [
  raw.strip().strip("['").strip("']").strip(".").lower()
  for raw in eval_data['venue']
]

pred_data_venue = pd.DataFrame(pred_venue, columns = ['pred-venue'])

In [None]:
pred_data_venue

Unnamed: 0,pred-venue
0,proc. 5th. bcs-facs refinement workshop
1,new generation computing
2,"technical report tu wien rr-02-92, version 2.0..."
3,proceedings of the 24th annual acm symposium o...
4,center for computer aids for industrial produc...
...,...
495,"logic colloquium'87, granada"
496,journal of artificial intelligence research
497,proceedings of the australian workshop on mach...
498,proc. post-conf. ws. on proof-theoretical exte...


In [None]:
venue_data = pd.read_csv("/content/citation_venue_mapped.csv")

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form can act on a temporary copy and
# leave venue_data unchanged under pandas Copy-on-Write.
venue_data['venue'] = venue_data['venue'].fillna('None')

# Normalise each ground-truth venue: trim whitespace, remove trailing commas and
# periods, then lower-case.
characters_to_remove = [',', '.']
venue_trailing_chars = ''.join(characters_to_remove)
test_venue = [
  gold.strip().rstrip(venue_trailing_chars).lower()
  for gold in venue_data['venue']
]

test_data_venue = pd.DataFrame(test_venue, columns = ['test-venue'])

In [None]:
test_data_venue

Unnamed: 0,test-venue
0,proc. 5th. bcs-facs refinement workshop
1,new generation computing
2,"dept. for real-time systems, technical univers..."
3,in proceedings of the 24th annual acm symposiu...
4,center for computer aids for industrial produc...
...,...
495,"logic colloquium'87, granada"
496,journal of artificial intelligence research
497,in proceedings of the australian workshop on m...
498,proc. post-conf. ws. on proof-theoretical exte...


In [None]:
# Row-wise exact comparison of predicted and ground-truth venues (1 = equal).
# astype(int) converts the boolean result directly, without relying on dtype
# inference inside replace([True, False], [1, 0]).
matched_venue = (
    pred_data_venue['pred-venue']
    .eq(test_data_venue['test-venue'])
    .astype(int)
    .to_frame('venue_match')
)
result = pd.concat([pred_data_venue, test_data_venue, matched_venue], axis = 1, sort = False)
result.to_csv("venue_output.csv", index=False)

In [None]:
print(matched_venue.value_counts())

venue_match
1              285
0              215
Name: count, dtype: int64


## Fuzzy Matching For Venue Field

In [None]:
from fuzzywuzzy import fuzz
import pandas as pd


# Function to perform fuzzy string matching
def fuzzy_match(str1, str2):
    """Return the fuzzywuzzy token-sort similarity of two strings, ignoring case."""
    return fuzz.token_sort_ratio(str1.lower(), str2.lower())

# Similarity score at or above which a predicted venue counts as correct.
VENUE_FUZZY_THRESHOLD = 70

# Put predictions and ground truth side by side so they are compared row by row.
result = pd.concat([pred_data_venue, test_data_venue], axis=1)

# Compare each prediction with the ground truth of the SAME citation. Searching
# every ground-truth row for any match would credit a prediction that resembles
# a different citation's venue and so overstate accuracy.
matched = []
for predicted, expected in zip(result['pred-venue'].fillna(''), result['test-venue'].fillna('')):
    is_match = (
        predicted.lower() == expected.lower()
        or fuzzy_match(predicted, expected) >= VENUE_FUZZY_THRESHOLD
    )
    matched.append(int(is_match))

# Print number of matches found
print("Number of matches:", sum(matched))

# Add matching result to dataframe
result['venue_match'] = matched

# Save result to CSV
result.to_csv("venue_output-fuzzy.csv", index=False)

Number of matches: 463


#### We observed that more strings did not match; further string normalization is required, such as removing leading or trailing spaces, periods, semicolons, colons, etc. We also observed many TN cases, for example pre-prints and theses. If the evaluation is repeated with these issues resolved, 484 predictions should be counted as correct, giving a micro F1 score of 96.8% (NeuralParsCit -- micro F1 91.91%)