In [45]:
def count_and_print_references(file_path):
    """Count and print the reference lines that follow a references heading.

    The file is parsed as XML. The first ``<H1>`` element whose text content
    equals one of the known heading variants ("References", "REFERENCES",
    "Bibliography", "Reference", "10. References" -- tried in that order) is
    located, and the text that follows the heading (the element's tail) is
    scanned line by line. Blank lines directly after the heading are skipped;
    every following non-blank line is printed with a 1-based index and
    counted, and counting stops at the next blank line.

    Args:
        file_path: Path of the UTF-8 encoded XML file to inspect.

    Returns:
        The number of consecutive non-blank lines after the heading (0 if the
        heading is followed by no text), or ``None`` if no heading variant is
        present in the document.

    Raises:
        OSError: If the file cannot be opened.
        xml.etree.ElementTree.ParseError: If the content is not well-formed XML.
    """
    import xml.etree.ElementTree as ET

    # Read the content of the file
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Parse the content as XML
    root = ET.fromstring(content)

    # Heading variants, tried in priority order; the first match wins.
    heading_queries = (
        ".//H1[.='References']",
        ".//H1[.='REFERENCES']",
        ".//H1[.='Bibliography']",
        ".//H1[.='Reference']",
        ".//H1[.='10. References']",
    )
    references_tag = None
    for query in heading_queries:
        references_tag = root.find(query)
        if references_tag is not None:
            break

    if references_tag is None:
        # Report the miss once, only after every variant has been tried.
        print("Tag with content 'References' not found.")
        return None

    # The predicate matches on the full text content, so the heading text may
    # live in a child element; itertext() covers that case where .text is None.
    heading_text = "".join(references_tag.itertext())
    print(f"Found tag: {references_tag.tag} with content: {heading_text}\n")

    # The reference list is the text after the closing </H1>, i.e. the tail.
    tail_lines = (references_tag.tail or "").splitlines()

    # Skip however many blank lines separate the heading from the first entry,
    # instead of assuming there is exactly one.
    first_entry = next(
        (pos for pos, text in enumerate(tail_lines) if text.strip()),
        len(tail_lines),
    )

    reference_count = 0
    for idx, line in enumerate(tail_lines[first_entry:], start=1):
        if not line.strip():  # Stop when an empty line occurs
            break
        reference_count = idx
        print(f"{idx}: {line}")
    return reference_count

In [46]:
# Path to a single corpus file (A01). Note: the loop in the next cell assigns
# file_path again for every file it processes, so this value does not persist.
file_path = "../../data_dr_inventor/compiled_corpus/A01.txt"

In [47]:
# Run the reference counter over corpus files A01..A40 and store each result
# under its file number.
counts = {}

for i in range(1, 41):
    print(f"{i:02}")  # progress marker: zero-padded file number
    file_path = f"../../data_dr_inventor/compiled_corpus/A{i:02}.txt"
    counts[i] = reference_count = count_and_print_references(file_path)

01
Found tag: H1 with content: References

1:           [1] J. P. Lewis, Matt Cordner, and Nickson Fong. Pose space deformation: a unified approach to shape interpolation and skeleton-driven deformation. In SIGGRAPH 2000: Proceedings of the 27th annual conference on Computer graphics and interactive techniques, pages 165– 172, New York, NY, USA, 2000. ACM Press/Addison-Wesley Publishing Co.
2:           [2] Peter-Pike J. Sloan, III Charles F. Rose, and Michael F. Cohen. Shape by example. In SI3D 2001: Proceedings of the 2001 symposium on Interactive 3D graphics, pages 135–143, New York, NY, USA, 2001. ACM Press.
3:           [3] Paul G. Kry, Doug L. James, and Dinesh K. Pai. Eigenskin: real time large deformation character skinning in hardware. In SCA 2002: Proceedings of the 2002 ACM SIGGRAPH/Eurographics symposium on Computer animation, pages 153–159, New York, NY, USA, 2002. ACM Press.
4:           [4] Tsuneya Kurihara and Natsuki Miyata. Modeling deformable human hands from medical

In [54]:
# Show the collected results: file number -> value returned by
# count_and_print_references for that file.
counts

{1: 12,
 2: 20,
 3: 32,
 4: 31,
 5: 19,
 6: 42,
 7: 18,
 8: 38,
 9: 39,
 10: 31,
 11: 35,
 12: 35,
 13: 46,
 14: 25,
 15: 37,
 16: 21,
 17: 31,
 18: 22,
 19: 56,
 20: 36,
 21: 32,
 22: 59,
 23: 19,
 24: 20,
 25: 40,
 26: 41,
 27: 30,
 28: 27,
 29: 44,
 30: 47,
 31: 0,
 32: 14,
 33: 0,
 34: 16,
 35: 12,
 36: 32,
 37: 29,
 38: 40,
 39: 21,
 40: 31}

In [55]:
# Reorder the entries from the smallest to the largest count; the sort is
# stable, so files with equal counts keep their existing relative order.
sorted_counts = {file_no: counts[file_no] for file_no in sorted(counts, key=counts.get)}
sorted_counts

{31: 0,
 33: 0,
 1: 12,
 35: 12,
 32: 14,
 34: 16,
 7: 18,
 5: 19,
 23: 19,
 2: 20,
 24: 20,
 16: 21,
 39: 21,
 18: 22,
 14: 25,
 28: 27,
 37: 29,
 27: 30,
 4: 31,
 10: 31,
 17: 31,
 40: 31,
 3: 32,
 21: 32,
 36: 32,
 11: 35,
 12: 35,
 20: 36,
 15: 37,
 8: 38,
 9: 39,
 25: 40,
 38: 40,
 26: 41,
 6: 42,
 29: 44,
 13: 46,
 30: 47,
 19: 56,
 22: 59}

In [56]:
# Drop every entry whose count is 0, keeping the remaining entries in order
filtered_counts = dict(filter(lambda entry: entry[1] != 0, sorted_counts.items()))
filtered_counts

{1: 12,
 35: 12,
 32: 14,
 34: 16,
 7: 18,
 5: 19,
 23: 19,
 2: 20,
 24: 20,
 16: 21,
 39: 21,
 18: 22,
 14: 25,
 28: 27,
 37: 29,
 27: 30,
 4: 31,
 10: 31,
 17: 31,
 40: 31,
 3: 32,
 21: 32,
 36: 32,
 11: 35,
 12: 35,
 20: 36,
 15: 37,
 8: 38,
 9: 39,
 25: 40,
 38: 40,
 26: 41,
 6: 42,
 29: 44,
 13: 46,
 30: 47,
 19: 56,
 22: 59}

In [57]:
# Keep only the first five entries of the (already ordered) dict.
filtered_counts = {file_no: filtered_counts[file_no] for file_no in list(filtered_counts)[:5]}
filtered_counts

{1: 12, 35: 12, 32: 14, 34: 16, 7: 18}

In [58]:
# Re-order the selected entries by ascending file number.
sorted_filtered_counts = {file_no: filtered_counts[file_no] for file_no in sorted(filtered_counts)}
sorted_filtered_counts

{1: 12, 7: 18, 32: 14, 34: 16, 35: 12}

In [59]:
# Print the file numbers of the selected entries (the keys of the dict built
# in the previous cell).
print(sorted_filtered_counts.keys())

dict_keys([1, 7, 32, 34, 35])
