Implement the sequence protocol

jalan · Jul 17, 2017 · c5252e0 · c5252e0
1 parent df2ccbb
commit c5252e0
Show file tree

Hide file tree

Showing 3 changed files with 110 additions and 37 deletions.
diff --git a/README.md b/README.md
@@ -10,15 +10,19 @@ import pdftotext
 with open("lorem_ipsum.pdf", "rb") as f:
     pdf = pdftotext.PDF(f)
 
+# How many pages?
+print(len(pdf))
+
 # Iterate over all the pages
 for page in pdf:
     print(page)
 
-# Just read the second page
-print(pdf.read(2))
+# Read some individual pages
+print(pdf[0])
+print(pdf[1])
 
-# Or read all the text at once
-print(pdf.read_all())
+# Read all the text into one string
+print("\n\n".join(pdf))
 ```
 
 

diff --git a/pdftotext/pdftotext.cpp b/pdftotext/pdftotext.cpp
@@ -14,25 +14,24 @@ static PyObject* PdftotextError;
 typedef struct {
     PyObject_HEAD
     int page_count;
-    int page_number;
     PyObject* data;
     poppler::document* doc;
 } PDF;
 
+// TODO: deprecated -- callers should use len(p) instead of p.page_count
 static PyMemberDef PDF_members[] = {
     {
         (char*)"page_count",
         T_INT,
         offsetof(PDF, page_count),
         READONLY,
-        (char*)"Page count.",
+        (char*)"Deprecated--instead of p.page_count, use len(p).",
     },
     {NULL},  // Sentinel
 };
 
 static void PDF_clear(PDF* self) {
     self->page_count = 0;
-    self->page_number = 0;
     delete self->doc;
     self->doc = NULL;
     Py_CLEAR(self->data);
@@ -82,7 +81,6 @@ static int PDF_init(PDF* self, PyObject* args, PyObject* kwds) {
         return -1;
     }
     self->page_count = self->doc->pages();
-    self->page_number = 0;
     return 0;
 }
 
@@ -95,7 +93,7 @@ static PyObject* PDF_read_page(PDF* self, int page_number) {
     const poppler::page* page;
     std::vector<char> page_utf8;
 
-    page = self->doc->create_page(page_number - 1);
+    page = self->doc->create_page(page_number);
     if (page == NULL) {
         return PyErr_Format(PdftotextError, "Poppler error creating page");
     }
@@ -104,6 +102,7 @@ static PyObject* PDF_read_page(PDF* self, int page_number) {
     return PyUnicode_DecodeUTF8(page_utf8.data(), page_utf8.size(), NULL);
 }
 
+// TODO: deprecated -- callers should use p[0] instead of p.read(1) (indexing is 0-based)
 static PyObject* PDF_read(PDF* self, PyObject* args, PyObject* kwds) {
     int page_number;
     static char* kwlist[] = {(char*)"page_number", NULL};
@@ -118,9 +117,10 @@ static PyObject* PDF_read(PDF* self, PyObject* args, PyObject* kwds) {
         return PyErr_Format(
             PdftotextError, "Invalid page number: %i", page_number);
     }
-    return PDF_read_page(self, page_number);
+    return PDF_read_page(self, page_number - 1);
 }
 
+// TODO: deprecated -- callers should use "\n\n".join(p) instead of p.read_all()
 static PyObject* PDF_read_all(PDF* self) {
     const poppler::page* page;
     std::vector<char> page_utf8;
@@ -148,30 +148,43 @@ static PyObject* PDF_read_all(PDF* self) {
     return PyUnicode_DecodeUTF8(doc_utf8.data(), doc_utf8.size(), NULL);
 }
 
-static PyObject* PDF_next(PDF* self) {
-    if (self->page_number >= self->page_count) {
-        return NULL;
-    }
-    self->page_number++;
-    return PDF_read_page(self, self->page_number);
-}
-
 static PyMethodDef PDF_methods[] = {
     {
         "read",
         (PyCFunction)PDF_read,
         METH_VARARGS | METH_KEYWORDS,
-        "Extract text from the given page number.",
+        "Deprecated--instead of p.read(1), use p[0].",
     },
     {
         "read_all",
         (PyCFunction)PDF_read_all,
         METH_NOARGS,
-        "Extract all text from the document, joining pages with \"\\n\\n\".",
+        "Deprecated--instead of p.read_all(), use \"\\n\\n\".join(p).",
     },
     {NULL},  // Sentinel
 };
 
+static Py_ssize_t PDF_len(PyObject* obj) {  // sq_length slot (__len__): number of pages in the PDF
+    PDF* self = (PDF*)obj;
+    return self->page_count;  // cached in PDF_init from doc->pages(); PDF_clear resets it to 0
+}
+
+static PyObject* PDF_getitem(PyObject* obj, Py_ssize_t i) {  // sq_item slot (__getitem__): text of the page at 0-based index i
+    PDF* self = (PDF*)obj;
+
+    if (i < 0 || i >= self->page_count) {  // only [0, page_count) is valid; with sq_length set, the interpreter has already added len() to negative indexes
+        return PyErr_Format(PyExc_IndexError, "Index out of range");  // PyErr_Format sets the exception and returns NULL
+    }
+    return PDF_read_page(self, i);  // PDF_read_page takes an int; i is below the int page_count at this point
+}
+
+static PySequenceMethods PDF_sequence_methods = {  // sequence-protocol slot table, installed on PDFType through tp_as_sequence
+    PDF_len,      // sq_length (__len__)
+    0,            // sq_concat (not provided)
+    0,            // sq_repeat (not provided)
+    PDF_getitem,  // sq_item (__getitem__)
+};  // members after sq_item are omitted and therefore zero-initialized
+
 static PyTypeObject PDFType = {
     PyVarObject_HEAD_INIT(NULL, 0)
     "pdftotext.PDF",                           // tp_name
@@ -184,7 +197,7 @@ static PyTypeObject PDFType = {
     0,                                         // tp_reserved
     0,                                         // tp_repr
     0,                                         // tp_as_number
-    0,                                         // tp_as_sequence
+    &PDF_sequence_methods,                     // tp_as_sequence
     0,                                         // tp_as_mapping
     0,                                         // tp_hash
     0,                                         // tp_call
@@ -198,8 +211,8 @@ static PyTypeObject PDFType = {
     0,                                         // tp_clear
     0,                                         // tp_richcompare
     0,                                         // tp_weaklistoffset
-    PyObject_SelfIter,                         // tp_iter
-    (iternextfunc)PDF_next,                    // tp_iternext
+    0,                                         // tp_iter
+    0,                                         // tp_iternext
     PDF_methods,                               // tp_methods
     PDF_members,                               // tp_members
     0,                                         // tp_getset

diff --git a/tests/test_pdf.py b/tests/test_pdf.py
@@ -32,7 +32,7 @@ class InitTest(unittest.TestCase):
     def test_double_init_success(self):
         pdf = pdftotext.PDF(get_file("abcde.pdf"))
         pdf.__init__(get_file("blank.pdf"))
-        self.assertEqual(pdf.page_count, 1)
+        self.assertEqual(len(pdf), 1)
 
     def test_double_init_failure(self):
         pdf = pdftotext.PDF(get_file("blank.pdf"))
@@ -58,7 +58,7 @@ class BrokenPDF(pdftotext.PDF):
             def __init__(self):
                 pass
         pdf = BrokenPDF()
-        self.assertEqual(pdf.page_count, 0)
+        self.assertEqual(len(pdf), 0)
 
 
 class ReadTest(unittest.TestCase):
@@ -123,6 +123,43 @@ def test_read_page_two(self):
         self.assertIn("two", result)
 
 
+class GetItemTest(unittest.TestCase):
+    """Test the __getitem__ method."""
+
+    def test_read(self):  # index 0 returns the text of the first page
+        pdf = pdftotext.PDF(get_file("abcde.pdf"))
+        result = pdf[0]
+        self.assertIn("abcde", result)
+
+    def test_no_doc_to_read(self):  # indexing an object whose __init__ was bypassed must raise IndexError
+        class BrokenPDF(pdftotext.PDF):
+            def __init__(self):  # deliberately skips pdftotext.PDF.__init__, so no file is ever passed in
+                pass
+        pdf = BrokenPDF()
+        with self.assertRaises(IndexError):
+            pdf[0]
+
+    def test_pdf_read_invalid_page_number(self):  # an index past the last page raises IndexError
+        pdf = pdftotext.PDF(get_file("blank.pdf"))
+        with self.assertRaises(IndexError):
+            pdf[100]
+
+    def test_pdf_read_wrong_arg_type(self):  # a non-integer index raises TypeError
+        pdf = pdftotext.PDF(get_file("blank.pdf"))
+        with self.assertRaises(TypeError):
+            pdf["wrong"]
+
+    def test_read_corrupt_page(self):  # construction and indexing are both inside the block; either may raise
+        with self.assertRaises((pdftotext.Error, IndexError)):
+            pdf = pdftotext.PDF(get_file("corrupt_page.pdf"))
+            pdf[0]
+
+    def test_read_page_two(self):  # index 1 returns the text of the second page
+        pdf = pdftotext.PDF(get_file("two_page.pdf"))
+        result = pdf[1]
+        self.assertIn("two", result)
+
+
 class ReadAllTest(unittest.TestCase):
     """Test the read_all method."""
 
@@ -163,29 +200,48 @@ def test_page_count_two(self):
         self.assertEqual(pdf.page_count, 2)
 
 
-class IterationTest(unittest.TestCase):
+class LengthTest(unittest.TestCase):
+    """Test the __len__ method."""
+
+    def test_length_one(self):  # len() of a one-page document is 1
+        pdf = pdftotext.PDF(get_file("blank.pdf"))
+        self.assertEqual(len(pdf), 1)
+
+    def test_length_two(self):  # len() of a two-page document is 2
+        pdf = pdftotext.PDF(get_file("two_page.pdf"))
+        self.assertEqual(len(pdf), 2)
+
+    def test_length_no_doc(self):  # with __init__ bypassed, len() must still work and report 0
+        class BrokenPDF(pdftotext.PDF):
+            def __init__(self):  # deliberately skips pdftotext.PDF.__init__, so no file is ever passed in
+                pass
+        pdf = BrokenPDF()
+        self.assertEqual(len(pdf), 0)
+
+
+class ListTest(unittest.TestCase):
     """Test iterating over pages."""
 
     def test_list_length(self):
         pdf = pdftotext.PDF(get_file("two_page.pdf"))
-        result = list(pdf)
-        self.assertEqual(len(result), 2)
+        self.assertEqual(len(pdf), 2)
 
     def test_list_first_element(self):
         pdf = pdftotext.PDF(get_file("two_page.pdf"))
-        result = list(pdf)
-        self.assertIn("one", result[0])
+        self.assertIn("one", pdf[0])
 
     def test_list_second_element(self):
         pdf = pdftotext.PDF(get_file("two_page.pdf"))
-        result = list(pdf)
-        self.assertIn("two", result[1])
+        self.assertIn("two", pdf[1])
 
-    def test_stop_iteration(self):
-        pdf = pdftotext.PDF(get_file("blank.pdf"))
-        with self.assertRaises(StopIteration):
-            next(pdf)
-            next(pdf)
+    def test_list_invalid_element(self):
+        pdf = pdftotext.PDF(get_file("two_page.pdf"))
+        with self.assertRaises(IndexError):
+            pdf[2]
+
+    def test_list_last_element(self):
+        pdf = pdftotext.PDF(get_file("two_page.pdf"))
+        self.assertIn("two", pdf[-1])
 
     def test_for_loop(self):
         pdf = pdftotext.PDF(get_file("two_page.pdf"))