Skip to content

Commit

Permalink
Implement the sequence protocol
Browse files Browse the repository at this point in the history
  • Loading branch information
jalan committed Jul 17, 2017
1 parent df2ccbb commit c5252e0
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 37 deletions.
12 changes: 8 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,19 @@ import pdftotext
with open("lorem_ipsum.pdf", "rb") as f:
pdf = pdftotext.PDF(f)

# How many pages?
print(len(pdf))

# Iterate over all the pages
for page in pdf:
print(page)

# Just read the second page
print(pdf.read(2))
# Read some individual pages
print(pdf[0])
print(pdf[1])

# Or read all the text at once
print(pdf.read_all())
# Read all the text into one string
print("\n\n".join(pdf))
```


Expand Down
51 changes: 32 additions & 19 deletions pdftotext/pdftotext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,25 +14,24 @@ static PyObject* PdftotextError;
// Per-instance state of the PDF extension object (the functions in this file
// cast their PyObject* argument to PDF* to reach these fields).
typedef struct {
PyObject_HEAD
// Page count: assigned from doc->pages() in PDF_init and zeroed in PDF_clear.
int page_count;
// Iteration cursor: PDF_next increments it and then reads that page; it is
// reset to 0 in PDF_init and PDF_clear.
int page_number;
// Python object reference released with Py_CLEAR in PDF_clear.
// NOTE(review): presumably the buffer the document was loaded from -- confirm
// against PDF_init.
PyObject* data;
// Poppler document handle; deleted and set to NULL in PDF_clear.
poppler::document* doc;
} PDF;

// TODO: deprecated
static PyMemberDef PDF_members[] = {
{
(char*)"page_count",
T_INT,
offsetof(PDF, page_count),
READONLY,
(char*)"Page count.",
(char*)"Deprecated--instead of p.page_count, use len(p).",
},
{NULL}, // Sentinel
};

static void PDF_clear(PDF* self) {
self->page_count = 0;
self->page_number = 0;
delete self->doc;
self->doc = NULL;
Py_CLEAR(self->data);
Expand Down Expand Up @@ -82,7 +81,6 @@ static int PDF_init(PDF* self, PyObject* args, PyObject* kwds) {
return -1;
}
self->page_count = self->doc->pages();
self->page_number = 0;
return 0;
}

Expand All @@ -95,7 +93,7 @@ static PyObject* PDF_read_page(PDF* self, int page_number) {
const poppler::page* page;
std::vector<char> page_utf8;

page = self->doc->create_page(page_number - 1);
page = self->doc->create_page(page_number);
if (page == NULL) {
return PyErr_Format(PdftotextError, "Poppler error creating page");
}
Expand All @@ -104,6 +102,7 @@ static PyObject* PDF_read_page(PDF* self, int page_number) {
return PyUnicode_DecodeUTF8(page_utf8.data(), page_utf8.size(), NULL);
}

// TODO: deprecated
static PyObject* PDF_read(PDF* self, PyObject* args, PyObject* kwds) {
int page_number;
static char* kwlist[] = {(char*)"page_number", NULL};
Expand All @@ -118,9 +117,10 @@ static PyObject* PDF_read(PDF* self, PyObject* args, PyObject* kwds) {
return PyErr_Format(
PdftotextError, "Invalid page number: %i", page_number);
}
return PDF_read_page(self, page_number);
return PDF_read_page(self, page_number - 1);
}

// TODO: deprecated
static PyObject* PDF_read_all(PDF* self) {
const poppler::page* page;
std::vector<char> page_utf8;
Expand Down Expand Up @@ -148,30 +148,43 @@ static PyObject* PDF_read_all(PDF* self) {
return PyUnicode_DecodeUTF8(doc_utf8.data(), doc_utf8.size(), NULL);
}

// Iterator "next" slot: advance the cursor by one page and return that page's
// text. Once the cursor has reached page_count, NULL is returned without
// setting an exception (the tests expect next() to raise StopIteration then).
static PyObject* PDF_next(PDF* self) {
    if (self->page_number < self->page_count) {
        // Advance first: the value handed to PDF_read_page is the cursor
        // *after* the increment.
        self->page_number += 1;
        return PDF_read_page(self, self->page_number);
    }
    return NULL;
}

static PyMethodDef PDF_methods[] = {
{
"read",
(PyCFunction)PDF_read,
METH_VARARGS | METH_KEYWORDS,
"Extract text from the given page number.",
"Deprecated--instead of p.read(1), use p[0].",
},
{
"read_all",
(PyCFunction)PDF_read_all,
METH_NOARGS,
"Extract all text from the document, joining pages with \"\\n\\n\".",
"Deprecated--instead of p.read_all(), use \"\\n\\n\".join(p).",
},
{NULL}, // Sentinel
};

// sq_length slot (__len__): report the stored page count of the PDF object.
static Py_ssize_t PDF_len(PyObject* obj) {
    return ((PDF*)obj)->page_count;
}

// sq_item slot (__getitem__): return the text of the page at index i.
//
// Indices outside [0, page_count) raise IndexError; otherwise the index is
// forwarded unchanged to PDF_read_page and its result is returned as-is.
// NOTE(review): negative indices coming from Python are expected to be
// normalized by the interpreter before this slot is called (the tests rely on
// pdf[-1]) -- confirm against the CPython sequence protocol docs.
static PyObject* PDF_getitem(PyObject* obj, Py_ssize_t i) {
    PDF* pdf = (PDF*)obj;
    const bool in_range = (0 <= i) && (i < pdf->page_count);

    if (in_range) {
        return PDF_read_page(pdf, i);
    }
    return PyErr_Format(PyExc_IndexError, "Index out of range");
}

// Sequence-protocol slot table for the PDF type (referenced from PDFType's
// tp_as_sequence entry). Only length and item access are implemented: the
// concat and repeat slots are 0, and the slots after sq_item are omitted from
// the initializer. Entries are positional, so their order must not change.
static PySequenceMethods PDF_sequence_methods = {
PDF_len, // sq_length (__len__)
0, // sq_concat
0, // sq_repeat
PDF_getitem, // sq_item (__getitem__)
};

static PyTypeObject PDFType = {
PyVarObject_HEAD_INIT(NULL, 0)
"pdftotext.PDF", // tp_name
Expand All @@ -184,7 +197,7 @@ static PyTypeObject PDFType = {
0, // tp_reserved
0, // tp_repr
0, // tp_as_number
0, // tp_as_sequence
&PDF_sequence_methods, // tp_as_sequence
0, // tp_as_mapping
0, // tp_hash
0, // tp_call
Expand All @@ -198,8 +211,8 @@ static PyTypeObject PDFType = {
0, // tp_clear
0, // tp_richcompare
0, // tp_weaklistoffset
PyObject_SelfIter, // tp_iter
(iternextfunc)PDF_next, // tp_iternext
0, // tp_iter
0, // tp_iternext
PDF_methods, // tp_methods
PDF_members, // tp_members
0, // tp_getset
Expand Down
84 changes: 70 additions & 14 deletions tests/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class InitTest(unittest.TestCase):
def test_double_init_success(self):
pdf = pdftotext.PDF(get_file("abcde.pdf"))
pdf.__init__(get_file("blank.pdf"))
self.assertEqual(pdf.page_count, 1)
self.assertEqual(len(pdf), 1)

def test_double_init_failure(self):
pdf = pdftotext.PDF(get_file("blank.pdf"))
Expand All @@ -58,7 +58,7 @@ class BrokenPDF(pdftotext.PDF):
def __init__(self):
pass
pdf = BrokenPDF()
self.assertEqual(pdf.page_count, 0)
self.assertEqual(len(pdf), 0)


class ReadTest(unittest.TestCase):
Expand Down Expand Up @@ -123,6 +123,43 @@ def test_read_page_two(self):
self.assertIn("two", result)


class GetItemTest(unittest.TestCase):
    """Exercise page access through subscripting (``pdf[i]``)."""

    def test_read(self):
        """Page 0 of abcde.pdf contains the text "abcde"."""
        document = pdftotext.PDF(get_file("abcde.pdf"))
        self.assertIn("abcde", document[0])

    def test_no_doc_to_read(self):
        """An instance whose __init__ was bypassed raises IndexError on access."""
        class BrokenPDF(pdftotext.PDF):
            def __init__(self):
                pass
        broken = BrokenPDF()
        with self.assertRaises(IndexError):
            broken[0]

    def test_pdf_read_invalid_page_number(self):
        """An index far past the last page raises IndexError."""
        document = pdftotext.PDF(get_file("blank.pdf"))
        with self.assertRaises(IndexError):
            document[100]

    def test_pdf_read_wrong_arg_type(self):
        """A non-integer subscript raises TypeError."""
        document = pdftotext.PDF(get_file("blank.pdf"))
        with self.assertRaises(TypeError):
            document["wrong"]

    def test_read_corrupt_page(self):
        """Opening or reading a corrupt page fails with Error or IndexError."""
        # Construction stays inside the context manager: either opening the
        # file or reading the page is allowed to be the step that raises.
        expected_errors = (pdftotext.Error, IndexError)
        with self.assertRaises(expected_errors):
            document = pdftotext.PDF(get_file("corrupt_page.pdf"))
            document[0]

    def test_read_page_two(self):
        """Index 1 of two_page.pdf returns the page containing "two"."""
        document = pdftotext.PDF(get_file("two_page.pdf"))
        self.assertIn("two", document[1])


class ReadAllTest(unittest.TestCase):
"""Test the read_all method."""

Expand Down Expand Up @@ -163,29 +200,48 @@ def test_page_count_two(self):
self.assertEqual(pdf.page_count, 2)


class IterationTest(unittest.TestCase):
class LengthTest(unittest.TestCase):
    """Check the page count reported by ``len()``."""

    def test_length_one(self):
        """blank.pdf has exactly one page."""
        document = pdftotext.PDF(get_file("blank.pdf"))
        page_total = len(document)
        self.assertEqual(page_total, 1)

    def test_length_two(self):
        """two_page.pdf has exactly two pages."""
        document = pdftotext.PDF(get_file("two_page.pdf"))
        page_total = len(document)
        self.assertEqual(page_total, 2)

    def test_length_no_doc(self):
        """An instance whose __init__ was bypassed has length 0."""
        class BrokenPDF(pdftotext.PDF):
            def __init__(self):
                pass
        broken = BrokenPDF()
        self.assertEqual(len(broken), 0)


class ListTest(unittest.TestCase):
"""Test iterating over pages."""

def test_list_length(self):
pdf = pdftotext.PDF(get_file("two_page.pdf"))
result = list(pdf)
self.assertEqual(len(result), 2)
self.assertEqual(len(pdf), 2)

def test_list_first_element(self):
pdf = pdftotext.PDF(get_file("two_page.pdf"))
result = list(pdf)
self.assertIn("one", result[0])
self.assertIn("one", pdf[0])

def test_list_second_element(self):
pdf = pdftotext.PDF(get_file("two_page.pdf"))
result = list(pdf)
self.assertIn("two", result[1])
self.assertIn("two", pdf[1])

def test_stop_iteration(self):
pdf = pdftotext.PDF(get_file("blank.pdf"))
with self.assertRaises(StopIteration):
next(pdf)
next(pdf)
def test_list_invalid_element(self):
pdf = pdftotext.PDF(get_file("two_page.pdf"))
with self.assertRaises(IndexError):
pdf[2]

def test_list_last_element(self):
pdf = pdftotext.PDF(get_file("two_page.pdf"))
self.assertIn("two", pdf[-1])

def test_for_loop(self):
pdf = pdftotext.PDF(get_file("two_page.pdf"))
Expand Down

0 comments on commit c5252e0

Please sign in to comment.