Skip to content

Commit

Permalink
BUG: read_fwf inference should respect skiprows (pandas-dev#11256)
Browse files Browse the repository at this point in the history
Fix the fact that we don't skip the rows when inferring colspecs by
passing skiprows down the chain until it's needed.

- [X] closes pandas-dev#11256
- [X] 3 tests added / passed
- [X] passes `git diff upstream/master | flake8 --diff`
- [X] whatsnew entry

Author: D.S. McNeil <dsm054@gmail.com>

Closes pandas-dev#14028 from dsm054/bugfix/fwf_skiprows and squashes the following commits:

b5b3e66 [D.S. McNeil] BUG: read_fwf inference should respect skiprows (pandas-dev#11256)
  • Loading branch information
dsm054 authored and jreback committed Jan 10, 2017
1 parent e1a4144 commit aa03e7f
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 33 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,7 @@ Bug Fixes
- Bug in ``DataFrame.reindex()`` in which ``method`` was ignored when passing ``columns`` (:issue:`14992`)
- Bug in ``pd.to_numeric()`` in which float and unsigned integer elements were being improperly cast (:issue:`14941`, :issue:`15005`)
- Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
- Bug in ``pd.read_fwf()`` where the ``skiprows`` parameter was not being respected during column width inference (:issue:`11256`)

- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)

Expand Down
61 changes: 45 additions & 16 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@
fields of each line as half-open intervals (i.e., [from, to[ ).
String value 'infer' can be used to instruct the parser to try
detecting the column specifications from the first 100 rows of
the data (default='infer').
the data which are not being skipped via skiprows (default='infer').
widths : list of ints, optional
A list of field widths which can be used instead of 'colspecs' if
the intervals are contiguous.
Expand Down Expand Up @@ -3034,13 +3034,13 @@ class FixedWidthReader(BaseIterator):
A reader of fixed-width lines.
"""

def __init__(self, f, colspecs, delimiter, comment):
def __init__(self, f, colspecs, delimiter, comment, skiprows=None):
self.f = f
self.buffer = None
self.delimiter = '\r\n' + delimiter if delimiter else '\n\r\t '
self.comment = comment
if colspecs == 'infer':
self.colspecs = self.detect_colspecs()
self.colspecs = self.detect_colspecs(skiprows=skiprows)
else:
self.colspecs = colspecs

Expand All @@ -3049,28 +3049,57 @@ def __init__(self, f, colspecs, delimiter, comment):
"input was a %r" % type(colspecs).__name__)

for colspec in self.colspecs:

if not (isinstance(colspec, (tuple, list)) and
len(colspec) == 2 and
isinstance(colspec[0], (int, np.integer, type(None))) and
isinstance(colspec[1], (int, np.integer, type(None)))):
raise TypeError('Each column specification must be '
'2 element tuple or list of integers')

def get_rows(self, n):
rows = []
for i, row in enumerate(self.f, 1):
rows.append(row)
if i >= n:
def get_rows(self, n, skiprows=None):
"""
Read rows from self.f, skipping as specified.
We distinguish buffer_rows (every line consumed from self.f,
including skipped ones) from detect_rows (the at most n
non-skipped lines returned to detect_colspecs) because
it's simpler to leave the other locations with
skiprows logic alone than to modify them to deal
with the fact we skipped some rows here as well.
Parameters
----------
n : int
Number of rows to read from self.f, not counting
rows that are skipped.
skiprows : set, optional
Indices of rows to skip.
Returns
-------
detect_rows : list of str
The rows that were read and not skipped (at most n of them).
"""
if skiprows is None:
skiprows = set()
buffer_rows = []
detect_rows = []
for i, row in enumerate(self.f):
if i not in skiprows:
detect_rows.append(row)
buffer_rows.append(row)
if len(detect_rows) >= n:
break
self.buffer = iter(rows)
return rows
self.buffer = iter(buffer_rows)
return detect_rows

def detect_colspecs(self, n=100):
def detect_colspecs(self, n=100, skiprows=None):
# Regex escape the delimiters
delimiters = ''.join([r'\%s' % x for x in self.delimiter])
pattern = re.compile('([^%s]+)' % delimiters)
rows = self.get_rows(n)
rows = self.get_rows(n, skiprows)
if not rows:
raise EmptyDataError("No rows from which to infer column width")
max_len = max(map(len, rows))
mask = np.zeros(max_len + 1, dtype=int)
if self.comment is not None:
Expand All @@ -3081,7 +3110,8 @@ def detect_colspecs(self, n=100):
shifted = np.roll(mask, 1)
shifted[0] = 0
edges = np.where((mask ^ shifted) == 1)[0]
return list(zip(edges[::2], edges[1::2]))
edge_pairs = list(zip(edges[::2], edges[1::2]))
return edge_pairs

def __next__(self):
if self.buffer is not None:
Expand All @@ -3106,9 +3136,8 @@ class FixedWidthFieldParser(PythonParser):
def __init__(self, f, **kwds):
# Support iterators, convert to a list.
self.colspecs = kwds.pop('colspecs')

PythonParser.__init__(self, f, **kwds)

def _make_reader(self, f):
self.data = FixedWidthReader(f, self.colspecs, self.delimiter,
self.comment)
self.comment, self.skiprows)
72 changes: 55 additions & 17 deletions pandas/io/tests/parser/test_read_fwf.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from pandas import DataFrame
from pandas import compat
from pandas.compat import StringIO, BytesIO
from pandas.io.parsers import read_csv, read_fwf
from pandas.io.parsers import read_csv, read_fwf, EmptyDataError


class TestFwfParsing(tm.TestCase):
Expand Down Expand Up @@ -248,83 +248,83 @@ def test_bool_header_arg(self):

def test_full_file(self):
# File with all values
test = '''index A B C
test = """index A B C
2000-01-03T00:00:00 0.980268513777 3 foo
2000-01-04T00:00:00 1.04791624281 -4 bar
2000-01-05T00:00:00 0.498580885705 73 baz
2000-01-06T00:00:00 1.12020151869 1 foo
2000-01-07T00:00:00 0.487094399463 0 bar
2000-01-10T00:00:00 0.836648671666 2 baz
2000-01-11T00:00:00 0.157160753327 34 foo'''
2000-01-11T00:00:00 0.157160753327 34 foo"""
colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
expected = read_fwf(StringIO(test), colspecs=colspecs)
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))

def test_full_file_with_missing(self):
# File with missing values
test = '''index A B C
test = """index A B C
2000-01-03T00:00:00 0.980268513777 3 foo
2000-01-04T00:00:00 1.04791624281 -4 bar
0.498580885705 73 baz
2000-01-06T00:00:00 1.12020151869 1 foo
2000-01-07T00:00:00 0 bar
2000-01-10T00:00:00 0.836648671666 2 baz
34'''
34"""
colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
expected = read_fwf(StringIO(test), colspecs=colspecs)
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))

def test_full_file_with_spaces(self):
# File with spaces in columns
test = '''
test = """
Account Name Balance CreditLimit AccountCreated
101 Keanu Reeves 9315.45 10000.00 1/17/1998
312 Gerard Butler 90.00 1000.00 8/6/2003
868 Jennifer Love Hewitt 0 17000.00 5/25/1985
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
317 Bill Murray 789.65 5000.00 2/5/2007
'''.strip('\r\n')
""".strip('\r\n')
colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
expected = read_fwf(StringIO(test), colspecs=colspecs)
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))

def test_full_file_with_spaces_and_missing(self):
# File with spaces and missing values in columns
test = '''
test = """
Account Name Balance CreditLimit AccountCreated
101 10000.00 1/17/1998
312 Gerard Butler 90.00 1000.00 8/6/2003
868 5/25/1985
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
317 Bill Murray 789.65
'''.strip('\r\n')
""".strip('\r\n')
colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
expected = read_fwf(StringIO(test), colspecs=colspecs)
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))

def test_messed_up_data(self):
# Completely messed up file
test = '''
test = """
Account Name Balance Credit Limit Account Created
101 10000.00 1/17/1998
312 Gerard Butler 90.00 1000.00
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
317 Bill Murray 789.65
'''.strip('\r\n')
""".strip('\r\n')
colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79))
expected = read_fwf(StringIO(test), colspecs=colspecs)
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))

def test_multiple_delimiters(self):
test = r'''
test = r"""
col1~~~~~col2 col3++++++++++++++++++col4
~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves
33+++122.33\\\bar.........Gerard Butler
++44~~~~12.01 baz~~Jennifer Love Hewitt
~~55 11+++foo++++Jada Pinkett-Smith
..66++++++.03~~~bar Bill Murray
'''.strip('\r\n')
""".strip('\r\n')
colspecs = ((0, 4), (7, 13), (15, 19), (21, 41))
expected = read_fwf(StringIO(test), colspecs=colspecs,
delimiter=' +~.\\')
Expand All @@ -335,22 +335,22 @@ def test_variable_width_unicode(self):
if not compat.PY3:
raise nose.SkipTest(
'Bytes-related test - only needs to work on Python 3')
test = '''
test = """
שלום שלום
ום שלל
של ום
'''.strip('\r\n')
""".strip('\r\n')
expected = read_fwf(BytesIO(test.encode('utf8')),
colspecs=[(0, 4), (5, 9)],
header=None, encoding='utf8')
tm.assert_frame_equal(expected, read_fwf(
BytesIO(test.encode('utf8')), header=None, encoding='utf8'))

def test_dtype(self):
data = ''' a b c
data = """ a b c
1 2 3.2
3 4 5.2
'''
"""
colspecs = [(0, 5), (5, 10), (10, None)]
result = pd.read_fwf(StringIO(data), colspecs=colspecs)
expected = pd.DataFrame({
Expand All @@ -365,3 +365,41 @@ def test_dtype(self):
result = pd.read_fwf(StringIO(data), colspecs=colspecs,
dtype={'a': 'float64', 'b': str, 'c': 'int32'})
tm.assert_frame_equal(result, expected)

def test_skiprows_inference(self):
# GH11256
# Regression test: with an integer skiprows and no explicit colspecs,
# read_fwf must produce the same frame as whitespace-delimited read_csv
# called with the same skiprows on the same text.
test = """
Text contained in the file header
DataCol1 DataCol2
0.0 1.0
101.6 956.1
""".strip()
# read_csv with delim_whitespace=True serves as the reference result.
expected = read_csv(StringIO(test), skiprows=2,
delim_whitespace=True)
tm.assert_frame_equal(expected, read_fwf(
StringIO(test), skiprows=2))

def test_skiprows_by_index_inference(self):
# Same comparison as the integer-skiprows case, but skiprows is given
# as a list of row indices ([0, 2]), so the skipped lines are not
# contiguous and the line between them stays in the input.
test = """
To be skipped
Not To Be Skipped
Once more to be skipped
123 34 8 123
456 78 9 456
""".strip()

# read_csv with delim_whitespace=True serves as the reference result.
expected = read_csv(StringIO(test), skiprows=[0, 2],
delim_whitespace=True)
tm.assert_frame_equal(expected, read_fwf(
StringIO(test), skiprows=[0, 2]))

def test_skiprows_inference_empty(self):
# The fixture has three lines and skiprows=3, so the test expects
# read_fwf to raise EmptyDataError rather than return a frame.
test = """
AA BBB C
12 345 6
78 901 2
""".strip()

with tm.assertRaises(EmptyDataError):
read_fwf(StringIO(test), skiprows=3)

0 comments on commit aa03e7f

Please sign in to comment.