# Ideas - Dealing with Fixed Width

In [8]:
from datetime import datetime
import pandas as pd
import numpy as np
import boto3
import glob
import json
import math
import io
import os
import csv
from flatten_json import flatten
import operator
from collections import Counter
from numpy import nan

In [73]:
# Notebook-wide display settings: show every column and row, and never
# truncate the text inside a cell.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
# None means "no limit"; the older -1 sentinel was deprecated in pandas 1.0.
pd.options.display.max_colwidth = None
pd.set_option('colheader_justify', 'left')
pd.set_option('mode.chained_assignment', None) # disable the SettingwithCopyWarning

In [90]:
# Sample fixed-width file that the cells below read from.
test_file = 'TestFixedWidth.txt'

In [91]:
# First attempt: let pandas infer the column boundaries on its own, and tag
# every row with the name of the file it came from.
df = pd.read_fwf(test_file).assign(fileName=test_file)
# Also remember the source file as a plain attribute on the frame object.
df.name = test_file

In [92]:
df

Unnamed: 0,head1 head2 head3 head4,fileName
0,123456789012345671234567890123456789012345678901234567890,TestFixedWidth.txt
1,data1 d1-2 d1-3 d1-4,TestFixedWidth.txt
2,f1=10 f2=7 f3=30 f4=10,TestFixedWidth.txt


In [93]:
# Grab only the header line of the file; it is parsed below to work out the
# column names and widths.
with open(test_file) as f:
    first_line = f.readline()

In [94]:
first_line

'head1     head2  head3                         head4     \n'

In [95]:
# Splitting on runs of whitespace yields one entry per column name.
headers = first_line.split()

In [102]:
headers

['head1', 'head2', 'head3', 'head4']

In [103]:
print(headers)

['head1', 'head2', 'head3', 'head4']


In [97]:
# Column widths typed in by hand for the sample file.
widths = [10, 7, 30, 10]
# `names=headers` is not passed here: the header row is part of the file and
# read_fwf picks the column names up from it.
df_fixed = pd.read_fwf(test_file, widths=widths)

In [98]:
df_fixed

Unnamed: 0,head1,head2,head3,head4
0,1234567890,1234567,123456789012345678901234567890,1234567890
1,data1,d1-2,d1-3,d1-4
2,f1=10,f2=7,f3=30,f4=10


In [99]:
# Derive the column widths from the header line instead of typing them in.
# Assumptions:
#  - there is at least one space between adjacent column names
#  - the final field runs to the end of the line
first_line

'head1     head2  head3                         head4     \n'

In [101]:
# Measure each header field as "name plus the padding that follows it".
# A field is considered finished when the first non-space character after a
# run of spaces is reached; for the last field that character is the
# trailing newline of the header line.
w = []
field_start = 0
in_padding = False
for pos, ch in enumerate(first_line):
    if ch == ' ':
        in_padding = True
    elif in_padding:
        width = pos - field_start
        w.append(width)
        print(width)
        field_start = pos
        in_padding = False

w

10
7
30
10


[10, 7, 30, 10]

## Make a function  

In [131]:
def df_from_fixedfile (in_file):
    """Load a fixed-width text file, taking the column layout from its header line.

    The first line of ``in_file`` is expected to hold the column names, each
    one starting at the position where its column starts and separated from
    the next by at least one whitespace character. Every column spans from
    the start of its name to the start of the next name; the first column
    always starts at position 0 and the last column runs to the end of each
    line, so the header does not need trailing padding.

    Parameters
    ----------
    in_file : str
        Path of the fixed-width file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed file, with the header line used for the column names.

    Raises
    ------
    ValueError
        If the first line contains no column names.
    """
    # Local import: the notebook's import cell does not bring in `re`.
    import re

    # Need the first line to get field lengths
    with open(in_file) as f:
        first_line = f.readline()

    # Drop the line terminator so it is never counted as part of a field.
    header_line = first_line.rstrip('\r\n')

    # Each run of non-whitespace characters is one column name.
    matches = list(re.finditer(r'\S+', header_line))
    if not matches:
        raise ValueError("No header fields found on the first line of %r" % (in_file,))

    # these are the fields being created
    headers = [m.group() for m in matches]
    print ('Preparing to load dataframe with fields: ' + str(headers))

    # Column boundaries as half-open [start, end) intervals. The last end is
    # None so the final column is read to the end of every line, even when
    # the header itself carries no trailing padding.
    starts = [0] + [m.start() for m in matches[1:]]
    ends = starts[1:] + [None]
    colspecs = list(zip(starts, ends))

    # Widths are reported for information only; for the open-ended last
    # column the width shown is the one occupied on the header line.
    for name, start, end in zip(headers, starts, ends):
        width = (len(header_line) if end is None else end) - start
        print ("Field :  %s     Width : %d" %(name, width))

    df_fixed = pd.read_fwf(in_file, colspecs=colspecs)
    return df_fixed

In [132]:
# testing: load the sample file through the new function.
# Note that this rebinds `df`, replacing the frame loaded earlier in the notebook.
df = df_from_fixedfile (test_file)
df

Preparing to load dataframe with fields: ['head1', 'head2', 'head3', 'head4']
Field :  head1     Width : 10
Field :  head2     Width : 7
Field :  head3     Width : 30
Field :  head4     Width : 10


Unnamed: 0,head1,head2,head3,head4
0,1234567890,1234567,123456789012345678901234567890,1234567890
1,data1,d1-2,d1-3,d1-4
2,f1=10,f2=7,f3=30,f4=10


# DONE

### Red Herring .... not what I wanted   
The code below is adapted from  
https://stackoverflow.com/questions/9721429/how-do-i-read-a-fix-width-format-text-file-in-pandas  
It was written for Python 2, so it is a little flaky under Python 3.  

In [40]:
def fixed_width_to_items(filename, fields, first_column_is_index=False, ignore_first_rows=0):
    """Lazily yield ``(key, values)`` pairs parsed from a fixed-width text file.

    Parameters
    ----------
    filename : str
        Path of the file to read. It is opened read-only.
    fields : sequence of int
        Column boundaries as character offsets; consecutive entries delimit
        one column.
    first_column_is_index : bool, default False
        If True, the key of each pair is the raw (unstripped) text of
        ``line[0:fields[1]]`` and the values come from the remaining columns.
        If False, the key is the 0-based position of the line among the lines
        that were not skipped, and every column contributes a value.
    ignore_first_rows : int, default 0
        Number of leading lines to skip before parsing.

    Returns
    -------
    generator of (key, list of str)
        One pair per remaining line; values are whitespace-stripped strings.
        The file is closed once the generator has been iterated to the end.
    """
    # Work out the slices before touching the file so that a malformed
    # `fields` argument cannot leave an open handle behind.
    if first_column_is_index:
        index = slice(0, fields[1])
        columns = [slice(*x) for x in zip(fields[1:-1], fields[2:])]
    else:
        index = None
        columns = [slice(*x) for x in zip(fields[:-1], fields[1:])]

    # Read-only access is enough; 'r+' would needlessly require write permission.
    reader = open(filename, 'r')
    try:
        # skip first rows
        for _ in range(ignore_first_rows):
            reader.readline()
    except Exception:
        reader.close()
        raise

    def _items():
        # The with-block closes the file when iteration finishes or fails.
        with reader:
            for i, line in enumerate(reader):
                key = line[index] if first_column_is_index else i
                yield (key, [line[x].strip() for x in columns])

    return _items()


In [56]:
import pandas
import numpy
import tempfile

# create a data frame
df = pandas.DataFrame(numpy.random.randn(100, 5))
# Write its text rendering to a temporary file; the handle stays open so the
# file still exists when it is read back by name below.
file_ = tempfile.NamedTemporaryFile(mode='w+t',delete=True)
file_.write(df.to_string())
file_.flush()

# to see the data ...
# !cat $file_.name

# specify fields
fields = [0, 3, 12, 22, 32, 42, 52]
items = fixed_width_to_items(file_.name, fields, first_column_is_index=True, ignore_first_rows=1)
# DataFrame.from_items is deprecated and absent from later pandas releases;
# from_dict with orient='index' builds the same frame (one row per key)
# without the extra transpose.
df2 = pandas.DataFrame.from_dict(dict(items), orient='index')

print(df.head())

# need to specify the datatypes, otherwise everything is a string
df2 = df2.astype(float)
df2.index = [int(x) for x in df2.index]


print(df2.head())



# check: the values read back match the originals up to the printed precision
assert (df - df2).abs().max().max() < 1E-6

          0         1         2         3         4
0  0.000996 -0.511832  0.547574 -0.185335  2.241652
1  0.030817 -0.748530  1.863612 -0.621193  0.556809
2 -1.103929  0.160402  0.889361  2.086914 -0.368338
3  2.733203  0.391572 -0.496395  0.896599 -0.323682
4  1.190319 -1.401182  0.822401  0.412505 -1.683764
          0         1         2         3         4
0  0.000996 -0.511832  0.547574 -0.185335  2.241652
1  0.030817 -0.748530  1.863612 -0.621193  0.556809
2 -1.103929  0.160402  0.889361  2.086914 -0.368338
3  2.733203  0.391572 -0.496395  0.896599 -0.323682
4  1.190319 -1.401182  0.822401  0.412505 -1.683764


  app.launch_new_instance()


In [23]:
file_

<tempfile._TemporaryFileWrapper at 0x121c28710>

In [37]:
# Open the sample file just long enough to look at the handle's type; the
# with-block closes it again so no file handle is left open in the kernel.
with open(test_file, 'r') as reader:
    reader_type = type(reader)
reader_type

_io.TextIOWrapper

In [32]:
# Scratch check: range(10) yields 0 through 9.
for i in range(10):
    print(i)

0
1
2
3
4
5
6
7
8
9


In [30]:
!cat $file_.name

           0         1         2         3         4
0   1.542481 -0.381571  0.396385 -0.805327 -2.859339
1   0.413153  0.471801 -0.096420 -0.048273  1.397348
2  -0.689513  1.974896 -1.573189 -0.675019  0.953947
3  -1.430628 -0.845663 -0.806684  0.520087 -0.127139
4   0.645987  0.848140  0.107414 -1.447902 -0.598712
5   0.013328 -0.857769  0.262768 -0.202442  1.705372
6  -0.001290 -1.096686 -0.748881 -0.386024  0.984038
7   0.911516  0.110086  0.063316 -0.588026 -0.597471
8  -0.475004 -0.200050  0.345783 -0.368519  0.661583
9  -0.712458  0.603031  0.786267  0.278901  1.188164
10  0.796933 -1.472363  1.074477 -1.118687  0.133539
11 -0.546450  0.656727  0.982765  0.514329  1.136790
12  0.267614 -1.559825  0.588456 -1.269471  0.055710
13  0.955381 -1.606156  0.116972  0.449469  0.149579
14 -0.284648  0.659659  0.155021  1.498994 -1.295744
15  1.142939 -1.243861 -0.208409 -1.637410  0.551201
16  0.912642  0.553739  1.001158  1.199589 -1.398569
17 -1.172962  0.617984 -1.270453  1.016715  0.