In [1]:
#In a nutshell, genfromtxt runs two main loops. The first loop converts each line of the file into a 
#sequence of strings. The second loop converts each string to the appropriate data type. 

In [2]:
import numpy as np
from io import StringIO

In [3]:
# Simplest case: comma-separated values, split on the "," character.
data = "\n".join(["1, 2, 3", "4, 5, 6"])
np.genfromtxt(StringIO(data), delimiter=",")

array([[1., 2., 3.],
       [4., 5., 6.]])

In [5]:
# Fixed-width input: when every field is at most N characters wide (3 here),
# pass that integer as `delimiter`; a sequence of integers is also accepted.
data = "\n".join(["  1  2  3", "  4  5 67", "890123  4"])
np.genfromtxt(StringIO(data), delimiter=3)

array([[  1.,   2.,   3.],
       [  4.,   5.,  67.],
       [890., 123.,   4.]])

In [6]:
# Fixed-width fields of unequal size: the tuple gives the width of each
# column in characters (4, then 3, then 2).
data = "\n".join(["123456789", "   4  7 9", "   4567 9"])
np.genfromtxt(
    StringIO(data),
    delimiter=(4, 3, 2),
)

array([[1234.,  567.,   89.],
       [   4.,    7.,    9.],
       [   4.,  567.,    9.]])

In [7]:
# Lines are split exactly at the delimiter, so by default the individual
# entries keep their leading and trailing white space. Setting the optional
# `autostrip` argument to True changes that. Here it is left at its default,
# so the padding around "abc" is preserved.
data = "\n".join(["1, abc , 2", " 3, xxx, 4"])
np.genfromtxt(
    StringIO(data),
    dtype="|U5",
    delimiter=",",
)

array([['1', ' abc ', ' 2'],
       ['3', ' xxx', ' 4']], dtype='<U5')

In [8]:
# autostrip=True trims the white space around every entry.
np.genfromtxt(
    StringIO(data),
    autostrip=True,
    dtype="|U5",
    delimiter=",",
)

array([['1', 'abc', '2'],
       ['3', 'xxx', '4']], dtype='<U5')

In [9]:
# Everything from the `comments` marker to the end of a line is discarded, so
# full-line comments disappear and trailing comments are cut off.
# NOTE: the text must not contain doctest continuation prompts ("... ").
# IPython strips such prompts from cell input, but plain Python keeps them in
# the string, where they corrupt the rows and make the parse fail.
data = """#
# Skip me !
# Skip me too !
1, 2
3, 4
5, 6 #This is the third line of the data
7, 8
# And here comes the last line
9, 0
"""
np.genfromtxt(StringIO(data), comments="#", delimiter=",")  # comments are ignored

array([[1., 2.],
       [3., 4.],
       [5., 6.],
       [7., 8.],
       [9., 0.]])

In [10]:
# A header in the file can get in the way of parsing. `skip_header` takes an
# integer: the number of lines dropped from the beginning of the file before
# any other action is performed. Likewise, `skip_footer=n` drops the last n
# lines of the file.
data = "\n".join(map(str, range(10)))
data

'0\n1\n2\n3\n4\n5\n6\n7\n8\n9'

In [11]:
# No skip arguments: every line of `data` is read.
np.genfromtxt(StringIO(data))

array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

In [13]:
# Drop the first 3 and the last 5 lines. With skip_header=0 and skip_footer=0
# no line is skipped.
np.genfromtxt(
    StringIO(data),
    skip_footer=5,
    skip_header=3,
)

array([3., 4.])

In [18]:
# Often only a few columns of the data are of interest. The `usecols`
# argument selects which ones to import: it accepts a single integer or a
# sequence of integers giving the indices of the columns to keep.
data = "\n".join(["1 2 3", "4 5 6"])

In [17]:
# Keep only the first and the last column.
np.genfromtxt(
    StringIO(data),
    usecols=(0, -1),
)

array([[1., 3.],
       [4., 6.]])

In [20]:
# When the columns have names, they can also be selected by name: `usecols`
# accepts the names as a sequence of strings or as one comma-separated string.
data = "\n".join(["1 2 3", "4 5 6"])
np.genfromtxt(
    StringIO(data),
    usecols=("a", "c"),  # which of the named columns to keep
    names="a, b, c",  # one name per column
)

array([(1., 3.), (4., 6.)], dtype=[('a', '<f8'), ('c', '<f8')])

In [23]:
# Tabular data is naturally handled by giving each column a name. A first
# option is an explicit structured dtype with one (name, type) pair per column:
data = StringIO("1 2 3\n 4 5 6")
np.genfromtxt(
    data,
    dtype=[(column, int) for column in "abc"],
)

array([(1, 2, 3), (4, 5, 6)],
      dtype=[('a', '<i8'), ('b', '<i8'), ('c', '<i8')])

In [25]:
# A simpler option is the `names` keyword, given either a sequence of strings
# or a single comma-separated string:
data = StringIO("1 2 3\n 4 5 6")
np.genfromtxt(
    data,
    names="A, B, C",
)

array([(1., 2., 3.), (4., 5., 6.)],
      dtype=[('A', '<f8'), ('B', '<f8'), ('C', '<f8')])

In [27]:
# Column names can also come from the data itself: pass names=True. The names
# are then read from the first line that follows the `skip_header` lines,
# even when that line is commented out:
data = StringIO("So it goes\n#a b c\n1 2 3\n 4 5 6")
np.genfromtxt(
    data,
    names=True,
    skip_header=1,
)

array([(1., 2., 3.), (4., 5., 6.)],
      dtype=[('a', '<f8'), ('b', '<f8'), ('c', '<f8')])

In [28]:
# With names=None and a structured dtype expected, the names fall back to the
# standard NumPy default "f%i", which yields f0, f1 and so on. In other words,
# when no column names are specified, that f%i template is used.
data = StringIO("1 2 3\n 4 5 6")
np.genfromtxt(
    data,
    dtype=(int, float, int),
)

array([(1, 2., 3), (4, 5., 6)],
      dtype=[('f0', '<i8'), ('f1', '<f8'), ('f2', '<i8')])

In [29]:
# The default name template can be changed with the `defaultfmt` argument.
data = StringIO("1 2 3\n 4 5 6")
np.genfromtxt(
    data,
    defaultfmt="var_%02i",
    dtype=(int, float, int),
)

array([(1, 2., 3), (4, 5., 6)],
      dtype=[('var_00', '<i8'), ('var_01', '<f8'), ('var_02', '<i8')])

In [30]:
# A dtype is usually enough to define how the strings are converted, but
# sometimes more control is needed: for example turning a YYYY/MM/DD date into
# a datetime object, or a string such as "xx%" into a float between 0 and 1.
# The `converters` argument covers those cases.
#
# Its value is typically a dictionary with column indices or column names as
# keys and conversion functions (regular or lambda) as values. Each function
# receives a single field and returns a single element of the wanted type.


def convertfunc(x):
    """Convert a percentage field such as ' 2.3%' to a fraction (0.023).

    genfromtxt passes converters either bytes or str depending on its
    `encoding` setting (the default changed in NumPy 2.0), so both input
    types are accepted here.
    """
    if isinstance(x, bytes):
        x = x.decode()
    return float(x.strip("%")) / 100.


data = "1, 2.3%, 45.\n6, 78.9%, 0"
names = ("i", "p", "n")
# No converter is passed yet: the "p" column cannot be parsed as a float and
# comes out as nan.
np.genfromtxt(StringIO(data), delimiter=",", names=names)

array([(1., nan, 45.), (6., nan,  0.)],
      dtype=[('i', '<f8'), ('p', '<f8'), ('n', '<f8')])

In [31]:
# Keep in mind that dtype=float is the default, so a float is expected in the
# second column. The strings ' 2.3%' and ' 78.9%' cannot be converted to
# float, which is why np.nan appeared. Passing a converter for that column
# fixes it:
np.genfromtxt(
    StringIO(data),
    converters={1: convertfunc},
    names=names,
    delimiter=",",
)

array([(1., 0.023, 45.), (6., 0.789,  0.)],
      dtype=[('i', '<f8'), ('p', '<f8'), ('n', '<f8')])

In [35]:
# Some entries may be missing from the dataset being imported. A user-defined
# converter can map such entries to a value, but converters quickly become
# cumbersome to manage.
#
# genfromtxt provides two complementary arguments instead: `missing_values`
# identifies which strings mark missing data, and `filling_values` says what
# to put in their place.
#
# In this example the missing values are flagged with "N/A" in the first
# column and with "???" in the third column. They are replaced by 0 when they
# occur in the first or second column and by -999 in the last column.
data = "N/A, 2, 3\n4, ,???"
kwargs = {
    "delimiter": ",",
    "dtype": int,
    "names": "a,b,c",
    "missing_values": {0: "N/A", "b": " ", 2: "???"},
    "filling_values": {0: 0, "b": 0, 2: -999},
}
np.genfromtxt(StringIO(data), **kwargs)

array([(0, 2,    3), (4, 0, -999)],
      dtype=[('a', '<i8'), ('b', '<i8'), ('c', '<i8')])