# 6 Importing and Saving Data with NumPy
## 6_3 Strings vs Objects vs Numbers

- Data: all columns from LoanID to TotalPrice (LT).

- numpy.genfromtxt(fname, dtype=<class 'float'>, comments='#', delimiter=None, skip_header=0, skip_footer=0, converters=None, missing_values=None, filling_values=None, usecols=None, names=None, excludelist=None, deletechars=" !#$%&'()*+, -./:;<=>?@[\\]^{|}~", replace_space='_', autostrip=False, case_sensitive=True, defaultfmt='f%i', unpack=None, usemask=False, loose=True, invalid_raise=True, max_rows=None, encoding=None, *, ndmin=0, like=None)

- skip_header, skip_footer, usecols, unpack

In [2]:
import numpy as np
np.__version__

'1.26.2'

In [10]:
lending_co_lt = np.genfromtxt('Lending-co-LT.csv', delimiter=',')

print(lending_co_lt.shape, lending_co_lt.ndim,
      lending_co_lt.size, lending_co_lt.dtype)

print(lending_co_lt)


(1044, 7) 2 7308 float64
[[      nan       nan       nan ...       nan       nan       nan]
 [1.000e+00       nan       nan ...       nan       nan 1.660e+04]
 [2.000e+00       nan       nan ...       nan       nan 1.660e+04]
 ...
 [1.041e+03       nan       nan ...       nan       nan 1.660e+04]
 [1.042e+03       nan       nan ...       nan       nan 1.560e+04]
 [1.043e+03       nan       nan ...       nan       nan 1.660e+04]]


In [14]:
# 2. Importing telling data is Integer 32bits
lending_co_lt = np.genfromtxt('Lending-co-LT.csv',
                              delimiter=',',
                              dtype=np.int32)

print(lending_co_lt.shape, lending_co_lt.ndim,
      lending_co_lt.size, lending_co_lt.dtype)

lending_co_lt
# All numbers are integers without scientific notation
# All NANs appears as -1

(1044, 7) 2 7308 int32


array([[   -1,    -1,    -1, ...,    -1,    -1,    -1],
       [    1,    -1,    -1, ...,    -1,    -1, 16600],
       [    2,    -1,    -1, ...,    -1,    -1, 16600],
       ...,
       [ 1041,    -1,    -1, ...,    -1,    -1, 16600],
       [ 1042,    -1,    -1, ...,    -1,    -1, 15600],
       [ 1043,    -1,    -1, ...,    -1,    -1, 16600]])

In [13]:
# All NANs appears as -1
lending_co_lt[0,0] + lending_co_lt[0,1]

-2

In [16]:
# 3. Importing telling data is String
lending_co_lt = np.genfromtxt('Lending-co-LT.csv',
                              delimiter=',',
                              dtype=str)

print(lending_co_lt.shape, lending_co_lt.ndim,
      lending_co_lt.size, lending_co_lt.dtype)

print(lending_co_lt)
# We cannot do Mathematical Operations

(1044, 7) 2 7308 <U14
[['LoanID' 'StringID' 'Product' ... 'Location' 'Region' 'TotalPrice']
 ['1' 'id_1' 'Product B' ... 'Location 2' 'Region 2' '16600.0']
 ['2' 'id_2' 'Product B' ... 'Location 3' '' '16600.0']
 ...
 ['1041' 'id_1041' 'Product B' ... 'Location 23' 'Region 4' '16600.0']
 ['1042' 'id_1042' 'Product C' ... 'Location 52' 'Region 6' '15600.0']
 ['1043' 'id_1043' 'Product B' ... 'Location 142' 'Region 6' '16600.0']]


In [21]:
# 4. Importing the information as objects.
lending_co_lt = np.genfromtxt('Lending-co-LT.csv',
                              delimiter=',',
                              dtype=object)

print(lending_co_lt.shape, lending_co_lt.ndim,
      lending_co_lt.size, lending_co_lt.dtype)

lending_co_lt
# The data inside is not plain text we can't freely manipulate the values.
# ONLY for backward compatibility

(1044, 7) 2 7308 object


array([[b'LoanID', b'StringID', b'Product', ..., b'Location', b'Region',
        b'TotalPrice'],
       [b'1', b'id_1', b'Product B', ..., b'Location 2', b'Region 2',
        b'16600.0'],
       [b'2', b'id_2', b'Product B', ..., b'Location 3', b'', b'16600.0'],
       ...,
       [b'1041', b'id_1041', b'Product B', ..., b'Location 23',
        b'Region 4', b'16600.0'],
       [b'1042', b'id_1042', b'Product C', ..., b'Location 52',
        b'Region 6', b'15600.0'],
       [b'1043', b'id_1043', b'Product B', ..., b'Location 142',
        b'Region 6', b'16600.0']], dtype=object)

In [22]:
# 5. Import as an array of multiply types
lending_co_lt = np.genfromtxt('Lending-co-LT.csv',
                              delimiter=',',
                              dtype=(np.int32, str, str, str,
                                     str, str, np.int32))

print(lending_co_lt.shape, lending_co_lt.ndim,
      lending_co_lt.size, lending_co_lt.dtype)

lending_co_lt
# Arrays usually consist of a single numeric datatype
# We should avoid specifying various datatypes when working with NumPy

(1044,) 1 1044 [('f0', '<i4'), ('f1', '<U'), ('f2', '<U'), ('f3', '<U'), ('f4', '<U'), ('f5', '<U'), ('f6', '<i4')]


array([(  -1, '', '', '', '', '',    -1),
       (   1, '', '', '', '', '', 16600),
       (   2, '', '', '', '', '', 16600), ...,
       (1041, '', '', '', '', '', 16600),
       (1042, '', '', '', '', '', 15600),
       (1043, '', '', '', '', '', 16600)],
      dtype=[('f0', '<i4'), ('f1', '<U'), ('f2', '<U'), ('f3', '<U'), ('f4', '<U'), ('f5', '<U'), ('f6', '<i4')])