# 1. Flat files

## Importing entire text files

In [None]:
# Open a file for reading: file
file = open('moby_dick.txt', mode='r')

try:
    # Print the whole text
    print(file.read())

    # Check whether file is closed (still open at this point)
    print(file.closed)
finally:
    # Close file, even if reading or printing raised an error,
    # so the handle is never leaked
    file.close()

# Check whether file is closed (now it is)
print(file.closed)

## Importing text files line by line


In [None]:
# Read & print the first 3 lines
## The with context manager binds the open file to the variable file and
## closes it automatically when the block ends.
with open('moby_dick.txt') as file:
    # Each readline() call consumes the next line of the file
    for line_number in range(3):
        print(file.readline())

## Definition of flat files (vs. relational files)

Flat files consist of rows and each row is called a record.

Flat files do not consist of multiple tables with structured relationships between them (that describes a relational database); a flat file holds a single table.

A record in a flat file is composed of fields or attributes, each of which contains at most one item of information.

## Using NumPy to import flat files

In [None]:
# Import packages
## matplotlib.pyplot is imported here because this cell is the first one to
## use plt; without it the plot calls below raise a NameError on a fresh run.
import matplotlib.pyplot as plt
import numpy as np

# Assign filename to variable: file
file = 'digits.csv'

# Load file as array: digits
digits = np.loadtxt(file, delimiter=',')

# Print datatype of digits
print(type(digits))

# Select and reshape a row
## Take row 21 without its first column (presumably the label -- confirm
## against the data file), then fold the remaining values into a 28 x 28 grid.
im = digits[21, 1:]
im_sq = np.reshape(im, (28, 28))

# Plot reshaped data
plt.imshow(im_sq, cmap='Greys', interpolation='nearest')
plt.show()

### Customizing your NumPy import


In [None]:
# NumPy supplies loadtxt
import numpy as np

# Assign the filename: file
file = 'digits_header.txt'

# Load the data: data
## skiprows is a count of rows to skip (1 means "skip the first row"),
## whereas usecols takes zero-based column indices (0 and 2 select the
## first and third columns).
data = np.loadtxt(
    file,
    delimiter='\t',
    skiprows=1,
    usecols=[0, 2],
)

# Show the loaded array
print(data)


### Importing different datatypes


In [None]:
# Import matplotlib.pyplot as plt
## Imported in this cell because the scatter plot below needs plt and no
## earlier cell is guaranteed to have loaded it.
import matplotlib.pyplot as plt

# Assign filename: file
file = 'seaslug.txt'

# Import file: data
## dtype=str keeps every field as text, so the first row is read as strings
## too instead of failing float conversion.
data = np.loadtxt(file, delimiter='\t', dtype=str)

# Print the first element of data
print(data[0])

# Import data as floats and skip the first row: data_float
data_float = np.loadtxt(file, delimiter='\t', dtype=float, skiprows=1)

# Print the 10th element of data_float (index 9, because indexing starts at 0)
print(data_float[9])

# Plot a scatterplot of the data: first column on x, second column on y
plt.scatter(data_float[:, 0], data_float[:, 1])
plt.xlabel('time (min.)')
plt.ylabel('percentage of larvae')
plt.show()

### Working with mixed datatypes (1)


Much of the time you will need to import datasets which have different datatypes in different columns; one column may contain strings and another floats, for example. The function np.loadtxt() cannot handle such data and will raise an error. There is another function, ***np.genfromtxt()***, which can handle such structures. If we pass dtype=None to it, it will figure out what type each column should be.

Import 'titanic.csv' using the function np.genfromtxt() as follows:

data = np.genfromtxt('titanic.csv', delimiter=',', names=True, dtype=None)

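You have just used np.genfromtxt() to import data containing mixed datatypes. There is also another function, ***np.recfromcsv()***, that behaves similarly to np.genfromtxt(), except that its default dtype is None. (Note: np.recfromcsv() was removed from the main NumPy namespace in NumPy 2.0; on recent versions, use np.genfromtxt() with delimiter=',' instead.)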

In [None]:
# Assign the filename: file
file = 'titanic.csv'

# Import file as a record array: d
## np.recfromcsv() was removed from the main namespace in NumPy 2.0, so call
## np.genfromtxt() with the defaults that recfromcsv applied (delimiter=',',
## names=True, dtype=None, lower-cased field names) and view the result as a
## record array. encoding=None makes text columns come back as str on every
## NumPy version.
d = np.genfromtxt(
    file,
    delimiter=',',
    names=True,
    dtype=None,
    case_sensitive='lower',
    encoding=None,
).view(np.recarray)

# Print out first three entries of d
print(d[:3])


## Using pandas to import flat files as DataFrames (1)


In [None]:
# pandas supplies read_csv and the DataFrame type
import pandas as pd

# Assign the filename: file
file = 'titanic.csv'

# Read the file into a DataFrame: df
## read_csv uses a comma separator by default.
df = pd.read_csv(file)

# View the head of the DataFrame (its first five rows)
print(df.head(n=5))

In [None]:
# Assign the filename: file
file = 'digits.csv'

# Read the first 5 rows of the file into a DataFrame: data
## header=None tells pandas the file has no header row, so the first line is
## kept as data instead of being used for column names.
data = pd.read_csv(file, nrows=5, header=None)

# Build a numpy array from the DataFrame: data_array
## to_numpy() is the accessor the pandas documentation recommends over the
## older .values attribute.
data_array = data.to_numpy()

# Print the datatype of data_array to the shell
print(type(data_array))

### Customizing your pandas import


In [None]:
# Bring in the plotting interface as plt
import matplotlib.pyplot as plt

# Assign filename: file
file = 'titanic_corrupt.txt'

# Import file: data
## sep='\t'             -> fields are tab-separated
## comment='#'          -> anything after a '#' on a line is ignored
## na_values='Nothing'  -> the string 'Nothing' is read as a missing value
data = pd.read_csv(
    file,
    sep='\t',
    comment='#',
    na_values='Nothing',
)

# Print the head of the DataFrame
print(data.head())

# Plot 'Age' variable in a histogram
data[['Age']].hist()
plt.xlabel('Age (years)')
plt.ylabel('count')
plt.show()