# Import flat files with Pandas
- handles 2D labeled data structures
- columns of different types
- manipulate, slice, reshape, groupby, join, merge
- uses: EDA, data wrangling, data preprocessing, building models, visualization

In [None]:
# example: read a CSV file into a DataFrame and preview its first rows
import pandas as pd

filename = 'blah.csv'
data = pd.read_csv(filepath_or_buffer=filename)
data.head(n=5)

In [None]:
# convert dataframe to numpy array
# (use a plain variable: `data.array = ...` would set an attribute on the
# DataFrame object instead of creating the array variable)
data_array = data.values


# example
# Assign the filename: file
file = 'digits.csv'

# Read the first 5 rows of the file into a DataFrame: data
# header=None: the file has no header row, so the first line is read as data
data = pd.read_csv(file, nrows=5, header=None)

# Build a numpy array from the DataFrame: data_array
data_array = data.values

# Print the datatype of data_array to the shell
print(type(data_array))

In [None]:
# read-in functions (listed for reference only; each needs a file path argument)
# pd.read_csv(filepath)     # comma-separated by default
# pd.read_table(filepath)   # tab-separated by default

Note that missing values are also commonly referred to as NA or NaN

In [None]:
# example:
# Import titanic_corrupt.txt, a slightly corrupted, tab-delimited copy of the
# Titanic dataset in which comments appear after the character '#'.
#
# comment   : the character after which the rest of a line is a comment ('#' here)
# na_values : string(s) to recognize as NA/NaN (here the string 'Nothing')

# Plotting library used for the histogram below
import matplotlib.pyplot as plt

# Name of the file to load: file
file = 'titanic_corrupt.txt'

# Load the file into a DataFrame: data
data = pd.read_csv(
    file,
    sep='\t',
    comment='#',
    na_values='Nothing',
)

# Show the first rows of the DataFrame
print(data.head())

# Draw a histogram of the 'Age' column
data[['Age']].hist()
plt.xlabel('Age (years)')
plt.ylabel('count')
plt.show()


Note: Wes McKinney (pandas) and Hadley Wickham (R) announced Feather, a fast, language-agnostic file format for data frames.

## Excel spreadsheets

In [None]:
import pandas as pd

# open the Excel workbook
file = 'urbanpop.xlsx'
data = pd.ExcelFile(file)

# list the names of the sheets in the workbook
print(data.sheet_names)

# load a sheet as a DataFrame - 2 ways: by sheet name, or by sheet position
df1 = data.parse(sheet_name='1960-1966')
df2 = data.parse(sheet_name=0)

In [None]:
# customize spreadsheet import with additional args
# args: skiprows, names, and usecols take lists
# (usecols replaces the older parse_cols keyword, which current pandas no longer accepts)

# Open the workbook: xl
# NOTE(review): `xl` was not defined anywhere before this cell; this assumes
# `file` holds the path of the workbook these sheets belong to - confirm
xl = pd.ExcelFile(file)

# Parse the first sheet, skip its first row and rename the columns: df1
df1 = xl.parse(0, skiprows=[0], names=['Country', 'AAM due to War (2002)'])

# Print the head of the DataFrame df1
print(df1.head())

# Parse only the first column of the second sheet and rename the column: df2
df2 = xl.parse(1, usecols=[0], skiprows=[0], names=['Country'])

# Print the head of the DataFrame df2
print(df2.head())

## SAS files
- used in business analytics and biostats
- uses: advanced analytics, multivariate analysis, business intelligence, data management, predictive analytics
- standard for computational analysis
- formats: .SAS7BDAT, .SAS7BCAT

In [None]:
import pandas as pd
from sas7bdat import SAS7BDAT

# open the SAS dataset and convert its contents to a pandas DataFrame;
# the with-block closes the file again afterwards
with SAS7BDAT('urbanpop.sas7bdat') as file:
    df_sas = file.to_data_frame()

In [None]:
# Reader class for .sas7bdat files
from sas7bdat import SAS7BDAT

# Load the SAS file into a DataFrame: df_sas
with SAS7BDAT('sales.sas7bdat') as file:
    df_sas = file.to_data_frame()

# Show the first rows of the DataFrame
print(df_sas.head())

# Draw a histogram of column 'P' (pandas and pyplot already imported)
df_sas[['P']].hist()
plt.ylabel('count')
plt.show()

## Stata files
- used in academic social sciences research like epidemiology
- file extension: .dta

In [None]:
import pandas as pd

# read a Stata (.dta) file straight into a DataFrame
data = pd.read_stata(filepath_or_buffer='urbanpop.dta')

In [None]:
# example
# Import pandas and pyplot (pyplot is needed for the axis labels and plt.show)
import pandas as pd
import matplotlib.pyplot as plt

# Load Stata file into a pandas DataFrame: df
df = pd.read_stata('disarea.dta')

# Print the head of the DataFrame df
print(df.head())

# Plot histogram of one column of the DataFrame
df[['disa10']].hist()
plt.xlabel('Extent of disease')
plt.ylabel('Number of countries')
plt.show()


## HDF5 files
- HDF5 = Hierarchical Data Format version 5
- standard for storing large quantities of numerical data
- size: 100s of GB or terabytes, can scale to exabytes

In [None]:
import h5py

filename = 'LIGO_data.hdf5'
# read file ('r' opens it read-only)
data = h5py.File(filename, 'r')

print(type(data))
# output: <class 'h5py._hl.files.File'>

In [None]:
# structure of HDF5 files
# the top level of the file is a set of named groups
for key in data.keys():
    print(key)
# output: meta
#         quality
#         strain

print(type(data['meta']))
# output: <class 'h5py._hl.group.Group'>

# a group is itself indexed by key, like a dictionary
for key in data['meta'].keys():
    print(key)
# output: Description
#         Detector

# read a dataset's contents with [()]
# (the older Dataset.value attribute was removed in h5py 3.0)
print(data['meta']['Description'][()], data['meta']['Detector'][()])
# output: b'Strain data time series from LIGO' b'H1'

In [None]:
# example: extract data and visualize it
import matplotlib.pyplot as plt
import numpy as np

# Get the HDF5 group: group
group = data['strain']

# Check out keys of group
for key in group.keys():
    print(key)

# Set variable equal to time series data: strain
# ([()] reads the whole dataset; Dataset.value was removed in h5py 3.0)
strain = data['strain']['Strain'][()]

# Set number of time points to sample: num_samples
num_samples = 10000

# Set time vector: num_samples evenly spaced points covering [0, 1)
time = np.arange(0, 1, 1/num_samples)

# Plot the first num_samples points of the strain data against time
plt.plot(time, strain[:num_samples])
plt.xlabel('GPS Time (s)')
plt.ylabel('strain')
plt.show()

## MATLAB files
- MATLAB = Matrix Laboratory
- industry standard in engineering and science
- good with linear algebra and matrix capabilities
- proprietary
- file format: .mat

### .mat files load as dictionaries
- keys = MATLAB variable names
- values = objects assigned to variables

In [None]:
# read .mat files (listed for reference only; needs a file name argument)
# scipy.io.loadmat(file_name)

# write .mat files (listed for reference only; needs a file name and a dict of variables)
# scipy.io.savemat(file_name, mdict)

In [2]:
# example
import scipy.io

filename = 'workspace.mat'
# loadmat returns the file's contents as a dictionary
mat = scipy.io.loadmat(filename)

print(type(mat))
# output: <class 'dict'>

print(type(mat['x']))
# output: <class 'numpy.ndarray'>

In [None]:
# example: unpacking .mat dictionary
import matplotlib.pyplot as plt
import numpy as np
import scipy.io

# Show the keys of the MATLAB dictionary
print(mat.keys())

# Report the type, then the shape, of the value stored under 'CYratioCyt'
print(type(mat['CYratioCyt']))
print(np.shape(mat['CYratioCyt']))

# Take row 25 from column 5 onwards and plot it on a new figure
data = mat['CYratioCyt'][25, 5:]
fig = plt.figure()
plt.plot(data)
plt.xlabel('time (min.)')
plt.ylabel('normalized fluorescence (measure of expression)')
plt.show()

## Pickled files
- pickled objects are serialized (= converted to a bytestream)
- good for dictionaries and lists

In [None]:
# example: load a pickled object from disk
import pickle

# 'rb' opens the file for reading in binary mode
# NOTE: only unpickle files that come from a trusted source
with open('pickled_fruit.pkl', mode='rb') as file:
    data = pickle.load(file)

print(data)

In [None]:
# example
# pickle provides (de)serialization of Python objects
import pickle

# Load the pickled object from data.pkl: d
# mode 'rb' = read-only, binary
# NOTE: only unpickle files that come from a trusted source
with open('data.pkl', mode='rb') as file:
    d = pickle.load(file)

# Show the loaded object
print(d)

# Show which type of object was loaded
print(type(d))

## JSON
- human readable and good for Python dictionaries