# Data Import & Web Scrape


## Basics

In [10]:
file = open('../dataset/medals.csv', 'r')
display(file.read())
display(file)
file.close()
file.closed

'country,Bronze,Gold,Silver\nUnited States,67,137,52\nGermany,67,47,43\nGreat Britain,26,64,55\nRussia,35,50,28\nChina,35,44,30\nFrance,21,20,55\nAustralia,25,23,34\nItaly,24,8,38\nCanada,61,4,4\nJapan,34,17,13\n'

<_io.TextIOWrapper name='../dataset/medals.csv' mode='r' encoding='UTF-8'>

True

In [9]:
# with is a context manager
with open('../dataset/medals.csv', 'r') as file:
    display(file.readline())
    display(file.readline())
    display(file.readline())

display(file)
    
file.closed

'country,Bronze,Gold,Silver\n'

'United States,67,137,52\n'

'Germany,67,47,43\n'

<_io.TextIOWrapper name='../dataset/medals.csv' mode='r' encoding='UTF-8'>

True

## np.loadtxt()

Can import multiple rows and cols, but numpy array can only take one datatype

In [18]:
import numpy as np

filepath=''
skip=0
cols = []

# delimiter can be '\t', ' ', or ','
try:
    np.loadtxt(filepath, delimiter='', skiprows= skip, usecols=cols, dtype = str)
except:
    pass

## np.genfromtxt()

Can import multiple datatypes into a 1D array

In [20]:
# names indicate whether has header row
try:
    np.genfromtxt(filepath, delimiter='', names=True, dtype=None)
except:
    pass

## np.recfromcsv()

Similar to genfromtxt() with default `dtype=None`

In [21]:
try:
    np.genfromtxt(filepath, delimiter='', names=True)
except:
    pass

## pd.read_csv()

In [23]:
import pandas as pd

# nrows defines how many rows to read from the file
# header=None indicate no header row in the file
# comment='#' indicate comment identifiers
# na_values='NaN' indicate Null value identifiers
try:
    pd.read_csv(filepath, nrows=5, header=None, sep='', comment='#', na_values='NaN')
except:
    pass

## pd.ExcelFile()

In [28]:
try:
    xls = pd.ExcelFile(filepath)
    display(xls.sheet_names)
    
    # parse by worksheet index
    # usecols indicate which columns to be parsed
    # names is a list of column names
    df1 = xls.parse(0, usecols=[], skiprows=skip, names=[])
    
    #parse by worksheet name
    df2 = xls.parse(sheet_name)
except:
    pass

In [None]:
try:
    # set sheet_name = None to read all worksheets
    # this returns a dictionary of dataframes, where key is worksheet name
    # value is worksheet content
    xls = pd.read_excel(filepath, sheet_name = None)
    

## pickle

In [26]:
import pickle

try:
    with open(filepath, 'rb') as file:
        pickle.load(file)
except:
    pass

## SAS & Stata

In [30]:
# from sas7bdat import SAS7BDAT

try:
    with SAS7BDAT(filepaht) as file:
        df_sas = file.to_data_frame()
except:
    pass

In [None]:
try:
    df_stata = pd.read_stata(filepath)
except:
    pass

## HDF5

In [32]:
import h5py

try:
    h5py_data = h5py.File(filepath, 'r')
except:
    pass

## MATLAB

In [34]:
import scipy.io

try:
    mat = scipy.io.loadmat(filepath)
except:
    pass

## Relational Database


In [49]:
# Initialize

from sqlalchemy import create_engine

engine = create_engine('sqlite:///Northwind.sqlite')

In [47]:
# Method 1 with full steps
try:
    con = engine.connect()

    rs = con.execute("SELECT * FROM Orders")
    df = pd.DataFrame(rs.fetchall())
    df.columns = rs.keys()

    con.close()
except:
    pass

In [48]:
# Method 2 with context manager
try:
    with engine.connect() as con:
        rs = con.execute("SELECT * FROM Orders")
        df = pd.DataFrame(rs.fetchmany(size=5))
        df.columns = rs.keys()

except:
    pass

In [None]:
# Method 3 using pd.read_sql_query(query, engine)

try:
    df = pd.read_sql_query("SELECT * FROM Orders", engine)

## Data Scrape


### Request

In [50]:
from urllib.request import urlretrieve

url = ''
filename = ''

try:
    urlretrieve(url, filename)
    df = pd.read_csv(filename)
except:
    pass

In [None]:
from urllib.request import urlopen, Request

url = ''

try:
    request = Request(url)
    response = urlopen(request)
    html = response.read()
    response.close()
except:
    pass

In [None]:
import requests

url = ''

try:
    r = request.get(url)
    text = r.text
except:
    pass

In [None]:
from bs4 import BeautifulSoup
import requests

url = ''
try:
    r = requests.get(url)
    html = r.text
    
    soup = BeautifulSoup(html)
    pretty_soup = soup.prettify()
    
    print(soup.title)
    print(soup.get_text())
    
except:
    pass

### API & JSON


In [None]:
import json

try:
    with open(filepath, 'r') as json_file:
        # json.load() returns a dictionary object
        json_data = json.load(json_file)

In [55]:
import requests

http_prefix = 'http://'
api = 'www.omdbapi.com'
query = '?apikey=72bc447a&t=hackers'

url = http_prefix+api+query

try:
    r = requests.get(url)
    json_data = r.json()
    print(json_data.keys())
except:
    pass


dict_keys(['Title', 'Year', 'Rated', 'Released', 'Runtime', 'Genre', 'Director', 'Writer', 'Actors', 'Plot', 'Language', 'Country', 'Awards', 'Poster', 'Ratings', 'Metascore', 'imdbRating', 'imdbVotes', 'imdbID', 'Type', 'DVD', 'BoxOffice', 'Production', 'Website', 'Response'])
