# Data Import & Web Scrape


## Basics

### open()

```python
# Open the file manually in read-only text mode.
file = open('../dataset/medals.csv', 'r')
# read() returns everything from the current position to the end as one string.
display(file.read())
# Show the file object itself.
display(file)
# A manually opened file has to be closed explicitly.
file.close()
# Last expression of the cell: the file object's `closed` flag.
file.closed
```

### `with open()` is a context manager

```python
# Open the file through a with-statement; the handle is bound to `file`.
with open('../dataset/medals.csv', 'r') as file:
    # Each readline() call returns the next line, so these show the first three.
    display(file.readline())
    display(file.readline())
    display(file.readline())

# `file` is still bound after the block has exited.
display(file)
    
# Last expression of the cell: the file object's `closed` flag.
file.closed
```

## np.loadtxt()

Can import multiple rows and columns, but a NumPy array can only hold one datatype.


```python
import numpy as np

# Placeholders: point these at a real file before running the cells below.
filepath = ''
skip = 0      # number of leading rows to skip, e.g. 1 to skip a header row
cols = None   # column indices to keep, e.g. [0, 2]; None keeps every column

# delimiter can be '\t', ' ', or ','
try:
    # dtype=str because a NumPy array holds a single datatype.
    np.loadtxt(filepath, delimiter=',', skiprows=skip, usecols=cols, dtype=str)
except (OSError, ValueError) as err:
    # Report a missing/unreadable file or unparsable content instead of
    # silently swallowing every error.
    print(f'np.loadtxt failed: {err}')
```

## np.genfromtxt()

- Can import columns of different datatypes into a 1D (structured) array

- `names=True` indicates that the file has a header row

```python
try:
    # names=True takes the field names from the header row;
    # dtype=None lets genfromtxt choose a type for each column.
    # Adjust the delimiter to match the file.
    np.genfromtxt(filepath, delimiter=',', names=True, dtype=None)
except (OSError, ValueError) as err:
    # Report a missing/unreadable file or inconsistent rows instead of
    # silently swallowing every error.
    print(f'np.genfromtxt failed: {err}')
```

## np.recfromcsv()

Similar to `genfromtxt()`, but with CSV-friendly defaults such as `delimiter=','`, `names=True` and `dtype=None`.


```python
try:
    # np.recfromcsv() is genfromtxt() with CSV defaults; they are spelled out
    # here (comma delimiter, header row as names, dtype=None) because
    # recfromcsv itself is deprecated as of NumPy 2.0.
    np.genfromtxt(filepath, delimiter=',', names=True, dtype=None)
except (OSError, ValueError) as err:
    # Report a missing/unreadable file or inconsistent rows instead of
    # silently swallowing every error.
    print(f'np.genfromtxt failed: {err}')
```

## pd.read_csv()


```python
import pandas as pd

# nrows: how many rows to read from the file
# header=None: the file has no header row
# sep: field separator (adjust to match the file)
# comment='#': everything after this character on a line is ignored
# na_values='NaN': additional string to treat as a missing value
# parse_dates: columns to parse as dates (empty list = none)
try:
    pd.read_csv(filepath, nrows=5, header=None, sep=',', comment='#',
                na_values='NaN', parse_dates=[])
except (OSError, ValueError) as err:
    # OSError covers a missing/unreadable file; pandas' parser errors derive
    # from ValueError. Report them instead of silently swallowing everything.
    print(f'pd.read_csv failed: {err}')

```

## pd.ExcelFile()
```python
try:
    # The with-statement closes the workbook once parsing is finished.
    with pd.ExcelFile(filepath) as xls:
        display(xls.sheet_names)

        # parse by worksheet index
        # usecols: which columns to parse, e.g. [0, 2]; None parses all of them
        # names: list of column names to use; None keeps the names from the sheet
        df1 = xls.parse(0, usecols=None, skiprows=skip, names=None)

        # parse by worksheet name (here: the name of the first worksheet)
        sheet_name = xls.sheet_names[0]
        df2 = xls.parse(sheet_name)
except (OSError, ValueError) as err:
    # Report a missing or unreadable workbook instead of failing silently.
    print(f'Reading the Excel file failed: {err}')
```

## pd.read_excel()

```python
# sheet_name can be set to:
# an index,
# a str with a worksheet name,
# a list of indexes/names: returns a dict whose keys are the requested
#   sheets and whose values are pd.DataFrame objects,
# None: returns such a dict for every worksheet
try:
    # na_values: additional string(s) to treat as missing values
    pd.read_excel(filepath, sheet_name=0, na_values='NaN')
except (OSError, ValueError) as err:
    # Report a missing or unreadable workbook instead of failing silently.
    print(f'pd.read_excel failed: {err}')
```

## pickle


```python
import pickle

# WARNING: unpickling can execute arbitrary code. Only load pickle files that
# come from a source you trust.
try:
    # Pickle files are binary, hence mode 'rb'.
    with open(filepath, 'rb') as file:
        pickle.load(file)
except (OSError, pickle.UnpicklingError, EOFError) as err:
    # Report a missing, truncated or corrupt file instead of silently
    # swallowing every error.
    print(f'Loading the pickle file failed: {err}')
```

## SAS & Stata

Reading SAS files requires the third-party `sas7bdat` package: `from sas7bdat import SAS7BDAT`

```python
# SAS7BDAT comes from the third-party sas7bdat package.
from sas7bdat import SAS7BDAT

try:
    # The with-statement closes the SAS file once it has been converted.
    with SAS7BDAT(filepath) as file:
        df_sas = file.to_data_frame()
except OSError as err:
    # Only a missing/unreadable file is handled here; a bare `except` would
    # also hide programming errors such as a misspelled variable name.
    print(f'Reading the SAS file failed: {err}')
```


```python
try:
    # Read a Stata .dta file into a DataFrame.
    df_stata = pd.read_stata(filepath)
except (OSError, ValueError) as err:
    # Report a missing file or an unsupported/invalid .dta file instead of
    # silently swallowing every error.
    print(f'pd.read_stata failed: {err}')
```

## HDF5


```python
import h5py

try:
    # Open read-only. The handle is deliberately left open so that later
    # cells can read from it; call h5py_data.close() when finished.
    h5py_data = h5py.File(filepath, 'r')
except OSError as err:
    # Report a missing or unreadable file instead of silently swallowing
    # every error.
    print(f'Opening the HDF5 file failed: {err}')
```

## MATLAB


```python
import scipy.io

try:
    # loadmat() returns a dict that maps MATLAB variable names to their values.
    mat = scipy.io.loadmat(filepath)
except (OSError, ValueError, NotImplementedError) as err:
    # OSError: missing/unreadable file; ValueError: not a recognised .mat
    # file; NotImplementedError: v7.3 files, which are HDF5 and need an HDF5
    # reader such as h5py.
    print(f'scipy.io.loadmat failed: {err}')
```

## Relational Database


### Initialize


```python
from sqlalchemy import create_engine

# Create the engine for the SQLite database file 'Northwind.sqlite';
# the following cells obtain their connections from it.
engine = create_engine('sqlite:///Northwind.sqlite')
```

### Method 1 with full steps

```python
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError

try:
    con = engine.connect()
    try:
        # text() wraps the raw SQL string; SQLAlchemy 2.x no longer accepts
        # a plain str in Connection.execute().
        rs = con.execute(text("SELECT * FROM Orders"))
        # Passing the column names up front also works for an empty result,
        # where assigning df.columns afterwards would raise a length mismatch.
        df = pd.DataFrame(rs.fetchall(), columns=list(rs.keys()))
    finally:
        # Close the connection even when the query fails.
        con.close()
except SQLAlchemyError as err:
    # Report connection and query errors instead of silently swallowing them.
    print(f'Query failed: {err}')

```

### Method 2 with context manager

```python
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError

try:
    # The with-statement closes the connection when the block exits.
    with engine.connect() as con:
        # text() wraps the raw SQL string; SQLAlchemy 2.x no longer accepts
        # a plain str in Connection.execute().
        rs = con.execute(text("SELECT * FROM Orders"))
        # fetchmany(size=5) returns at most the next five rows. Passing the
        # column names up front also works when no rows come back.
        df = pd.DataFrame(rs.fetchmany(size=5), columns=list(rs.keys()))
except SQLAlchemyError as err:
    # Report connection and query errors instead of silently swallowing them.
    print(f'Query failed: {err}')
```

### Method 3 using `pd.read_sql_query(query, engine)`


```python
try:
    # pandas opens the connection, runs the query and builds the DataFrame.
    df = pd.read_sql_query("SELECT * FROM Orders", engine)
except Exception as err:
    # Cell-level boundary: depending on the installed versions the failure
    # surfaces as a pandas or an SQLAlchemy error, so catch broadly here but
    # report it instead of hiding it.
    print(f'pd.read_sql_query failed: {err!r}')
```

## Data Scrape


### Request


```python
from urllib.request import urlretrieve

# Placeholders: the URL to download and the local file name to save it under.
url = ''
filename = ''

try:
    # Download the remote file to disk, then load the local copy.
    urlretrieve(url, filename)
    df = pd.read_csv(filename)
except (OSError, ValueError) as err:
    # URLError and file errors derive from OSError; an invalid URL and pandas'
    # parser errors are ValueErrors. Report them instead of hiding them.
    print(f'Download or import failed: {err}')
```


```python
from urllib.request import urlopen, Request

# Placeholder: the URL to fetch.
url = ''

try:
    request = Request(url)
    # The with-statement closes the response even if read() fails.
    with urlopen(request) as response:
        html = response.read()
except (OSError, ValueError) as err:
    # URLError derives from OSError; an invalid URL raises ValueError.
    # Report them instead of silently swallowing every error.
    print(f'Request failed: {err}')
```


```python
import requests

# Placeholder: the URL to fetch.
url = ''

try:
    # Without a timeout, requests can wait indefinitely for a response.
    r = requests.get(url, timeout=10)
    text = r.text
except requests.RequestException as err:
    # Report request failures (invalid URL, connection error, timeout)
    # instead of silently swallowing every error.
    print(f'Request failed: {err}')
```

### BeautifulSoup

```python
from bs4 import BeautifulSoup
import requests

# Placeholder: the URL of the page to parse.
url = ''
try:
    # Without a timeout, requests can wait indefinitely for a response.
    r = requests.get(url, timeout=10)
    html = r.text

    # Name the parser explicitly; otherwise BeautifulSoup guesses one, warns
    # about it, and the result can differ between environments.
    soup = BeautifulSoup(html, 'html.parser')
    pretty_soup = soup.prettify()

    print(soup.title)
    print(soup.get_text())

except requests.RequestException as err:
    # Report request failures (invalid URL, connection error, timeout)
    # instead of silently swallowing every error.
    print(f'Request failed: {err}')

```

### API & JSON



```python
import json

try:
    with open(filepath, 'r') as json_file:
        # json.load() returns the Python equivalent of the top-level JSON
        # value: a dict for a JSON object, a list for a JSON array, etc.
        json_data = json.load(json_file)
except (OSError, json.JSONDecodeError) as err:
    # Report a missing/unreadable file or invalid JSON.
    print(f'Loading the JSON file failed: {err}')
```


```python
import requests

# NOTE(review): the API key is hard-coded in the query string. Prefer reading
# it from an environment variable or a config file kept out of version control.
http_prefix = 'http://'
api = 'www.omdbapi.com'
query = '?apikey=72bc447a&t=hackers'

url = http_prefix + api + query

try:
    # Without a timeout, requests can wait indefinitely for a response.
    r = requests.get(url, timeout=10)
    # Decode the JSON body of the response.
    json_data = r.json()
    print(json_data.keys())
except (requests.RequestException, ValueError) as err:
    # RequestException: connection problems and timeouts; ValueError: the
    # body is not valid JSON. Report them instead of hiding them.
    print(f'API request failed: {err}')


```