# Importing data 


## Reading from plain text files 

### Reading the whole file at once

In [None]:
filename = '../data/plain text with several lines.txt'

# mode='r' opens the file read-only, so it cannot be written to by accident.
# Use mode='w' if you want to write instead.
file = open(filename, mode='r')
try:
    text = file.read()
finally:
    # Close the handle even if reading fails, so it is never leaked.
    file.close()

print(text)

Opening the file inside a context manager (a `with` block) makes the code more concise and less cluttered. Once the block exits, the file is already closed:

In [None]:
# The `with` block closes the file automatically when it exits.
with open(filename, mode='r') as file:
    whole_text = file.read()

print(whole_text)

We can read line by line:

In [None]:
# Each readline() call returns the next line of the file.
with open(filename, mode='r') as file:
    for line_index in range(3):
        print(file.readline())

## Reading flat files

Flat files are text files containing records (rows of fields or attributes), i.e. tabular data.
They usually have a header, but it's not mandatory.
The delimiter (character used to separate values) can be a comma (csv), tab or any other character.

In [None]:
titanic_filename = '../data/titanic_sub.csv'

# Peek at the first three lines of the flat file.
with open(titanic_filename, mode='r') as file:
    for line_index in range(3):
        print(file.readline())

### Reading numeric flat files using numpy

This method applies when the whole dataset to be read is numeric

In [None]:
import numpy as np 

# Path to the sample CSV file.
filename='../data/mnist_kaggle_some_rows.csv'

# Load the comma-separated file into a NumPy array.
data = np.loadtxt(filename, delimiter=',')

# Bare last expression: the notebook displays the array.
data

In [None]:
# If we would like to skip the header and read a couple of columns only:
# skiprows=1 skips the first line of the file, and usecols=[0,2] keeps
# only the columns at indices 0 and 2.
data = np.loadtxt(filename, delimiter=',', skiprows=1, usecols=[0,2])

### Importing flat files using pandas

The core of pandas is the DataFrame. A matrix has rows and columns. A dataframe has observations and variables.

In [None]:
import pandas as pd 

filename = '../data/cars.csv'

# Read the CSV into a DataFrame using pandas' default options.
df = pd.read_csv(filename)
# Reference example (not executed) showing other commonly used options:
# df = pd.read_csv(filename, nrows=5, header=None, sep='\t', comment='#', na_values=['Nothing'])

## Importing other file types

### Pickled files
Pickled files are python objects serialized into files.

In [None]:
import pickle

filename = 'filename.pickle'

my_dict = {'a': 53, 'b': 12}

# Serialize the dictionary to disk; pickle data is binary, hence mode 'wb'.
with open(filename, mode='wb') as pickle_out:
    pickle.dump(my_dict, pickle_out, pickle.HIGHEST_PROTOCOL)

# Load it back from the same file. Only unpickle files you trust.
with open(filename, mode='rb') as pickle_in:
    my_dict_restored = pickle.load(pickle_in)

print(my_dict_restored)

### Excel files

In [None]:
import pandas as pd 

# Path to the Excel workbook.
file='../data/battledeath.xlsx'

# Read the workbook with pandas' default options.
data = pd.read_excel(file)

# Show the first rows only, rather than the whole table.
data.head()

## Reading SAS, HDF5, MATLAB and other files

## Querying databases

# Importing data from the web

## Urllib package

Provides an interface for fetching data across the web.

In [None]:
from urllib.request import urlretrieve

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'

# Download the remote CSV and save it locally as 'winequality.csv'.
urlretrieve(url, filename='winequality.csv')

In [None]:
from urllib.request import urlopen, Request

url = 'https://wikipedia.org'

request = Request(url)

# The response object is a context manager: the connection is closed when
# the block exits, even if read() raises.
with urlopen(request) as response:
    html = response.read()

# read() returns bytes; show only the first 1000 of them.
print(html[:1000])

In [None]:
import requests

url = 'https://wikipedia.org'

# Send a GET request and keep the response body as text.
r = requests.get(url)
text = r.text

# Show only the first 1000 characters of the page.
preview = text[:1000]
print(preview)

# Scraping the web

HTML is a mix of structured data (predefined model or organized in some manner) and unstructured data.

BeautifulSoup is a Python package that helps with parsing HTML and extracting structured data from it.


In [None]:
from bs4 import BeautifulSoup

import requests

url = 'https://www.crummy.com/software/BeautifulSoup/bs4/doc/'

r = requests.get(url)
html_doc = r.text

# Name the parser explicitly. Without it, BeautifulSoup picks whichever
# parser happens to be installed and warns about the guess, so the same
# notebook can produce different trees on different machines.
soup = BeautifulSoup(html_doc, 'html.parser')

print(soup.prettify())

In [None]:
# Import packages
import requests
from bs4 import BeautifulSoup

# Specify url: url
url = 'https://www.python.org/~guido/'

# Package the request, send the request and catch the response: r
r = requests.get(url)

# Extract the response as html: html_doc
html_doc = r.text

# Create a BeautifulSoup object from the HTML: soup
# The parser is named explicitly so the result does not depend on which
# parsers are installed (and no "guessed parser" warning is emitted).
soup = BeautifulSoup(html_doc, 'html.parser')

# Prettify the BeautifulSoup object: pretty_soup
pretty_soup = soup.prettify()

# Print the response
print(pretty_soup)

In [None]:
# Import packages
import requests
from bs4 import BeautifulSoup

# Specify url: url
url = 'https://www.python.org/~guido/'

# Package the request, send the request and catch the response: r
r = requests.get(url)

# Extract the response as html: html_doc
html_doc = r.text

# Create a BeautifulSoup object from the HTML: soup
# The parser is named explicitly so the result does not depend on which
# parsers are installed (and no "guessed parser" warning is emitted).
soup = BeautifulSoup(html_doc, 'html.parser')

# Get the title of Guido's webpage: guido_title
guido_title = soup.title

# Print the title of Guido's webpage to the shell
print(guido_title)

# Get Guido's text: guido_text
guido_text = soup.text

# Print Guido's text to the shell
print(guido_text)

In [None]:
# Import packages
import requests
from bs4 import BeautifulSoup

# Specify url
url = 'https://www.python.org/~guido/'

# Package the request, send the request and catch the response: r
r = requests.get(url)

# Extract the response as html: html_doc
html_doc = r.text

# Create a BeautifulSoup object from the HTML: soup
# The parser is named explicitly so the result does not depend on which
# parsers are installed (and no "guessed parser" warning is emitted).
soup = BeautifulSoup(html_doc, 'html.parser')

# Print the title of Guido's webpage
print(soup.title)

# Find all 'a' tags (which define hyperlinks): a_tags
a_tags = soup.find_all('a')

# Print the URLs to the shell; get('href') yields None for anchors
# that have no href attribute.
for link in a_tags:
    print(link.get('href'))

# APIs and JSON

In [16]:
import json

# Read the JSON document from disk and parse it into Python objects.
with open('cosa.json', mode='r') as json_file:
    raw_json = json_file.read()
json_data = json.loads(raw_json)

print(type(json_data))
print(json_data)

<class 'dict'>
{'nombre': 'El nombre de mi json', 'anho': 1988}


In [18]:
# One "key: value" line per top-level entry of the parsed JSON.
for key, item in json_data.items():
    print(key, item, sep=': ')

nombre: El nombre de mi json
anho: 1988


In [27]:
import requests

import subprocess
def get_password_from_keychain(service, account):
    """Look up a generic password with the macOS ``security`` tool.

    Runs ``/usr/bin/security find-generic-password`` for the given service
    and account and returns what the command writes to standard output.

    Args:
        service: Service name, passed to the ``-s`` option.
        account: Account name, passed to the ``-a`` option.

    Returns:
        The command's standard output, decoded and stripped of surrounding
        whitespace. The exit status is not checked, so the result is an
        empty string when the command prints nothing.

    Raises:
        FileNotFoundError: If ``/usr/bin/security`` does not exist.
    """
    # Pass the arguments as a list and do not go through a shell. When the
    # command was built as one string and run with shell=True, a quote inside
    # `service` or `account` could escape the quoting and run arbitrary
    # commands.
    command = [
        '/usr/bin/security', 'find-generic-password',
        '-s', str(service),
        '-a', str(account),
        '-g', '-w',
    ]
    result = subprocess.run(command, capture_output=True)
    password = result.stdout.decode().strip()
    return password
    
# NOTE(review): the apikey value looks like a redacted placeholder — confirm
# how the real key is meant to be supplied.
url = 'https://www.omdbapi.com/?i=tt3896198&apikey=XXXXXX'

# Query the API and decode the JSON body of the response.
r = requests.get(url)
json_data = r.json()

# One "key: value" line per field of the response.
for key, value in json_data.items():
    print(f'{key}:', value)

Title: Guardians of the Galaxy Vol. 2
Year: 2017
Rated: PG-13
Released: 05 May 2017
Runtime: 136 min
Genre: Action, Adventure, Comedy
Director: James Gunn
Writer: James Gunn, Dan Abnett, Andy Lanning
Actors: Chris Pratt, Zoe Saldana, Dave Bautista
Plot: The Guardians struggle to keep together as a team while dealing with their personal family issues, notably Star-Lord's encounter with his father, the ambitious celestial being Ego.
Language: English
Country: United States
Awards: Nominated for 1 Oscar. 15 wins & 60 nominations total
Poster: https://m.media-amazon.com/images/M/MV5BNjM0NTc0NzItM2FlYS00YzEwLWE0YmUtNTA2ZWIzODc2OTgxXkEyXkFqcGdeQXVyNTgwNzIyNzg@._V1_SX300.jpg
Ratings: [{'Source': 'Internet Movie Database', 'Value': '7.6/10'}, {'Source': 'Rotten Tomatoes', 'Value': '85%'}, {'Source': 'Metacritic', 'Value': '67/100'}]
Metascore: 67
imdbRating: 7.6
imdbVotes: 768,933
imdbID: tt3896198
Type: movie
DVD: N/A
BoxOffice: $389,813,101
Production: N/A
Website: N/A
Response: True


''