# Bajando datos de diferentes fuentes. Ejemplos

## Bajar datos de Bicimad

In [1]:
# Importamos librerias
import requests
import zipfile
import pandas as pd

In [2]:
# Fetch a sample image over HTTP. The timeout keeps the cell from hanging on
# an unresponsive server, and raise_for_status() stops on an HTTP error
# instead of passing an error body on as if it were the image.
url = "https://pbs.twimg.com/profile_images/616689518968762368/rkhjKqNb.jpg"
image = requests.get(url, timeout=30)
image.raise_for_status()

In [3]:
image

<Response [200]>

In [4]:
# Write the body of the downloaded response to disk in binary mode.
image_bytes = image.content
with open("mutenrroy.jpg", mode="wb") as image_file:
    image_file.write(image_bytes)

In [None]:
# Download the BiciMAD usage archive (201810_Usage_Bicimad) from the EMT
# open-data portal.
url = 'https://opendata.emtmadrid.es/getattachment/037ee8a5-b824-43b1-ac7b-76225c783993/201810_Usage_Bicimad.aspx'

# The timeout keeps the cell from hanging on an unresponsive server.
r = requests.get(url, timeout=60)
# Stop on an HTTP error so an error page is never saved as the zip archive.
r.raise_for_status()

In [None]:
# Save the downloaded archive to disk.
filename = 'bicis.zip'
zip_bytes = r.content
with open(filename, mode='wb') as zip_file:
    zip_file.write(zip_bytes)

In [None]:
# Unpack every member of the archive into the current working directory.
filename = 'bicis.zip'
with zipfile.ZipFile(filename, mode='r') as archive:
    archive.extractall(path=".")

In [None]:
# Load the usage file into a DataFrame (presumably the file extracted from
# bicis.zip — confirm the member name). lines=True parses one JSON record per
# line, and nrows=10000 keeps only the first 10,000 records.
datos_bicis = pd.read_json('201810_Usage_Bicimad.json', lines=True, encoding='latin-1', nrows=10000)

In [None]:
# Vemos qué pinta tienen
datos_bicis.head()

In [None]:
datos_bicis['track']

# Acceso a archivos de la página de datos.gob.es
Uno de los mayores catálogos de información pública descargable en España

In [None]:
# Catalog entry (presumably the listing the file below was taken from):
# https://datos.gob.es/es/catalogo/ea0003337-principales-indicadores-de-sostenibilidad1

url = "https://data.renfe.com/dataset/928d911e-00ca-437b-be3f-068fc1f97800/resource/63d285e8-9f52-410b-8412-8a8c26f5ddb3/download/consumo-de-agua-y-materiales.csv"
# Alternative source, kept for reference and not used:
# url = "https://estadisticas.mecd.gob.es/EducaJaxiPx/files/_px/es/csv_c/Universitaria/Financiacion_gasto/2018/ingresos/l0/Pres_Ingr_PrevDRN_CA.px"
# Read the remote CSV directly from the URL, using ';' as the field separator.
datos = pd.read_csv(url, sep = ';')
datos

In [None]:
# Read a local ';'-separated file decoded as latin-1; header=5 takes the
# column names from row index 5 (0-based) and skips the rows above it.
# NOTE(review): the file must already exist in the working directory — no
# cell shown here downloads it.
df = pd.read_csv('Pres_Ingr_PrevDRN_CAsc.csv', sep = ';', header = 5, encoding = 'latin-1')

In [None]:
df

In [None]:
datos.columns

In [None]:
# Read sheet 'P3' of the remote Excel workbook; header=2 takes the column
# names from row index 2 (0-based) of the sheet.
url = 'https://opendata.euskadi.eus/contenidos/estadistica/tablas_essec_2020/es_tblestad/data/2016-2019_Tablas_es-.xlsx'
datos_pvasco = pd.read_excel(url, sheet_name='P3', header=2)
datos_pvasco

In [None]:
# Export sheets P1-P3 of the workbook at `url` to one CSV file each.
lista_hojas = ['P1', 'P2', 'P3']

# Passing the list of sheet names makes pandas fetch and open the workbook
# once and return a {sheet name: DataFrame} dict, instead of downloading the
# whole file again for every sheet.
hojas = pd.read_excel(url, sheet_name=lista_hojas, header=2)

for hoja in lista_hojas:
    datos_pvasco = hojas[hoja]
    datos_pvasco.to_csv(f'Pais_vasco_hoja_{hoja}.csv')

In [None]:
import seaborn as sns
# Bar chart of the 'Aceites/grasas  (kg)' column against 'Año', taken from
# the `datos` DataFrame.
# NOTE(review): the y column name contains two consecutive spaces and must
# match the CSV header exactly — confirm against datos.columns.
sns.barplot(data = datos, x = 'Año', y = 'Aceites/grasas  (kg)')

## Bajando datos de la calidad del aire

También existe un dataset en Kaggle, pero aquí vamos a ver un ejemplo descargando los datos directamente desde su URL.

In [None]:
import pandas as pd
import requests
import io
import csv

url2 = "http://www.mambiente.munimadrid.es/opendata/horario.txt"

# header=None: the first line is read as data and the columns get integer
# labels.
# NOTE(review): this rebinds `datos`, which earlier cells use for a different
# DataFrame — re-running those cells afterwards will see this one instead.
datos=pd.read_csv(url2, header=None)


In [None]:
datos

# OpenStreetMap

In [None]:
# Amenities
# https://wiki.openstreetmap.org/wiki/Key:amenity

import requests
import json
import pandas as pd
# Ask the Overpass API for every node tagged amenity=cafe inside the bounding
# box given in the query (Overpass QL order: south, west, north, east).
overpass_url = "http://overpass-api.de/api/interpreter"

overpass_query = """
[out:json];
node["amenity"="cafe"]
  (40.420801, -3.694702,40.423754, -3.688167); 
out;
"""
# The timeout keeps the cell from hanging if the public server is busy.
response = requests.get(overpass_url,
                        params={'data': overpass_query},
                        timeout=60)
print(response)
# Fail here with the HTTP status instead of with a JSON decoding error on the
# next line when the server answers with a non-JSON error body.
response.raise_for_status()
data = response.json()

In [None]:
data

In [None]:
data.keys()

In [None]:
data.get('osm3s')

In [None]:
data['osm3s']

In [None]:
data['osm3s']['copyright']

In [None]:
data['elements'][0].get('tags').get('addr:street', 34)

In [None]:
for element in data['elements']:
    print(element)

In [None]:
def json_to_dataframe(data_entrada):
    """Convert an Overpass-style JSON response into a DataFrame of places.

    Args:
        data_entrada: Mapping with an 'elements' list. Each element must
            provide 'lat' and 'lon'; its optional 'tags' mapping may provide
            'name', 'addr:street' and 'addr:housenumber'.

    Returns:
        A DataFrame with the columns 'lat', 'lon', 'name' and 'address', one
        row per element. A missing name is stored as a null value; a missing
        street or house number falls back to 'Calle x' / 999, so an element
        without address tags gets the address 'Calle x 999'.

    Raises:
        KeyError: If 'elements' is missing or an element lacks 'lat'/'lon'.
    """
    columns = ['lat', 'lon', 'name', 'address']
    rows = []

    for element in data_entrada['elements']:
        # Not every element carries tags; treat a missing mapping as empty so
        # the defaults below apply instead of raising KeyError.
        tags = element.get('tags', {})
        street = tags.get('addr:street', 'Calle x')
        number = tags.get('addr:housenumber', 999)
        rows.append({
            'lat': element['lat'],
            'lon': element['lon'],
            'name': tags.get('name'),
            'address': f'{street} {number}',
        })

    # Passing the column list keeps the four columns even with no rows.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Combine several Overpass responses into a single DataFrame.
# NOTE(review): data1..data4 are not defined in any cell shown here; they
# must hold parsed Overpass responses before this cell runs.
lista_responses = [data1, data2, data3, data4]

# DataFrame has no .concat method and df_results was never initialised, so
# the frames are built first and stacked with pd.concat in one step. The
# comprehension also avoids rebinding the global `data` used by other cells.
df_results = pd.concat(
    [json_to_dataframe(respuesta) for respuesta in lista_responses],
    ignore_index=True,  # continuous 0..n-1 index instead of repeated ones
)

In [None]:
places

## Web scraping de IMDB

In [None]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

# Download IMDB's Top 250 chart and build one record per movie.
# NOTE(review): the CSS selectors below assume the chart is rendered as a
# table with td.titleColumn / td.posterColumn / td.ratingColumn cells —
# confirm against the live page.
url = 'http://www.imdb.com/chart/top'
response = requests.get(url, timeout=30)
# Surface an HTTP error here instead of parsing an error page into empty lists.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

movies = soup.select('td.titleColumn')
title_anchors = soup.select('td.titleColumn a')
links = [a.attrs.get('href') for a in title_anchors]
crew = [a.attrs.get('title') for a in title_anchors]
ratings = [b.attrs.get('data-value') for b in soup.select('td.posterColumn span[name=ir]')]
votes = [b.attrs.get('data-value') for b in soup.select('td.ratingColumn strong')]

# Once whitespace is collapsed, a title cell is expected to read
# "<place>. <title> (<year>)". One anchored pattern keeps periods that belong
# to the title and reads the place as displayed; slicing by the width of the
# 0-based loop index mis-cut the rows whose rank has one more digit than the
# index (ranks 10 and 100).
movie_pattern = re.compile(r'^(\d+)\.\s*(.*\S)\s*\(([^()]*)\)$')

imdb = []

# Store each movie as a dictionary, then collect the dictionaries in `imdb`.
# Indexing the parallel lists (rather than zipping them) keeps a length
# mismatch between selectors loud: it raises IndexError.
for index, cell in enumerate(movies):
    movie_string = ' '.join(cell.get_text().split())
    parsed = movie_pattern.match(movie_string)
    if parsed is None:
        raise ValueError(f'Unexpected title cell format: {movie_string!r}')
    place, movie_title, year = parsed.groups()
    imdb.append({"movie_title": movie_title,
                 "year": year,
                 "place": place,
                 "star_cast": crew[index],
                 "rating": ratings[index],
                 "vote": votes[index],
                 "link": links[index]})
df = pd.DataFrame(imdb)
df


#for item in imdb:
#    print(item['place'], '-', item['movie_title'], '('+item['year']+') -', 'Starring:', item['star_cast'])

## Accediendo a datos de Twitter
Para esta demo utilizaremos el paquete [tweepy](https://www.tweepy.org/) para atacar datos de Twitter desde Python.

Necesitamos registrarnos en la [web de desarrolladores](https://developer.twitter.com/en/apply-for-access) para obtener unas credenciales.

In [None]:
import tweepy  
import time
import csv

import json

# Load the API keys from a local JSON file so they are not hard-coded here.
with open('./credentials.json') as f:
    credentials = json.load(f)

# Credentials issued by the Twitter developer site.
access_token = credentials['access_token']
access_token_secret = credentials['access_token_secret']
consumer_key = credentials['consumer_key']
consumer_secret = credentials['consumer_secret']

# Authenticate against the API.
try:
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True)
    # Building the handler and the client only stores the keys; this call is
    # the one that asks Twitter to validate them. Depending on the tweepy
    # version it raises or returns False for rejected credentials, so both
    # outcomes are treated as a failure.
    if not api.verify_credentials():
        raise ValueError("credentials were rejected")
    print("Authentication OK")
except Exception as error:
    # Exception instead of a bare except, so interrupting the cell still
    # works; the cause is printed so it is not lost.
    print("Error during authentication")
    print(error)

In [None]:
# Append up to 50 Spanish-language #COVID19 tweets to a CSV file, one row per
# tweet (creation time, text).
# The with-block closes the file even if the search fails midway. newline=''
# lets the csv module control line endings, and the explicit utf-8 encoding
# makes the output independent of the platform default.
with open('tweet_covid19.csv', 'a', newline='', encoding='utf-8') as csvFile:
    csvWriter = csv.writer(csvFile)

    for tweet in tweepy.Cursor(api.search, q="#COVID19", count=50,
                               lang="es",
                               since="2020-09-05").items(50):

        print(tweet.created_at, tweet.text)
        # Write the text itself: encoding it to bytes first would store the
        # b'...' representation in the file instead of the tweet.
        csvWriter.writerow([tweet.created_at, tweet.text])

## Datos del espacio

In [None]:
'''
https://thespacedevs.com/llapi
https://ll.thespacedevs.com/2.2.0/swagger
'''
# Base address of the API and the resource to request.
space_url = "http://ll.thespacedevs.com/2.2.0"
path = "/astronaut"

# Join both parts into the full URL and issue the GET request.
url_total = f"{space_url}{path}"
response = requests.get(url_total)

In [None]:
print(response.status_code)
type(response.content)
response.json()

In [None]:
# Same resource, narrowed with a query-string filter on nationality.
query = "/?nationality=Cuban"
url_total = f"{space_url}{path}{query}"
response = requests.get(url_total)

In [None]:
url_total

In [None]:
response.json()