-
-
Notifications
You must be signed in to change notification settings - Fork 47
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #16 from scalvov/master
Notebook 05 Webscrapping
- Loading branch information
Showing
1 changed file
with
276 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,276 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Ponemos como ruta principal la carpeta padre de \"soluciones\"\n", | ||
"# para que las rutas relativas queden igual que en la carpeta de notebooks\n", | ||
"import paths\n", | ||
"\n", | ||
"# Otras dependencias\n", | ||
"import requests\n", | ||
"from bs4 import BeautifulSoup\n", | ||
"import re\n", | ||
"import pandas as pd" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"#### SOS: ¡me han baneado!\n", | ||
"\n", | ||
"Opción de usar el html offline que hay en `dat` para seguir con el ejercicio. Descomenta el siguiente código y sigue adelante" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Descomenta esto si te han baneado\n", | ||
"# with open('dat/milanuncios.html') as f:\n", | ||
"# soup = BeautifulSoup(f, 'html.parser')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Ejercicio\n", | ||
"\n", | ||
"Crea un dataframe de pandas en el que cada fila sea un anuncio y tenga como columnas información que consideres relevante: precio, kilómetros, año, cilindrada, texto del anuncio, ..." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"headers = {\n", | ||
" 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'\n", | ||
"}\n", | ||
"\n", | ||
"page = requests.get('https://www.milanuncios.com/motos-de-carretera/duke-390.htm', headers=headers)\n", | ||
"soup = BeautifulSoup(page.content, 'html.parser')\n", | ||
"\n", | ||
"columns = ['producto','tipo_anuncio','precio','texto','cc','ano','kms']\n", | ||
"\n", | ||
"df = pd.DataFrame(columns=columns)\n", | ||
"\n", | ||
"for link in soup.find_all('div', class_='aditem ParticularCardTestABClass'):\n", | ||
" producto = link.find('a', class_='aditem-detail-title').get_text()\n", | ||
" tipo_anuncio = link.find('div', class_='x3').get_text()\n", | ||
" if link.find('div', class_='aditem-price') is None:\n", | ||
" precio = 0\n", | ||
" else:\n", | ||
" precio = link.find('div', class_='aditem-price').get_text().replace('€','')\n", | ||
" texto = link.find('div', class_='tx').get_text()\n", | ||
" if link.find('div', class_='cc tag-mobile') is None:\n", | ||
" cc = 0\n", | ||
" else:\n", | ||
" cc = link.find('div', class_='cc tag-mobile').get_text().replace('cc','')\n", | ||
" if link.find('div', class_='ano tag-mobile') is None:\n", | ||
" ano = 0\n", | ||
" else:\n", | ||
" ano = link.find('div', class_='ano tag-mobile').get_text().replace('año','') #¿Alguna solución mejor que un if para esto?\n", | ||
" if link.find('div', class_='kms tag-mobile') is None:\n", | ||
" kms = 0\n", | ||
" else:\n", | ||
" kms = link.find('div', class_='kms tag-mobile').get_text().replace('kms','')\n", | ||
"\n", | ||
" nueva_fila = {'producto':producto,'tipo_anuncio':tipo_anuncio,'precio':precio,'texto':texto,'cc':cc,'ano':ano,'kms':kms}\n", | ||
" df = df.append(nueva_fila, ignore_index=True)\n", | ||
" \n", | ||
"df" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Ejercicio\n", | ||
"\n", | ||
"Modifica el código anterior para que, además de bajarse la página actual, navegue por el resto de páginas e incorpore también esos anuncios a tu dataframe." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"page = requests.get('https://www.milanuncios.com/motos-de-carretera/duke-390.htm', headers=headers)\n", | ||
"soup = BeautifulSoup(page.content, 'html.parser')\n", | ||
"\n", | ||
"columns = ['producto','tipo_anuncio','precio','texto','cc','ano','kms']\n", | ||
"df = pd.DataFrame(columns=columns)\n", | ||
"nueva_pag = 1\n", | ||
"while len(soup.find_all('div', class_='aditem ParticularCardTestABClass')) > 0:\n", | ||
" for link in soup.find_all('div', class_='aditem ParticularCardTestABClass'):\n", | ||
" producto = link.find('a', class_='aditem-detail-title').get_text()\n", | ||
" tipo_anuncio = link.find('div', class_='x3').get_text()\n", | ||
" if link.find('div', class_='aditem-price') is None:\n", | ||
" precio = 0\n", | ||
" else:\n", | ||
" precio = link.find('div', class_='aditem-price').get_text().replace('€','')\n", | ||
" texto = link.find('div', class_='tx').get_text()\n", | ||
" if link.find('div', class_='cc tag-mobile') is None:\n", | ||
" cc = 0\n", | ||
" else:\n", | ||
" cc = link.find('div', class_='cc tag-mobile').get_text().replace('cc','')\n", | ||
" if link.find('div', class_='ano tag-mobile') is None:\n", | ||
" ano = 0\n", | ||
" else:\n", | ||
" ano = link.find('div', class_='ano tag-mobile').get_text().replace('año','') #¿Alguna solución mejor que un if para esto?\n", | ||
" if link.find('div', class_='kms tag-mobile') is None:\n", | ||
" kms = 0\n", | ||
" else:\n", | ||
" kms = link.find('div', class_='kms tag-mobile').get_text().replace('kms','')\n", | ||
"\n", | ||
" nueva_fila = {'producto':producto,'tipo_anuncio':tipo_anuncio,'precio':precio,'texto':texto,'cc':cc,'ano':ano,'kms':kms}\n", | ||
" df = df.append(nueva_fila, ignore_index=True)\n", | ||
"\n", | ||
" nueva_pag += 1\n", | ||
" page = requests.get('https://www.milanuncios.com/motos-de-carretera/duke-390.htm?pagina='+str(nueva_pag), headers=headers)\n", | ||
" soup = BeautifulSoup(page.content, 'html.parser')\n", | ||
"\n", | ||
"df" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Primero, hacemos una petición para descargar la página de interés (que contiene las cotizaciones de las acciones del IBEX 35 en tiempo _casi_ real)." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Pasos previos al ejercicio que se ejecutan durante el notebook de explicación\n", | ||
"base_url = \"https://www.eleconomista.es/indice/IBEX-35\"\n", | ||
"res = requests.get(base_url)\n", | ||
"contenido = res.content\n", | ||
"soup = BeautifulSoup(contenido, \"html.parser\")\n", | ||
"tablas = soup.find_all('table')\n", | ||
"lineas = [x for tabla in tablas for x in tabla.find_all('tr')]\n", | ||
"datos = [[x.text for x in linea.find_all('td')] for linea in lineas]\n", | ||
"datos = [x for x in datos if len(x) > 0]\n", | ||
"datos = pd.DataFrame(datos)\n", | ||
"datos" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"#### Ejercicio\n", | ||
"\n", | ||
"Usa los elementos `th` de la primera fila de las tablas para extraer nombres para las columnas de la tabla. " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"cabecera = [[x.text for x in linea.find_all('th')] for linea in lineas]\n", | ||
"cabecera = [x for x in cabecera if len(x) > 0]\n", | ||
"cabecera = cabecera[0]\n", | ||
"cabecera" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"#### Ejercicio\n", | ||
"\n", | ||
"Elimina las columnas irrelevantes y cambia los nombres de las columnas por otros breves y sin caracteres extraños o que dificulten el posproceso." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"cabecera = [[x.text for x in linea.find_all('th')] for linea in lineas]\n", | ||
"cabecera = [x for x in cabecera if len(x) > 0]\n", | ||
"cabecera = cabecera[0]\n", | ||
"cabecera.pop(2) #Elimino la columna que tiene el nombre en blanco\n", | ||
"\n", | ||
"cabecera = [re.sub('[()/. ]','',x) for x in cabecera]\n", | ||
"cabecera = [re.sub('ó','o',x) for x in cabecera]\n", | ||
"cabecera = [re.sub('%','pct',x) for x in cabecera]\n", | ||
"cabecera = [re.sub('€','Euros',x) for x in cabecera]\n", | ||
"cabecera[8] = 'Fecha'\n", | ||
"\n", | ||
"datos = datos.drop([2], axis = 1) #Elimino una columna en blanco\n", | ||
"\n", | ||
"datos.columns = cabecera\n", | ||
"\n", | ||
"datos" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"#### Ejercicio\n", | ||
"\n", | ||
"Cambia el formato de las columnas adecuadamente: convierte a numéricas, etc., las columnas que lo requieran." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"datos['VolumenEuros'] = datos['VolumenEuros'].replace(regex=r'\\.',value='')\n", | ||
"\n", | ||
"datos = datos.replace(regex=r'%',value='')\n", | ||
"datos = datos.replace(regex=r',',value='.')\n", | ||
"datos[['Precio','Varpct','VarEuros','VolumenEuros','Capitalizacion1','PER','RentDiv']] = datos[['Precio','Varpct','VarEuros','VolumenEuros','Capitalizacion1','PER','RentDiv']].astype(float)\n", | ||
"\n", | ||
"datos['Fecha'] = datos['Fecha'] + '/2020'\n", | ||
"\n", | ||
"datos['Fecha'] = pd.to_datetime(datos['Fecha'], format='%d/%m/%Y')\n", | ||
"\n", | ||
"datos.dtypes" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "libropython", | ||
"language": "python", | ||
"name": "libropython" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.6" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |