## MONGODB

In [27]:
#https://realpython.com/introduction-to-mongodb-and-python/
#OPCION 1 PYMONGO
#conda install pymongo
import pymongo
from pymongo import MongoClient
#por default
client = MongoClient()
#client = MongoClient('localhost', 27017)

In [6]:
#crear base de datos
db = client.pymongo_test

In [7]:
#crear colección posts e insertar documentos
posts = db.posts
post_data = {
    'title': 'Python and MongoDB',
    'content': 'PyMongo is fun, you guys',
    'author': 'Scott'
}
result = posts.insert_one(post_data)
print('One post: {0}'.format(result.inserted_id))

One post: 5e221fc34b0d724d6cb07557


In [9]:
#inserción multiple
post_2 = {
    'title': 'Virtual Environments',
    'content': 'Use virtual environments, you guys',
    'author': 'Scott'
}
post_3 = {
    'title': 'Learning Python',
    'content': 'Learn Python, it is easy',
    'author': 'Bill'
}
new_result = posts.insert_many([post_2, post_3])
print('Multiple posts: {0}'.format(new_result.inserted_ids))

Multiple posts: [ObjectId('5e2220494b0d724d6cb07558'), ObjectId('5e2220494b0d724d6cb07559')]


In [10]:
#recuperar documentos
bills_post = posts.find_one({'author': 'Bill'})
print(bills_post)

{'title': 'Learning Python', '_id': ObjectId('5e2220494b0d724d6cb07559'), 'content': 'Learn Python, it is easy', 'author': 'Bill'}


In [21]:
#recuperación multiple
scott_posts = posts.find({'author': 'Scott'})
print(scott_posts)

<pymongo.cursor.Cursor object at 0x0000028FB474E908>


In [22]:
#iterar por el cursor
for res in scott_posts:
    print(res)

{'title': 'Python and MongoDB', '_id': ObjectId('5e221fc34b0d724d6cb07557'), 'content': 'PyMongo is fun, you guys', 'author': 'Scott'}
{'title': 'Virtual Environments', '_id': ObjectId('5e2220494b0d724d6cb07558'), 'content': 'Use virtual environments, you guys', 'author': 'Scott'}


In [26]:
#y como manejar expresiones como LIKE
myquery = { "title": { "$regex": "^L" } }
mydoc = posts.find(myquery)
for x in mydoc:
    print(x)

{'title': 'Learning Python', '_id': ObjectId('5e2220494b0d724d6cb07559'), 'content': 'Learn Python, it is easy', 'author': 'Bill'}


In [28]:
#OPCION 2 MONGOENGINE
#conda install mongoengine
from mongoengine import *
connect('mongoengine_test', host='localhost', port=27017)

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, read_preference=Primary())

In [29]:
#definir el documento
import datetime
class Post(Document):
    title = StringField(required=True, max_length=200)
    content = StringField(required=True)
    author = StringField(required=True, max_length=50)
    published = DateTimeField(default=datetime.datetime.now)

In [30]:
#guardar el documento
post_1 = Post(
    title='Sample Post',
    content='Some engaging content',
    author='Scott'
)
post_1.save()       # Insert
print(post_1.title)
post_1.title = 'A Better Post Title'
post_1.save()       # Update
print(post_1.title)

Sample Post
A Better Post Title


In [32]:
#ver resultados
print(post_1.id, post_1.title, post_1.content, post_1.author)

5e2226684b0d724d6cb0755c A Better Post Title Some engaging content Scott


In [31]:
#se respeta el esquema declarado - title required
post_2 = Post(content='Content goes here', author='Michael')
post_2.save()

ValidationError: ValidationError (Post:None) (Field is required: ['title'])

In [None]:
#PRACTICA query JSON en MONGODB
#importar el archivos restaurants.json
#mongoimport --db mongolab --collection restaurants --file C:\data\restaurants.json

In [33]:
from pymongo import MongoClient
client = MongoClient()
#acceso a base de datos
db = client.mongolab
#acceso a coleccion
rest = db.restaurants

In [40]:
#Q1 ver todos los documentos
res = rest.find()
for r in res:
    print(res)

<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54

<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54

<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54

<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54FE198>
<pymongo.cursor.Cursor object at 0x0000028FB54

In [50]:
#Q2 contar los documentos
res2 = rest.find().count()
print(res2)
#opcion con bucle?
resl = rest.find()
count = 0
for c in resl:
    count += 1
print(count)

3772
3772


  


In [52]:
#Q3 mostrar solo algunos campos
res3 = rest.find({},{"restaurant_id" : 1,"name":1,"borough":1,"cuisine" :1});
for q3 in res3:
    print(q3)

{'restaurant_id': '30112340', 'borough': 'Brooklyn', '_id': ObjectId('5e222d40521e03fa2c3ef887'), 'cuisine': 'Hamburgers', 'name': "Wendy'S"}
{'restaurant_id': '30075445', 'borough': 'Bronx', '_id': ObjectId('5e222d40521e03fa2c3ef888'), 'cuisine': 'Bakery', 'name': 'Morris Park Bake Shop'}
{'restaurant_id': '40356018', 'borough': 'Brooklyn', '_id': ObjectId('5e222d40521e03fa2c3ef889'), 'cuisine': 'American ', 'name': 'Riviera Caterer'}
{'restaurant_id': '30191841', 'borough': 'Manhattan', '_id': ObjectId('5e222d40521e03fa2c3ef88a'), 'cuisine': 'Irish', 'name': 'Dj Reynolds Pub And Restaurant'}
{'restaurant_id': '40356068', 'borough': 'Queens', '_id': ObjectId('5e222d40521e03fa2c3ef88b'), 'cuisine': 'Jewish/Kosher', 'name': 'Tov Kosher Kitchen'}
{'restaurant_id': '40356442', 'borough': 'Staten Island', '_id': ObjectId('5e222d40521e03fa2c3ef88c'), 'cuisine': 'Jewish/Kosher', 'name': 'Kosher Island'}
{'restaurant_id': '40356151', 'borough': 'Queens', '_id': ObjectId('5e222d40521e03fa2c3ef

{'restaurant_id': '40526489', 'borough': 'Manhattan', '_id': ObjectId('5e222d41521e03fa2c3efeb3'), 'cuisine': 'American ', 'name': "Deb'S"}
{'restaurant_id': '40526778', 'borough': 'Manhattan', '_id': ObjectId('5e222d41521e03fa2c3efeb4'), 'cuisine': 'Pizza', 'name': "Arturo'S Pizza"}
{'restaurant_id': '40527369', 'borough': 'Manhattan', '_id': ObjectId('5e222d41521e03fa2c3efeb5'), 'cuisine': 'Indian', 'name': 'Diwan-E-Khaas'}
{'restaurant_id': '40527398', 'borough': 'Manhattan', '_id': ObjectId('5e222d41521e03fa2c3efeb6'), 'cuisine': 'Pizza/Italian', 'name': 'Tribeca Pizzeria'}
{'restaurant_id': '40527001', 'borough': 'Manhattan', '_id': ObjectId('5e222d41521e03fa2c3efeb7'), 'cuisine': 'Hamburgers', 'name': 'Island Burgers And Shakes'}
{'restaurant_id': '40527514', 'borough': 'Brooklyn', '_id': ObjectId('5e222d41521e03fa2c3efeb8'), 'cuisine': 'American ', 'name': 'Pit Stop Bar'}
{'restaurant_id': '40528073', 'borough': 'Staten Island', '_id': ObjectId('5e222d41521e03fa2c3efeb9'), 'cuis

{'restaurant_id': '40727609', 'borough': 'Manhattan', '_id': ObjectId('5e222d41521e03fa2c3f0379'), 'cuisine': 'Irish', 'name': 'The Full Shilling'}
{'restaurant_id': '40727750', 'borough': 'Manhattan', '_id': ObjectId('5e222d41521e03fa2c3f037a'), 'cuisine': 'Indian', 'name': 'Utsav Festive India Restaurant'}
{'restaurant_id': '40727820', 'borough': 'Manhattan', '_id': ObjectId('5e222d41521e03fa2c3f037b'), 'cuisine': 'American ', 'name': 'Housing Works Food'}
{'restaurant_id': '40728151', 'borough': 'Manhattan', '_id': ObjectId('5e222d41521e03fa2c3f037c'), 'cuisine': 'Italian', 'name': 'Roc Restaurant'}
{'restaurant_id': '40726984', 'borough': 'Queens', '_id': ObjectId('5e222d41521e03fa2c3f037d'), 'cuisine': 'Greek', 'name': 'Plaza Lounge'}
{'restaurant_id': '40728353', 'borough': 'Staten Island', '_id': ObjectId('5e222d41521e03fa2c3f037e'), 'cuisine': 'American ', 'name': 'Unique Lounge'}
{'restaurant_id': '40728711', 'borough': 'Queens', '_id': ObjectId('5e222d41521e03fa2c3f037f'), 'c

In [55]:
#Q4 exluir ID
res4 = rest.find({},{"restaurant_id" : 1,"name":1,"borough":1,"cuisine" :1, "_id":0});
for q4 in res4:
    print(q4)

{'restaurant_id': '30112340', 'borough': 'Brooklyn', 'cuisine': 'Hamburgers', 'name': "Wendy'S"}
{'restaurant_id': '30075445', 'borough': 'Bronx', 'cuisine': 'Bakery', 'name': 'Morris Park Bake Shop'}
{'restaurant_id': '40356018', 'borough': 'Brooklyn', 'cuisine': 'American ', 'name': 'Riviera Caterer'}
{'restaurant_id': '30191841', 'borough': 'Manhattan', 'cuisine': 'Irish', 'name': 'Dj Reynolds Pub And Restaurant'}
{'restaurant_id': '40356068', 'borough': 'Queens', 'cuisine': 'Jewish/Kosher', 'name': 'Tov Kosher Kitchen'}
{'restaurant_id': '40356442', 'borough': 'Staten Island', 'cuisine': 'Jewish/Kosher', 'name': 'Kosher Island'}
{'restaurant_id': '40356151', 'borough': 'Queens', 'cuisine': 'American ', 'name': 'Brunos On The Boulevard'}
{'restaurant_id': '40356731', 'borough': 'Brooklyn', 'cuisine': 'Ice Cream, Gelato, Yogurt, Ices', 'name': 'Taste The Tropics Ice Cream'}
{'restaurant_id': '40356649', 'borough': 'Brooklyn', 'cuisine': 'American ', 'name': 'Regina Caterers'}
{'resta

{'restaurant_id': '40513414', 'borough': 'Manhattan', 'cuisine': 'Bottled beverages, including water, sodas, juices, etc.', 'name': "Paddy Maguire'S Ale House"}
{'restaurant_id': '40513609', 'borough': 'Manhattan', 'cuisine': 'American ', 'name': 'Skylight Diner'}
{'restaurant_id': '40513633', 'borough': 'Manhattan', 'cuisine': 'Italian', 'name': 'Arte Cafe'}
{'restaurant_id': '40513416', 'borough': 'Staten Island', 'cuisine': 'Hamburgers', 'name': 'Burger King'}
{'restaurant_id': '40513662', 'borough': 'Brooklyn', 'cuisine': 'Mexican', 'name': 'Piaxtla Es Mexico Deli'}
{'restaurant_id': '40513885', 'borough': 'Brooklyn', 'cuisine': 'American ', 'name': 'Anopoli Family Restaurant'}
{'restaurant_id': '40513798', 'borough': 'Manhattan', 'cuisine': 'American ', 'name': 'Soho Grand Hotel'}
{'restaurant_id': '40513757', 'borough': 'Manhattan', 'cuisine': 'Indian', 'name': 'Vatan'}
{'restaurant_id': '40514147', 'borough': 'Manhattan', 'cuisine': 'American ', 'name': 'Creative Edge Caterers'}

{'restaurant_id': '40674598', 'borough': 'Manhattan', 'cuisine': 'American ', 'name': 'Cafe Express'}
{'restaurant_id': '40676348', 'borough': 'Manhattan', 'cuisine': 'Café/Coffee/Tea', 'name': 'Starbucks Coffee'}
{'restaurant_id': '40676502', 'borough': 'Brooklyn', 'cuisine': 'American ', 'name': 'Burger King'}
{'restaurant_id': '40676550', 'borough': 'Manhattan', 'cuisine': 'Irish', 'name': 'The Black Sheep'}
{'restaurant_id': '40676801', 'borough': 'Manhattan', 'cuisine': 'Mexican', 'name': 'Iguana Restaurant'}
{'restaurant_id': '40676541', 'borough': 'Manhattan', 'cuisine': 'Italian', 'name': 'Teodora'}
{'restaurant_id': '40676884', 'borough': 'Brooklyn', 'cuisine': 'American ', 'name': 'New Dyker Restaurant'}
{'restaurant_id': '40677036', 'borough': 'Manhattan', 'cuisine': 'Italian', 'name': 'Le Zie Trattoria'}
{'restaurant_id': '40677193', 'borough': 'Queens', 'cuisine': 'Pizza', 'name': 'Big Jons Pizza & Restaurant'}
{'restaurant_id': '40677255', 'borough': 'Manhattan', 'cuisine

{'restaurant_id': '40861140', 'borough': 'Manhattan', 'cuisine': 'American ', 'name': 'The Black Duck Restaurant'}
{'restaurant_id': '40861103', 'borough': 'Queens', 'cuisine': 'Pizza', 'name': "Mario'S Pizza"}
{'restaurant_id': '40861244', 'borough': 'Brooklyn', 'cuisine': 'Italian', 'name': 'Bella Luna'}
{'restaurant_id': '40861201', 'borough': 'Manhattan', 'cuisine': 'Vegetarian', 'name': 'Red Bamboo'}
{'restaurant_id': '40861252', 'borough': 'Manhattan', 'cuisine': 'American ', 'name': 'Sweet & Vicious'}
{'restaurant_id': '40859224', 'borough': 'Manhattan', 'cuisine': 'American ', 'name': 'Mad River Bar & Grille'}
{'restaurant_id': '40861669', 'borough': 'Brooklyn', 'cuisine': 'Chinese', 'name': 'New Victory Restaurant'}
{'restaurant_id': '40861694', 'borough': 'Brooklyn', 'cuisine': 'American ', 'name': 'Spiros Restautrant'}
{'restaurant_id': '40861868', 'borough': 'Brooklyn', 'cuisine': 'Egyptian', 'name': 'Mr Falafael Restaurant'}
{'restaurant_id': '40862111', 'borough': 'Manhat

In [73]:
#Q5 todos los restaurantes en BRONX
res5 = rest.find({'borough':"Bronx"})
for q5 in res5:
    print(q5)

{'restaurant_id': '30075445', 'cuisine': 'Bakery', 'grades': [{'date': datetime.datetime(2014, 3, 3, 0, 0), 'grade': 'A', 'score': 2}, {'date': datetime.datetime(2013, 9, 11, 0, 0), 'grade': 'A', 'score': 6}, {'date': datetime.datetime(2013, 1, 24, 0, 0), 'grade': 'A', 'score': 10}, {'date': datetime.datetime(2011, 11, 23, 0, 0), 'grade': 'A', 'score': 9}, {'date': datetime.datetime(2011, 3, 10, 0, 0), 'grade': 'B', 'score': 14}], 'address': {'zipcode': '10462', 'building': '1007', 'coord': [-73.856077, 40.848447], 'street': 'Morris Park Ave'}, '_id': ObjectId('5e222d40521e03fa2c3ef888'), 'borough': 'Bronx', 'name': 'Morris Park Bake Shop'}
{'restaurant_id': '40357217', 'cuisine': 'American ', 'grades': [{'date': datetime.datetime(2014, 5, 28, 0, 0), 'grade': 'A', 'score': 11}, {'date': datetime.datetime(2013, 6, 19, 0, 0), 'grade': 'A', 'score': 4}, {'date': datetime.datetime(2012, 6, 15, 0, 0), 'grade': 'A', 'score': 3}], 'address': {'zipcode': '10460', 'building': '2300', 'coord': [

In [70]:
#Q6 limitar resultados anteriores a 3
res6 = rest.find({'borough':"Bronx"}).limit(3)
for q6 in res6:
    print(q6)

{'restaurant_id': '30075445', 'cuisine': 'Bakery', 'grades': [{'date': datetime.datetime(2014, 3, 3, 0, 0), 'grade': 'A', 'score': 2}, {'date': datetime.datetime(2013, 9, 11, 0, 0), 'grade': 'A', 'score': 6}, {'date': datetime.datetime(2013, 1, 24, 0, 0), 'grade': 'A', 'score': 10}, {'date': datetime.datetime(2011, 11, 23, 0, 0), 'grade': 'A', 'score': 9}, {'date': datetime.datetime(2011, 3, 10, 0, 0), 'grade': 'B', 'score': 14}], 'address': {'zipcode': '10462', 'building': '1007', 'coord': [-73.856077, 40.848447], 'street': 'Morris Park Ave'}, '_id': ObjectId('5e222d40521e03fa2c3ef888'), 'borough': 'Bronx', 'name': 'Morris Park Bake Shop'}
{'restaurant_id': '40357217', 'cuisine': 'American ', 'grades': [{'date': datetime.datetime(2014, 5, 28, 0, 0), 'grade': 'A', 'score': 11}, {'date': datetime.datetime(2013, 6, 19, 0, 0), 'grade': 'A', 'score': 4}, {'date': datetime.datetime(2012, 6, 15, 0, 0), 'grade': 'A', 'score': 3}], 'address': {'zipcode': '10460', 'building': '2300', 'coord': [

In [76]:
#Q7 restaurants con score mayor a 90
res7 = rest.find({'grades':{'$elemMatch':{"score":{'$gt': 90}}}})
for q7 in res7:
    print(q7)

{'restaurant_id': '40372466', 'cuisine': 'American ', 'grades': [{'date': datetime.datetime(2014, 8, 22, 0, 0), 'grade': 'A', 'score': 11}, {'date': datetime.datetime(2014, 3, 28, 0, 0), 'grade': 'C', 'score': 131}, {'date': datetime.datetime(2013, 9, 25, 0, 0), 'grade': 'A', 'score': 11}, {'date': datetime.datetime(2013, 4, 8, 0, 0), 'grade': 'B', 'score': 25}, {'date': datetime.datetime(2012, 10, 15, 0, 0), 'grade': 'A', 'score': 11}, {'date': datetime.datetime(2011, 10, 19, 0, 0), 'grade': 'A', 'score': 13}], 'address': {'zipcode': '10019', 'building': '65', 'coord': [-73.9782725, 40.7624022], 'street': 'West   54 Street'}, '_id': ObjectId('5e222d40521e03fa2c3ef9e3'), 'borough': 'Manhattan', 'name': "Murals On 54/Randolphs'S"}
{'restaurant_id': '40381295', 'cuisine': 'Indian', 'grades': [{'date': datetime.datetime(2014, 9, 15, 0, 0), 'grade': 'A', 'score': 5}, {'date': datetime.datetime(2014, 1, 14, 0, 0), 'grade': 'A', 'score': 8}, {'date': datetime.datetime(2013, 5, 30, 0, 0), 'gr

In [79]:
#PRACTICA Solo aquellos restaurants de cocina italiana - usar regex
#SOLUCION
regex = "Italian"
res8 = rest.find({'cuisine': {"$regex": regex}})
for q8 in res8:
    print(q8)

{'restaurant_id': '40364305', 'cuisine': 'Italian', 'grades': [{'date': datetime.datetime(2014, 2, 25, 0, 0), 'grade': 'A', 'score': 12}, {'date': datetime.datetime(2013, 6, 27, 0, 0), 'grade': 'A', 'score': 7}, {'date': datetime.datetime(2012, 12, 3, 0, 0), 'grade': 'A', 'score': 10}, {'date': datetime.datetime(2011, 11, 9, 0, 0), 'grade': 'A', 'score': 12}], 'address': {'zipcode': '11209', 'building': '10004', 'coord': [-74.03400479999999, 40.6127077], 'street': '4 Avenue'}, '_id': ObjectId('5e222d40521e03fa2c3ef8bf'), 'borough': 'Brooklyn', 'name': 'Philadelhia Grille Express'}
{'restaurant_id': '40364373', 'cuisine': 'Italian', 'grades': [{'date': datetime.datetime(2014, 9, 16, 0, 0), 'grade': 'A', 'score': 13}, {'date': datetime.datetime(2014, 2, 24, 0, 0), 'grade': 'A', 'score': 10}, {'date': datetime.datetime(2013, 5, 3, 0, 0), 'grade': 'A', 'score': 10}, {'date': datetime.datetime(2012, 8, 20, 0, 0), 'grade': 'A', 'score': 7}, {'date': datetime.datetime(2012, 2, 13, 0, 0), 'gra

## SCRAPING

In [3]:
#INTRO APIs
#https://medium.com/quick-code/absolute-beginners-guide-to-slaying-apis-using-python-7b380dc82236
import requests
request = requests.get('http://api.open-notify.org')
print(request.status_code)

200


In [5]:
#si no existe
request2 = requests.get('http://api.open-notify.org/fake-endpoint')
print(request2.status_code)

404


In [9]:
people = requests.get('http://api.open-notify.org/astros.json')
print(people.text)
print(type(people.text))

{"people": [{"name": "Christina Koch", "craft": "ISS"}, {"name": "Alexander Skvortsov", "craft": "ISS"}, {"name": "Luca Parmitano", "craft": "ISS"}, {"name": "Andrew Morgan", "craft": "ISS"}, {"name": "Oleg Skripochka", "craft": "ISS"}, {"name": "Jessica Meir", "craft": "ISS"}], "number": 6, "message": "success"}
<class 'str'>


In [10]:
people_json  = people.json()
print(people_json)
print(type(people_json))

{'message': 'success', 'people': [{'craft': 'ISS', 'name': 'Christina Koch'}, {'craft': 'ISS', 'name': 'Alexander Skvortsov'}, {'craft': 'ISS', 'name': 'Luca Parmitano'}, {'craft': 'ISS', 'name': 'Andrew Morgan'}, {'craft': 'ISS', 'name': 'Oleg Skripochka'}, {'craft': 'ISS', 'name': 'Jessica Meir'}], 'number': 6}
<class 'dict'>


In [11]:
#nro de personas en el espacio
print("Number of people in space:",people_json['number'])
#imprimir los nombres
for p in people_json['people']:
    print(p['name'])

Number of people in space: 6
Christina Koch
Alexander Skvortsov
Luca Parmitano
Andrew Morgan
Oleg Skripochka
Jessica Meir


In [None]:
#Beautiful Soup CASO 1
#https://realpython.com/beautiful-soup-web-scraper-python/

In [24]:
#caso monster
#https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia
import requests

URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)
print(page.text[:250])

<!DOCTYPE html>
<html xmlns="https://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
        <script>
        //Collect all document ready function or handler and put it in queue where it will be executed once the Jquery is loaded
       


In [28]:
#install beautifulsoup
#conda install beautifulsoup4
import requests
from bs4 import BeautifulSoup

URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

In [29]:
#acceder a los elementos de HTML
results = soup.find(id='ResultsContainer')
print(results.prettify())

<div class="mux-custom-scroll" data-extend="left" data-mux="customScroll" data-target="html" id="ResultsContainer">
 <div class="scrollable" id="ResultsScrollable">
  <script type="application/ld+json">
   {"@context":"https://schema.org","@type":"ItemList","mainEntityOfPage":{
            "@type":"CollectionPage","@id":"https://www.monster.com/jobs/search/?q=Software-Developer&amp;where=Australia"
            }
            ,"itemListElement":[

                 {"@type":"ListItem","position":1,"url":"https://job-openings.monster.com/innovation-center-software-engineer-perth-wa-us-cisco/f805926f-ec30-4b5b-a4b0-e391598c008c"}
                    ,
                 {"@type":"ListItem","position":2,"url":""}
                    ,
                 {"@type":"ListItem","position":3,"url":"https://job-openings.monster.com/sql-bi-ssrs-ssis-developer-for-blackboard-nyc-new-york-wa-us-lancesoft-inc/56d17f16-f07d-4271-abda-b80155837c80"}
                    ,
                 {"@type":

In [31]:
#mas precision
job_elems = results.find_all('section', class_='card-content')
for job_elem in job_elems:
    print(job_elem, end='\n'*2)

<section class="card-content" data-jobid="f805926f-ec30-4b5b-a4b0-e391598c008c" onclick="MKImpressionTrackingMouseDownHijack(this, event)">
<div class="flex-row">
<div class="mux-company-logo thumbnail"></div>
<div class="summary">
<header class="card-header">
<h2 class="title"><a data-bypass="true" data-m_impr_a_placement_id="JSR2CW" data-m_impr_j_cid="11" data-m_impr_j_coc="xwashnlxx" data-m_impr_j_jawsid="411264851" data-m_impr_j_jobid="214050534" data-m_impr_j_jpm="2" data-m_impr_j_jpt="3" data-m_impr_j_lat="0" data-m_impr_j_lid="532" data-m_impr_j_long="0" data-m_impr_j_occid="11892" data-m_impr_j_p="1" data-m_impr_j_postingid="f805926f-ec30-4b5b-a4b0-e391598c008c" data-m_impr_j_pvc="monster" data-m_impr_s_t="t" data-m_impr_uuid="39ab9f65-b9b5-41df-a0a4-b31c9073485f" href="https://job-openings.monster.com/innovation-center-software-engineer-perth-wa-us-cisco/f805926f-ec30-4b5b-a4b0-e391598c008c" onclick="clickJobTitle('plid=532&amp;pcid=11&amp;poccid=11892','Software Developer',''

In [32]:
for job_elem in job_elems:
    # Each job_elem is a new BeautifulSoup object.
    # You can use the same methods on it as you did before.
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    print(title_elem)
    print(company_elem)
    print(location_elem)
    print('------------')

<h2 class="title"><a data-bypass="true" data-m_impr_a_placement_id="JSR2CW" data-m_impr_j_cid="11" data-m_impr_j_coc="xwashnlxx" data-m_impr_j_jawsid="411264851" data-m_impr_j_jobid="214050534" data-m_impr_j_jpm="2" data-m_impr_j_jpt="3" data-m_impr_j_lat="0" data-m_impr_j_lid="532" data-m_impr_j_long="0" data-m_impr_j_occid="11892" data-m_impr_j_p="1" data-m_impr_j_postingid="f805926f-ec30-4b5b-a4b0-e391598c008c" data-m_impr_j_pvc="monster" data-m_impr_s_t="t" data-m_impr_uuid="39ab9f65-b9b5-41df-a0a4-b31c9073485f" href="https://job-openings.monster.com/innovation-center-software-engineer-perth-wa-us-cisco/f805926f-ec30-4b5b-a4b0-e391598c008c" onclick="clickJobTitle('plid=532&amp;pcid=11&amp;poccid=11892','Software Developer',''); clickJobTitleSiteCat('{&quot;events.event48&quot;:&quot;true&quot;,&quot;eVar25&quot;:&quot;Innovation Center-Software Engineer&quot;,&quot;eVar66&quot;:&quot;Monster&quot;,&quot;eVar67&quot;:&quot;JSR2CW&quot;,&quot;eVar26&quot;:&quot;xwashnlxx_Cisco&quot;,

In [36]:
#solo extraer texto
for job_elem in job_elems:
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    if title_elem:
        print(title_elem.text)
    if company_elem:
        print(company_elem.text)
    if location_elem:
        print(location_elem.text)
    print('-------------------')

Innovation Center-Software Engineer


Cisco





Perth, WA


-------------------
-------------------
SQL BI (SSRS, SSIS) developer for Blackboard - NYC


LanceSoft Inc





New york, WA


-------------------
Software Engineer - Imagery Warehouse Services - Perth, Western Australia


EagleView Technologies





Perth, WA


-------------------
Python Developer


LanceSoft Inc





Woodlands, WA


-------------------
Software Development Manager - Sydney, New South Wales


MRI Software





Sydney, NSW


-------------------
Senior Sales Engineer


Zuora





Sydney Australia, NSW


-------------------
-------------------
Runner


Vacasa





The Blue Mountains, ON


-------------------
Senior Sales Engineer


Zuora





Melbourne, VIC


-------------------
Senior Practice Manager - IES (WA)


Blue Ocean Ventures





New York, WA


-------------------
Systems Administrator - Sydney, New South Wales


MRI Software





Sydney, NSW


-------------------
Softwar

In [37]:
#mejorar la presentacion
for job_elem in job_elems:
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    if None in (title_elem, company_elem, location_elem):
        continue
    print(title_elem.text.strip())
    print(company_elem.text.strip())
    print(location_elem.text.strip())
    print()

Innovation Center-Software Engineer
Cisco
Perth, WA

SQL BI (SSRS, SSIS) developer for Blackboard - NYC
LanceSoft Inc
New york, WA

Software Engineer - Imagery Warehouse Services - Perth, Western Australia
EagleView Technologies
Perth, WA

Python Developer
LanceSoft Inc
Woodlands, WA

Software Development Manager - Sydney, New South Wales
MRI Software
Sydney, NSW

Senior Sales Engineer
Zuora
Sydney Australia, NSW

Runner
Vacasa
The Blue Mountains, ON

Senior Sales Engineer
Zuora
Melbourne, VIC

Senior Practice Manager - IES (WA)
Blue Ocean Ventures
New York, WA

Systems Administrator - Sydney, New South Wales
MRI Software
Sydney, NSW

Software Applications Developers, Software Developers, Sr. Software Applications Developers and Sr.
Avant-Garde
Framingham, MA

Strategic Account Executive
Zuora
Sydney, NSW



In [39]:
#extraer atributos
python_jobs = results.find_all('h2',
                               string=lambda text: "python" in text.lower())

for p_job in python_jobs:
    link = p_job.find('a')['href']
    print(p_job.text.strip())
    print("Apply here: {}".format(link))

Python Developer
Apply here: https://job-openings.monster.com/python-developer-woodlands-wa-us-lancesoft-inc/4755ec59-d0db-4ce9-8385-b4df7c1e9f7c


In [1]:
#Beautiful Soup CASO 2 - peliculas
#https://medium.com/@kimdang229/python-and-beautifulsoup-web-scraping-tutorial-1d47e7a38fab

In [2]:
import requests
from bs4 import BeautifulSoup

In [4]:
#top 25 https://www.imdb.com/list/ls024149810/
#obtener pagina
url = 'https://www.imdb.com/list/ls024149810/'
r = requests.get(url)
print(r.status_code)

200


In [5]:
#crear objeto
soup = BeautifulSoup(r.content, 'html.parser')

In [None]:
#inspeccionar en chrome
#<div class="lister-item-content">

In [10]:
#extrar titulo - objeto completo
titles = soup.find_all('h3',{'class':'lister-item-header'})
for title in titles:
    print(title)

<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt0068646/">El padrino</a>
<span class="lister-item-year text-muted unbold">(1972)</span>
</h3>
<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">2.</span>
<a href="/title/tt0111161/">Sueños de fuga</a>
<span class="lister-item-year text-muted unbold">(1994)</span>
</h3>
<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">3.</span>
<a href="/title/tt0033467/">Citizen Kane</a>
<span class="lister-item-year text-muted unbold">(1941)</span>
</h3>
<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">4.</span>
<a href="/title/tt0080684/">El imperio contraataca</a>
<span class="lister-item-year text-muted unbold">(1980)</span>
</h3>
<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">5.</span>
<a href="/title/tt0167260/">El señor de los anillos: El retorno 

In [9]:
#extraer solo el titulo
movie_title = []
for title in titles:
    movietitle = title.find('a', href = True).get_text()
    movie_title.append(movietitle)
print(movie_title)

['El padrino', 'Sueños de fuga', 'Citizen Kane', 'El imperio contraataca', 'El señor de los anillos: El retorno del rey', 'Batman - El caballero de la noche', '12 hombres en pugna', 'La lista de Schindler', 'Lo bueno, lo malo y lo feo', 'El padrino II', 'Tiempos violentos', 'El club de la pelea', 'Psicosis', '2001: Odisea del espacio', 'Metropolis', 'La guerra de las galaxias', 'El señor de los anillos: La comunidad del anillo', 'Terminator 2: el juicio final', 'Matrix', 'Los cazadores del arca perdida', 'Casablanca', 'El mago de Oz', 'Los siete samuráis', 'Forrest Gump', 'El origen']


In [21]:
#extraer el rating directamente con .text
movie_rating = []
ratings = soup.find_all('div',{'class':'ipl-rating-star small'})
for rating in ratings:
    movie_rating.append(rating.text.strip())
print(movie_rating)

['9.2', '9.3', '8.3', '8.7', '8.9', '9', '8.9', '8.9', '8.8', '9', '8.9', '8.8', '8.5', '8.3', '8.3', '8.6', '8.8', '8.5', '8.7', '8.4', '8.5', '8', '8.6', '8.8', '8.8']


In [23]:
#PRACTICA extraer genero
#SOLUCION
movie_genre = []
for genre in soup.findAll('span', attrs= {'class':'genre'}):
    genre = genre.get_text()
    movie_genre.append(genre.strip())
print(movie_genre)

['Crime, Drama', 'Drama', 'Drama, Mystery', 'Action, Adventure, Fantasy', 'Adventure, Drama, Fantasy', 'Action, Crime, Drama', 'Drama', 'Biography, Drama, History', 'Western', 'Crime, Drama', 'Crime, Drama', 'Drama', 'Horror, Mystery, Thriller', 'Adventure, Sci-Fi', 'Drama, Sci-Fi', 'Action, Adventure, Fantasy', 'Adventure, Drama, Fantasy', 'Action, Sci-Fi', 'Action, Sci-Fi', 'Action, Adventure', 'Drama, Romance, War', 'Adventure, Family, Fantasy', 'Action, Adventure, Drama', 'Drama, Romance', 'Action, Adventure, Sci-Fi']


In [25]:
#extraer runtime
movie_runtime = []
for runtime in soup.findAll('span', attrs= {'class':'runtime'}):
    runtime = runtime.get_text()
    movie_runtime.append(runtime.strip())
print(movie_runtime)

['175 min', '142 min', '119 min', '124 min', '201 min', '152 min', '96 min', '195 min', '161 min', '202 min', '154 min', '139 min', '109 min', '149 min', '153 min', '121 min', '178 min', '137 min', '136 min', '115 min', '102 min', '102 min', '207 min', '142 min', '148 min']


In [26]:
#crear un dataframe a partir de las listas
import pandas as pd 

df = pd.DataFrame(list(zip(movie_title, movie_rating, movie_genre, movie_runtime)), 
               columns =['Title', 'Rating', 'Genre', 'Runtime']) 
df

Unnamed: 0,Title,Rating,Genre,Runtime
0,El padrino,9.2,"Crime, Drama",175 min
1,Sueños de fuga,9.3,Drama,142 min
2,Citizen Kane,8.3,"Drama, Mystery",119 min
3,El imperio contraataca,8.7,"Action, Adventure, Fantasy",124 min
4,El señor de los anillos: El retorno del rey,8.9,"Adventure, Drama, Fantasy",201 min
5,Batman - El caballero de la noche,9.0,"Action, Crime, Drama",152 min
6,12 hombres en pugna,8.9,Drama,96 min
7,La lista de Schindler,8.9,"Biography, Drama, History",195 min
8,"Lo bueno, lo malo y lo feo",8.8,Western,161 min
9,El padrino II,9.0,"Crime, Drama",202 min
