# Advanced Querying Mongo

Importing libraries and setting up connection

In [2]:
from pymongo import MongoClient

dbName = "companies"
mongodbURL = f"mongodb://localhost/{dbName}"
print(mongodbURL)
client = MongoClient(mongodbURL, connectTimeoutMS=2000,serverSelectionTimeoutMS=2000)
db = client.get_database()

mongodb://localhost/companies


In [3]:
cur = db.list_collections()
for collection in cur:
    print(collection)

{'name': 'companies', 'type': 'collection', 'options': {}, 'info': {'readOnly': False, 'uuid': UUID('2b3b6df9-70f0-45fe-9456-0fd22d6970b8')}, 'idIndex': {'v': 2, 'key': {'_id': 1}, 'name': '_id_', 'ns': 'companies.companies'}}


### 1. All the companies whose name match 'Babelgum'. Retrieve only their `name` field.

In [10]:
# Your Code
for c in db.companies.find({"name":"Babelgum"}):
    print(c["name"])

Babelgum


### 2. All the companies that have more than 5000 employees. Limit the search to 20 companies and sort them by **number of employees**.

In [159]:
# Your Code
%time
more = [(c["name"],c["number_of_employees"]) for c in db.companies.find({"number_of_employees": {"$gt": 5000} })]
more=sorted(more, key=lambda m: m[1], reverse=True)
more[:20]

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.58 µs


[('Siemens', 405000),
 ('IBM', 388000),
 ('Toyota', 320000),
 ('PayPal', 300000),
 ('Nippon Telegraph and Telephone Corporation', 227000),
 ('Samsung Electronics', 221726),
 ('Accenture', 205000),
 ('Tata Consultancy Services', 200300),
 ('Flextronics International', 200000),
 ('Safeway', 186000),
 ('Sony', 180500),
 ('LG', 177000),
 ('Ford', 171000),
 ('Boeing', 160000),
 ('Digital Equipment Corporation', 140000),
 ('Nokia', 125000),
 ('MItsubishi Electric', 107000),
 ('MItsubishi Electric', 107000),
 ('Comcast', 100000),
 ('Bertelsmann', 100000)]

He hecho ambas queries para ver la diferencia de velocidad, que al contrario de lo esperado es mas rapida la primera.

In [203]:
%time
z = db.companies.find({"number_of_employees": {"$gt": 5000} }, {"_id":0,"name":1, "number_of_employees":1}).sort([("number_of_employees", DESCENDING)]).limit(20)
list(z)

CPU times: user 10 µs, sys: 1 µs, total: 11 µs
Wall time: 21.2 µs


[{'name': 'Siemens', 'number_of_employees': 405000},
 {'name': 'IBM', 'number_of_employees': 388000},
 {'name': 'Toyota', 'number_of_employees': 320000},
 {'name': 'PayPal', 'number_of_employees': 300000},
 {'name': 'Nippon Telegraph and Telephone Corporation',
  'number_of_employees': 227000},
 {'name': 'Samsung Electronics', 'number_of_employees': 221726},
 {'name': 'Accenture', 'number_of_employees': 205000},
 {'name': 'Tata Consultancy Services', 'number_of_employees': 200300},
 {'name': 'Flextronics International', 'number_of_employees': 200000},
 {'name': 'Safeway', 'number_of_employees': 186000},
 {'name': 'Sony', 'number_of_employees': 180500},
 {'name': 'LG', 'number_of_employees': 177000},
 {'name': 'Ford', 'number_of_employees': 171000},
 {'name': 'Boeing', 'number_of_employees': 160000},
 {'name': 'Digital Equipment Corporation', 'number_of_employees': 140000},
 {'name': 'Nokia', 'number_of_employees': 125000},
 {'name': 'MItsubishi Electric', 'number_of_employees': 107000}

### 3. All the companies founded between 2000 and 2005, both years included. Retrieve only the `name` and `founded_year` fields.

In [243]:
# Your Code
%time
founded = [(c["name"],c["founded_year"]) for c in db.companies.find({"founded_year":{"$gte":2000,"$lte":2005}})]
founded = (sorted(founded, key=lambda e: e[1], reverse=True))
print(founded[:10],"\n\n\n",founded[-10:])

CPU times: user 6 µs, sys: 1e+03 ns, total: 7 µs
Wall time: 12.6 µs
[('Omnidrive', 2005), ('Zoho', 2005), ('Helio', 2005), ('Wetpaint', 2005), ('Wesabe', 2005), ('Jangl SMS', 2005), ('Jingle Networks', 2005), ('LifeLock', 2005), ('Jajah', 2005), ('YouTube', 2005)] 


 [('PaymentOne', 2000), ('optionsXpress', 2000), ('Skywire Software', 2000), ('Sirific Wireless', 2000), ('LogicLibrary', 2000), ('Vigilos', 2000), ('Block Shield', 2000), ('Netrake', 2000), ('EnterSys Group', 2000), ('EnteGreat', 2000)]


Aqui he hecho dos queries tb para repertir el mismo experimento y en este caso es mas rapida la 2.
Sin embargo es algo aleatorio ya que he tirado los dos ejercicios varias veces y de vez cuando es mas rapida una u otra

In [242]:
%time
b = db.companies.find({"founded_year":{"$gte":2000,"$lte":2005}},{"_id":0,"name":1, "founded_year":1}).sort([("founded_year", DESCENDING)])
list(b)

CPU times: user 6 µs, sys: 1 µs, total: 7 µs
Wall time: 12.9 µs


[{'name': 'Omnidrive', 'founded_year': 2005},
 {'name': 'Zoho', 'founded_year': 2005},
 {'name': 'Helio', 'founded_year': 2005},
 {'name': 'Wetpaint', 'founded_year': 2005},
 {'name': 'Wesabe', 'founded_year': 2005},
 {'name': 'Jangl SMS', 'founded_year': 2005},
 {'name': 'Jingle Networks', 'founded_year': 2005},
 {'name': 'LifeLock', 'founded_year': 2005},
 {'name': 'Jajah', 'founded_year': 2005},
 {'name': 'YouTube', 'founded_year': 2005},
 {'name': 'Revision3', 'founded_year': 2005},
 {'name': 'iSkoot', 'founded_year': 2005},
 {'name': 'ClipBlast!', 'founded_year': 2005},
 {'name': 'Zlio', 'founded_year': 2005},
 {'name': 'RockYou', 'founded_year': 2005},
 {'name': 'Dailymotion', 'founded_year': 2005},
 {'name': 'Instructables', 'founded_year': 2005},
 {'name': 'Slide', 'founded_year': 2005},
 {'name': 'Swivel', 'founded_year': 2005},
 {'name': 'TripHub', 'founded_year': 2005},
 {'name': 'Netvibes', 'founded_year': 2005},
 {'name': 'Bebo', 'founded_year': 2005},
 {'name': 'PBworks',

### 4. All the companies that had a Valuation Amount of more than 100.000.000 and have been founded before 2010. Retrieve only the `name` and `ipo` fields.

In [210]:
# Your Code
from pandas.io.json import json_normalize

val_am = db.companies.find({"$and":[{"ipo.valuation_amount":{"$gt":100000000}},{"founded_year":{"$lte":2010}}]},{"_id":0,"name":1,"ipo":1}).sort([("ipo.valuation_amount", DESCENDING)])
df = pd.DataFrame(json_normalize(val_am))
df

Unnamed: 0,name,ipo.valuation_amount,ipo.valuation_currency_code,ipo.pub_year,ipo.pub_month,ipo.pub_day,ipo.stock_symbol
0,Facebook,104000000000,USD,2012.0,5.0,18.0,NASDAQ:FB
1,Amazon,100000000000,USD,1997.0,5.0,,NASDAQ:AMZN
2,Twitter,18100000000,USD,2013.0,11.0,7.0,NYSE:TWTR
3,Groupon,12800000000,USD,2011.0,11.0,7.0,NASDAQ:GRPN
4,Tencent,11000000000,USD,2004.0,6.0,16.0,HK:0700
5,Western Digital,9430000000,USD,,,,NYSE:WDC
6,LinkedIn,9310000000,USD,2011.0,7.0,20.0,NYSE:LNKD
7,BMC Software,6000000000,USD,1988.0,8.0,12.0,NASDAQ:BMC
8,Rackspace,5440000000,USD,2011.0,11.0,7.0,RAX
9,Baidu,4000000000,USD,2005.0,8.0,5.0,NASDAQ:BIDU


### 5. All the companies that have less than 1000 employees and have been founded before 2005. Order them by the number of employees and limit the search to 10 companies.

In [193]:
# Your Code
import pandas as pd
comps_less = db.companies.find({"$and":[{"number_of_employees":{"$lt":1000}},{"founded_year":{"$lt":2005}}]},{"name":1,"number_of_employees":1,"founded_year":1}).sort([("number_of_employees", DESCENDING)]).limit(10)
df = pd.DataFrame(comps_less)
df

Unnamed: 0,_id,name,number_of_employees,founded_year
0,52cdef7d4bab8bd675298933,Infinera Corporation,974,2000
1,52cdef7e4bab8bd67529ac95,NorthPoint Communications Group,948,1997
2,52cdef7f4bab8bd67529be17,888 Holdings,931,1997
3,52cdef7c4bab8bd6752986a2,Forrester Research,903,1983
4,52cdef7e4bab8bd67529af6d,SonicWALL,900,1991
5,52cdef7e4bab8bd67529b21b,Webmetrics,900,1999
6,52cdef7e4bab8bd67529b3bd,Cornerstone OnDemand,881,1999
7,52cdef7c4bab8bd675297de0,Yelp,800,2004
8,52cdef7c4bab8bd675297efd,ZoomInfo,800,2000
9,52cdef7c4bab8bd675297ef5,MySpace,800,2003


### 6. All the companies that don't include the `partners` field.

In [204]:
# Your Code
partners_not = db.companies.find({"partners":{"$exists":False}})
list(partners_not)

[]

### 7. All the companies that have a null type of value on the `category_code` field.

In [213]:
# Your Code
cat_null = db.companies.find({"category_code": {"$type": 10}},{"_id":0,"name":1,"category_code":1})
df = pd.DataFrame(cat_null)
df.head(30)

Unnamed: 0,name,category_code
0,Collective,
1,Snimmer,
2,KoolIM,
3,Level9 Media,
4,VidKing,
5,Drigg,
6,SpaceTime,
7,Touch Clarity,
8,MMDAYS,
9,Inside Group,


### 8. All the companies that have at least 100 employees but less than 1000. Retrieve only the `name` and `number of employees` fields.

In [221]:
# Your Code
min_max = db.companies.find({"$and":[{"number_of_employees":{"$gte":100}},{"number_of_employees":{"$lt":1000}}]},{"_id":0,"name":1,"number_of_employees":1}).sort([("number_of_employees", DESCENDING)])
df= pd.DataFrame(min_max)
print(df.head(),"\n\n",df.tail())

                              name  number_of_employees
0                      Datamonitor                  984
1             Infinera Corporation                  974
2                              Box                  950
3  NorthPoint Communications Group                  948
4                     888 Holdings                  931 

                            name  number_of_employees
912                 ZoomSystems                  100
913                       Exent                  100
914                    Mashable                  100
915  Applied Language Solutions                  100
916                    NextLabs                  100


### 9. Order all the companies by their IPO price in a descending order.

In [226]:
# Your Code
#he puesto limitie porque es muy grande el db y falla
order_comp = db.companies.find({},{"name":1,"_id":0,"ipo.valuation_amount":1}).sort([("ipo.valuation_amount", DESCENDING)]).limit(10)
list(order_comp)

[{'name': 'GREE', 'ipo': {'valuation_amount': 108960000000}},
 {'name': 'Facebook', 'ipo': {'valuation_amount': 104000000000}},
 {'name': 'Amazon', 'ipo': {'valuation_amount': 100000000000}},
 {'name': 'Twitter', 'ipo': {'valuation_amount': 18100000000}},
 {'name': 'Groupon', 'ipo': {'valuation_amount': 12800000000}},
 {'name': 'Tencent', 'ipo': {'valuation_amount': 11000000000}},
 {'name': 'Western Digital', 'ipo': {'valuation_amount': 9430000000}},
 {'name': 'LinkedIn', 'ipo': {'valuation_amount': 9310000000}},
 {'name': 'BMC Software', 'ipo': {'valuation_amount': 6000000000}},
 {'name': 'Rackspace', 'ipo': {'valuation_amount': 5440000000}}]

### 10. Retrieve the 10 companies with more employees, order by the `number of employees`

In [244]:
# Your Code
more_emp = db.companies.find({},{"name":1,"_id":0,"number_of_employees":1}).sort([("number_of_employees",DESCENDING)]).limit(10)
list(more_emp)

[{'name': 'Siemens', 'number_of_employees': 405000},
 {'name': 'IBM', 'number_of_employees': 388000},
 {'name': 'Toyota', 'number_of_employees': 320000},
 {'name': 'PayPal', 'number_of_employees': 300000},
 {'name': 'Nippon Telegraph and Telephone Corporation',
  'number_of_employees': 227000},
 {'name': 'Samsung Electronics', 'number_of_employees': 221726},
 {'name': 'Accenture', 'number_of_employees': 205000},
 {'name': 'Tata Consultancy Services', 'number_of_employees': 200300},
 {'name': 'Flextronics International', 'number_of_employees': 200000},
 {'name': 'Safeway', 'number_of_employees': 186000}]

### 11. All the companies founded on the second semester of the year. Limit your search to 1000 companies.

In [247]:
# Your Code
sec_sem = db.companies.find({"founded_month":{"$gt":6}},{"_id":0,"name":1,"founded_month":1}).limit(1000)
df = pd.DataFrame(sec_sem)
df.head(30)

Unnamed: 0,name,founded_month
0,Omnidrive,11
1,Digg,10
2,Zoho,9
3,Wetpaint,10
4,Joost,10
5,Plaxo,11
6,Powerset,10
7,Kyte,12
8,Thoof,12
9,Wesabe,12


### 12. All the companies founded before 2000 that have an acquisition amount of more than 10.000.00

In [261]:
# Your Code
found_bef = db.companies.find({"$and":[{"founded_year":{"$lt":2000}},{"acquisition.price_amount":{"$gt":10000}}]},{"_id":0,"name":1,"founded_year":1,"acquisition.price_amount":1})
df=pd.DataFrame(json_normalize(found_bef))
df.head(30)

Unnamed: 0,name,founded_year,acquisition.price_amount
0,SideStep,1999,180000000
1,Recipezaar,1999,25000000
2,Cyworld,1999,7140000
3,Snapfish,1999,300000000
4,PayPal,1998,1500000000
5,Neopets,1999,160000000
6,Zappos,1999,1200000000
7,Alibaba,1999,1000000000
8,Sun Microsystems,1982,7400000000
9,Lastminute,1998,1008000000


### 13. All the companies that have been acquired after 2010, order by the acquisition amount, and retrieve only their `name` and `acquisition` field.

In [260]:
# Your Code
after = db.companies.find({"acquisition.acquired_year":{'$gt':2010}},{"_id":0,"name":1,"acquisition.acquired_year":1,"acquisition.price_amount":1}).sort("acquisition.price_amount",DESCENDING)
df=pd.DataFrame(json_normalize(after))
df.head(30)

Unnamed: 0,name,acquisition.price_amount,acquisition.acquired_year
0,Goodrich Corporation,18400000000.0,2011
1,LSI,6600000000.0,2013
2,National Semiconductor,6500000000.0,2011
3,Ariba,4300000000.0,2012
4,NetLogic Microsystems,3700000000.0,2011
5,SuccessFactors,3400000000.0,2012
6,Qualcomm Atheros,3100000000.0,2011
7,Global Crossing,3000000000.0,2011
8,Global Crossing,3000000000.0,2011
9,Sourcefire,2700000000.0,2013


### 14. Order the companies by their `founded year`, retrieving only their `name` and `founded year`.

In [263]:
# Your Code
#Executor error during find command :: caused by :: Sort operation used more than the maximum 33554432 bytes of 
#RAM. Add an index, or specify a smaller limit

year_f = db.companies.find({},{"_id":0,"name":1,"founded_year":1}).sort([("founded_year",DESCENDING)]).limit(40)
list(year_f)

[{'name': 'Fixya', 'founded_year': 2013},
 {'name': 'Wamba', 'founded_year': 2013},
 {'name': 'Advaliant', 'founded_year': 2013},
 {'name': 'Fluc', 'founded_year': 2013},
 {'name': 'iBazar', 'founded_year': 2013},
 {'name': 'Gimigo', 'founded_year': 2013},
 {'name': 'SEOGroup', 'founded_year': 2013},
 {'name': 'Clowdy', 'founded_year': 2013},
 {'name': 'WhosCall', 'founded_year': 2013},
 {'name': 'Pikk', 'founded_year': 2013},
 {'name': 'Tongxue', 'founded_year': 2013},
 {'name': 'Shopseen', 'founded_year': 2013},
 {'name': 'VistaGen Therapeutics', 'founded_year': 2013},
 {'name': 'PeekYou', 'founded_year': 2012},
 {'name': 'headr', 'founded_year': 2012},
 {'name': 'Pinger', 'founded_year': 2012},
 {'name': 'Widgetbox', 'founded_year': 2012},
 {'name': 'Mobiluck', 'founded_year': 2012},
 {'name': 'Skydeck', 'founded_year': 2012},
 {'name': 'Simplicant', 'founded_year': 2012},
 {'name': 'Springleap', 'founded_year': 2012},
 {'name': 'Jumbuck Entertainment', 'founded_year': 2012},
 {'nam

### 15. All the companies that have been founded on the first seven days of the month, including the seventh. Sort them by their `acquisition price` in a descending order. Limit the search to 10 documents.

In [265]:
# Your Code
seven_d = db.companies.find({"founded_day":{"$lte":7}},{"_id":0,"name":1,"founded_day":1,"acquisition.price_amount":1}).sort("acquisition.price_amount",DESCENDING).limit(10)
list(seven_d)

[{'name': 'Netscape',
  'founded_day': 4,
  'acquisition': {'price_amount': 4200000000}},
 {'name': 'PayPal',
  'founded_day': 1,
  'acquisition': {'price_amount': 1500000000}},
 {'name': 'Zappos',
  'founded_day': 1,
  'acquisition': {'price_amount': 1200000000}},
 {'name': 'Alibaba',
  'founded_day': 1,
  'acquisition': {'price_amount': 1000000000}},
 {'name': 'Postini',
  'founded_day': 2,
  'acquisition': {'price_amount': 625000000}},
 {'name': 'Danger',
  'founded_day': 1,
  'acquisition': {'price_amount': 500000000}},
 {'name': 'Clearwell Systems',
  'founded_day': 6,
  'acquisition': {'price_amount': 410000000}},
 {'name': 'PrimeSense',
  'founded_day': 1,
  'acquisition': {'price_amount': 345000000}},
 {'name': 'Amobee',
  'founded_day': 1,
  'acquisition': {'price_amount': 321000000}},
 {'name': 'BlueLithium',
  'founded_day': 1,
  'acquisition': {'price_amount': 300000000}}]

### 16. All the companies on the 'web' `category` that have more than 4000 employees. Sort them by the amount of employees in ascending order.

In [267]:
# Your Code
web = db.companies.find({"$and":[{"number_of_employees":{"$gt":4000}},{"category_code":"web"}]},{"_id":0,"name":1,"number_of_employees":1,"category_code":1}).sort([("number_of_employees",ASCENDING)])
list(web)

[{'name': 'Expedia', 'category_code': 'web', 'number_of_employees': 4400},
 {'name': 'AOL', 'category_code': 'web', 'number_of_employees': 8000},
 {'name': 'Webkinz', 'category_code': 'web', 'number_of_employees': 8657},
 {'name': 'Rakuten', 'category_code': 'web', 'number_of_employees': 10000},
 {'name': 'Los Angeles Times Media Group',
  'category_code': 'web',
  'number_of_employees': 10000},
 {'name': 'Groupon', 'category_code': 'web', 'number_of_employees': 10000},
 {'name': 'Yahoo!', 'category_code': 'web', 'number_of_employees': 13600},
 {'name': 'eBay', 'category_code': 'web', 'number_of_employees': 15000},
 {'name': 'Experian', 'category_code': 'web', 'number_of_employees': 15500}]

### 17. All the companies whose acquisition amount is more than 10.000.000, and currency is 'EUR'.

In [269]:
# Your Code
rich = db.companies.find({"$and":[{"acquisition.price_amount":{"$gt":10000000}},{"acquisition.price_currency_code":"EUR"}]},{"_id":0,"name":1,"acquisition.price_amount":1,"acquisition.price_currency_code":1})
list(rich)

[{'name': 'ZYB',
  'acquisition': {'price_amount': 31500000, 'price_currency_code': 'EUR'}},
 {'name': 'Apertio',
  'acquisition': {'price_amount': 140000000, 'price_currency_code': 'EUR'}},
 {'name': 'Greenfield Online',
  'acquisition': {'price_amount': 40000000, 'price_currency_code': 'EUR'}},
 {'name': 'Webedia',
  'acquisition': {'price_amount': 70000000, 'price_currency_code': 'EUR'}},
 {'name': 'Wayfinder',
  'acquisition': {'price_amount': 24000000, 'price_currency_code': 'EUR'}},
 {'name': 'Tuenti Technologies',
  'acquisition': {'price_amount': 70000000, 'price_currency_code': 'EUR'}},
 {'name': 'BioMed Central',
  'acquisition': {'price_amount': 43400000, 'price_currency_code': 'EUR'}}]

### 18. All the companies that have been acquired on the first trimester of the year. Limit the search to 10 companies, and retrieve only their `name` and `acquisition` fields.

In [272]:
# Your Code
first_tri = db.companies.find({"acquisition.acquired_month":{"$lt":4}},{"_id":0,"name":1,"acquisition.acquired_month":1}).limit(10)
list(first_tri)

[{'name': 'Kyte', 'acquisition': {'acquired_month': 1}},
 {'name': 'NetRatings', 'acquisition': {'acquired_month': 2}},
 {'name': 'Livestream', 'acquisition': {'acquired_month': 1}},
 {'name': 'blogTV', 'acquisition': {'acquired_month': 3}},
 {'name': 'iContact', 'acquisition': {'acquired_month': 2}},
 {'name': 'Coghead', 'acquisition': {'acquired_month': 2}},
 {'name': 'Dailymotion', 'acquisition': {'acquired_month': 2}},
 {'name': 'Flickr', 'acquisition': {'acquired_month': 3}},
 {'name': 'BabyCenter', 'acquisition': {'acquired_month': 3}},
 {'name': 'Netvibes', 'acquisition': {'acquired_month': 2}}]

# Bonus
### 19. All the companies that have been founded between 2000 and 2010, but have not been acquired before 2011.

In [277]:
# Your Code
between = db.companies.find({"$and":[{"founded_year":{"$gte":2000}},{"founded_year":{"$lte":2010}},{"acquisition.acquired_year":{"$lt":2011}}]},{"_id":0,"name":1,"founded_year":1,"acquisition.acquired_year":1})
df=pd.DataFrame(json_normalize(between))
df.head(40)

Unnamed: 0,name,founded_year,acquisition.acquired_year
0,Helio,2005,2008
1,StumbleUpon,2002,2009
2,Joost,2006,2009
3,Plaxo,2002,2008
4,Powerset,2006,2008
5,Veoh,2004,2010
6,Jajah,2005,2009
7,YouTube,2005,2006
8,Livestream,2007,2008
9,GrandCentral,2006,2007


### 20. All the companies that have been 'deadpooled' after the third year.

In [304]:
# Your Code
deadpool = db.companies.find({"deadpooled_year":{"$gt":1}},{"_id":0,"name":1,"founded_year":1,"deadpooled_year":1})
df=pd.DataFrame(deadpool)
df["substract"]=df.deadpooled_year-df.founded_year
df.loc[df['substract'] > 3].head(50)

Unnamed: 0,name,founded_year,deadpooled_year,substract
3,Babelgum,2007.0,2013,6.0
5,Thoof,2006.0,2013,7.0
7,Wesabe,2005.0,2010,5.0
9,Stickam,2006.0,2013,7.0
10,AllPeers,2004.0,2008,4.0
11,EQO,2006.0,2012,6.0
12,AllofMP3,2000.0,2007,7.0
13,SellABand,2006.0,2010,4.0
14,Zlio,2005.0,2011,6.0
15,Jaiku,2006.0,2012,6.0
