In [2]:
import os
from urllib.parse import urljoin

import requests
from lxml import html
from pymongo import MongoClient

## Requesting the HTML code from the website

https://coinmarketcap.com/

In [3]:
# Fetch the homepage with a browser-like User-Agent header.
# The explicit timeout (seconds) keeps the cell from hanging forever if the
# server never answers; requests applies no timeout by default.
response = requests.get(
    "https://coinmarketcap.com/",
    headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36"},
    timeout=30,
)
response

<Response [200]>

**Note:** One important thing to note about this website is that the table data is generated via jQuery using a package called [DataTables](https://datatables.net/). jQuery is a library built on JavaScript, and the `requests` module does not execute JavaScript (or jQuery). So the HTML page that we get in the response might differ from what the browser shows.

We can disable JavaScript in the browser and then reload the page to see the site the way `requests` receives it. We can then use that JavaScript-free version of the site as the reference for further scraping.

For scraping JavaScript-enabled websites, we will use [Splash](https://splash.readthedocs.io/en/stable/).

In [4]:
# Dump the raw HTML body of the response to inspect what was actually received
print(response.text)

<!DOCTYPE html><html lang="en"><head><meta charSet="utf-8" /><meta http-equiv="x-ua-compatible" content="ie=edge" /><meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no, shrink-to-fit=no" /><link rel="preconnect" href="https://s2.coinmarketcap.com" crossorigin="anonymous" /><link rel="dns-prefetch" href="https://s2.coinmarketcap.com" crossorigin="anonymous" /><link rel="preconnect" href="https://s3.coinmarketcap.com" crossorigin="anonymous" /><link rel="dns-prefetch" href="https://s3.coinmarketcap.com" crossorigin="anonymous" /><link rel="preconnect" href="//www.googletagmanager.com" /><link rel="preload" href="https://s2.coinmarketcap.com/static/cloud/fonts/cmc-v2/CMC-V2.woff" as="font" type="font/woff2" crossorigin="anonymous" /><meta name="google-site-verification" content="EDc1reqlQ-zAgeRrrgAxRXNK-Zs9JgpE9a0wdaoSO9A" /><meta property="og:image" content="https://s2.coinmarketcap.com/static/cloud/img/splash_600x315_1.png?_=e1b1014" /><

If we get a ```UnicodeEncodeError```, we can replace ```text``` with ```content``` like this:

```python
response.content
```
This returns the HTML code as bytes.

In [5]:
# Parse the response text into an lxml element so it can be queried with XPath
tree = html.fromstring(response.text)
tree

<Element html at 0x1b2963af548>

## Scraping the data

In [6]:
# XPath selecting every <tr> in the body of the homepage currency table
row_xpath = "//table[@class='cmc-table cmc-table___11lFC cmc-table-homepage___2_guh ']/tbody/tr"
currencies = tree.xpath(row_xpath)
len(currencies)

100

In [22]:
data = []

# enumerate(..., start=1) supplies the 1-based row position that is stored as `_id`.
for count, currency in enumerate(currencies, start=1):
    # Each XPath is a union of two alternative paths for the same cell.
    names = currency.xpath(".//td[3]/div/a/div/div/p/text() | .//td[3]/a/span[2]/text()")
    price_parts = currency.xpath(".//td[4]/div/a/text() | .//td[4]/span/text()")

    data.append({
        "_id": count,
        # A row without a name match is kept (with None) instead of raising IndexError.
        "name": names[0] if names else None,
        # The price may be split across several text nodes: always join them so the
        # stored value is a string (or None when nothing matched), never a list.
        "price": "".join(price_parts) if price_parts else None,
    })

In [23]:
# Preview the first ten scraped records
data[:10]

[{'_id': 1, 'name': 'Bitcoin', 'price': '$54,519.17'},
 {'_id': 2, 'name': 'Ethereum', 'price': '$2,539.18'},
 {'_id': 3, 'name': 'Binance Coin', 'price': '$560.90'},
 {'_id': 4, 'name': 'XRP', 'price': '$1.40'},
 {'_id': 5, 'name': 'Tether', 'price': '$0.9998'},
 {'_id': 6, 'name': 'Cardano', 'price': '$1.29'},
 {'_id': 7, 'name': 'Dogecoin', 'price': '$0.2725'},
 {'_id': 8, 'name': 'Polkadot', 'price': '$34.44'},
 {'_id': 9, 'name': 'Uniswap', 'price': '$39.67'},
 {'_id': 10, 'name': 'Litecoin', 'price': '$253.11'}]

## Recursive scraping

Making a recursive function to scrape the data from all the remaining pages of the website:

In [31]:
all_currencies = []

def scrape(url, count=0):
    """Scrape the currency table at ``url`` and, recursively, every following page.

    Each table row becomes a dict ``{"_id", "name", "price"}`` that is appended
    to the module-level ``all_currencies`` list. ``_id`` is a running counter
    (it is not read from the page). After the rows of a page are collected, the
    "Next" pagination link is looked up; if present, the function calls itself
    on that page, continuing the counter.

    Note: results accumulate in ``all_currencies``; reset that list before
    scraping again, otherwise the new records are appended after the old ones.

    Parameters
    ----------
    url : str
        Page to start scraping from.
    count : int, default 0
        Number of rows already numbered; the first row on this page gets
        ``_id == count + 1``.

    Returns
    -------
    list of dict
        The module-level ``all_currencies`` list.

    Raises
    ------
    requests.HTTPError
        If a page answers with a 4xx/5xx status.
    requests.exceptions.Timeout
        If a page does not answer within the timeout.
    """
    response = requests.get(
        url,
        headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36"},
        timeout=30,  # seconds; without it a stalled connection blocks the crawl forever
    )
    # Fail loudly on an error page instead of parsing it and silently stopping early.
    response.raise_for_status()

    tree = html.fromstring(response.text)

    rows = tree.xpath("//table[@class='cmc-table cmc-table___11lFC cmc-table-homepage___2_guh ']/tbody/tr")

    for row in rows:
        count += 1
        # Each XPath is a union of two alternative paths for the same cell.
        names = row.xpath(".//td[3]/div/a/div/div/p/text() | .//td[3]/a/span[2]/text()")
        price_parts = row.xpath(".//td[4]/div/a/text() | .//td[4]/span/text()")

        all_currencies.append({
            "_id": count,
            # Keep the row (name=None) rather than raising IndexError on a miss.
            "name": names[0] if names else None,
            # Join however many text nodes matched so the price is always a
            # string, or None when nothing matched (never a list).
            "price": "".join(price_parts) if price_parts else None,
        })

    next_page = tree.xpath("//a[contains(@class,'chevron') and contains(@aria-label,'Next')]/@href")
    if next_page:
        next_page_url = urljoin(base=url, url=next_page[0])
        # Pass the running counter itself so ids stay contiguous even when the
        # caller started from a non-zero ``count`` or the list already held data.
        scrape(next_page_url, count=count)

    return all_currencies

**Note:** The id is set manually and not scraped from the website, because JavaScript is not executed while scraping, and without it the site does not generate the id for all the currencies.

In [32]:
%%time
# Crawl starting from the homepage and report how many records were collected
url = "https://coinmarketcap.com"
data = scrape(url)
print(len(data))

4855
Wall time: 12.5 s


## storing the data into mongodb cloud

Connection string:

```mongodb://kavyajeetbora:<password>@cluster0-shard-00-00.b6lqy.mongodb.net:27017,cluster0-shard-00-01.b6lqy.mongodb.net:27017,cluster0-shard-00-02.b6lqy.mongodb.net:27017/myFirstDatabase?ssl=true&replicaSet=atlas-9j4a8j-shard-0&authSource=admin&retryWrites=true&w=majority```

In [12]:
# Print the built-in documentation of the MongoClient class
help(MongoClient)

Help on class MongoClient in module pymongo.mongo_client:

class MongoClient(pymongo.common.BaseObject)
 |  A client-side representation of a MongoDB cluster.
 |  
 |  Instances can represent either a standalone MongoDB server, a replica
 |  set, or a sharded cluster. Instances of this class are responsible for
 |  maintaining up-to-date state of the cluster, and possibly cache
 |  resources related to this, including background threads for monitoring,
 |  and connection pools.
 |  
 |  Method resolution order:
 |      MongoClient
 |      pymongo.common.BaseObject
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __enter__(self)
 |  
 |  __eq__(self, other)
 |      Return self==value.
 |  
 |  __exit__(self, exc_type, exc_val, exc_tb)
 |  
 |  __getattr__(self, name)
 |      Get a database by name.
 |      
 |      Raises :class:`~pymongo.errors.InvalidName` if an invalid
 |      database name is used.
 |      
 |      :Parameters:
 |        - `name`: the name of the data

```python
client = MongoClient("mongodb://kavyajeetbora:<password>@cluster0-shard-00-00.b6lqy.mongodb.net:27017,cluster0-shard-00-01.b6lqy.mongodb.net:27017,cluster0-shard-00-02.b6lqy.mongodb.net:27017/myFirstDatabase?ssl=true&replicaSet=atlas-9j4a8j-shard-0&authSource=admin&retryWrites=true&w=majority")
```

Open the connection using this code, and always close it when you are done:

```python
client.close()
```

In [52]:
def insert_data_to_db(currency_data, uri=None):
    """Insert or refresh scraped currency records in the ``currencies.prices`` collection.

    For every record, the stored document with the same ``_id`` is looked up:

    * no stored document -> the record is inserted;
    * stored document whose ``name`` or ``price`` differs -> it is replaced and
      the old/new pair is printed;
    * identical stored document -> nothing is written.

    Parameters
    ----------
    currency_data : iterable of dict
        Records with the keys ``"_id"``, ``"name"`` and ``"price"``.
    uri : str, optional
        MongoDB connection string. When omitted it is read from the
        ``MONGODB_URI`` environment variable, so that credentials never have
        to be written into the notebook.

    Raises
    ------
    RuntimeError
        If no connection string is given and ``MONGODB_URI`` is not set.
    """
    uri = uri or os.environ.get("MONGODB_URI")
    if not uri:
        raise RuntimeError(
            "MongoDB connection string missing: pass `uri` or set the "
            "MONGODB_URI environment variable."
        )

    # The context manager closes the client even if a database call raises.
    with MongoClient(uri) as client:
        # database "currencies", collection "prices"
        collection = client["currencies"]["prices"]
        for currency in currency_data:
            record = collection.find_one({"_id": currency["_id"]})
            if record is None:
                collection.insert_one(currency)
            elif record["name"] != currency["name"] or record["price"] != currency["price"]:
                # `_id` is only a position counter, so a different currency can
                # show up under an existing `_id`; refresh the document on a name
                # change as well as on a price change, otherwise it goes stale.
                collection.replace_one({"_id": record["_id"]}, currency)
                print(f"Old item: {record}, new item: {currency}")

In [None]:
%%time
# Store the scraped records; timed because the function queries the database for every record
insert_data_to_db(data)

In [44]:
def search_item(_id=1, uri=None):
    """Look up one document by ``_id`` in ``currencies.prices`` and print it.

    Prints the document (or ``None`` when nothing matches) together with its
    Python type. Nothing is returned.

    Parameters
    ----------
    _id : int, default 1
        Value of the ``_id`` field to search for.
    uri : str, optional
        MongoDB connection string. When omitted it is read from the
        ``MONGODB_URI`` environment variable, so that credentials never have
        to be written into the notebook.

    Raises
    ------
    RuntimeError
        If no connection string is given and ``MONGODB_URI`` is not set.
    """
    uri = uri or os.environ.get("MONGODB_URI")
    if not uri:
        raise RuntimeError(
            "MongoDB connection string missing: pass `uri` or set the "
            "MONGODB_URI environment variable."
        )

    # The context manager closes the client even if the query raises.
    with MongoClient(uri) as client:
        collection = client["currencies"]["prices"]
        record = collection.find_one({"_id":_id})
        print(record, type(record))
    
# Query an _id that has no matching document: the printed record is None
search_item(_id=10000)

None <class 'NoneType'>


In [43]:
# Check the id assigned to the first scraped record
first_record = data[0]
first_record["_id"]

1