In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib inline 
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
import pymongo
from pymongo import MongoClient

# Connect to the local mongod instance. A single client is created here instead of
# building a default MongoClient() first and immediately replacing it.
client = MongoClient('localhost', 27017)

In [8]:
# NOTE(review): this cell repeats the configuration found at the top of the Twitter
# streaming cell near the end of the notebook - TODO confirm whether it is a leftover.
from __future__ import print_function
import tweepy
import json
from pymongo import MongoClient
 
# Connection URI for the local MongoDB server.
MONGO_HOST= 'mongodb://localhost/twitterdb'  # assuming you have mongoDB installed locally
                                             # and a database called 'twitterdb'
 
# Hashtags of interest.
WORDS = ['#insurance', '#AI', '#datascience', '#machinelearning', '#auto', '#reinsurance']
 
# The Twitter credentials are read from environment variables through IPython's
# %env line magic, so no secret is written into the notebook itself.
# (Keep comments off the %env lines: the magic would treat them as part of its argument.)
CONSUMER_KEY = %env TWITTER_CONSUMER_KEY
CONSUMER_SECRET = %env TWITTER_CONSUMER_SECRET
ACCESS_TOKEN =%env TWITTER_ACCESS_TOKEN
ACCESS_TOKEN_SECRET = %env TWITTER_ACCESS_TOKEN_SECRET


## MongoDB Example - reinsurance loss data analysis, JSON blobs and reading Twitter data

<!-- PELICAN_BEGIN_SUMMARY -->

MongoDB is a document-oriented database. Instead of storing your data in tables made out of individual rows,
like a relational database does, it stores your data in collections made out of individual documents.
In MongoDB, a document is a big JSON blob with no particular format or schema.

You can have all your data in one single collection.

<!-- PELICAN_END_SUMMARY -->

**MongoDB Example**
- Create document-style data in MongoDB
  - example: reinsurance treaty by reinsurer/year/treaty type        
- Create a document style data in MongoDB from Json API format
  - example: realtime stock price and realtime sector data        
- Read Twitter data into MongoDB  
  - example: read Twitter data about what type of insurance people are interested in buying

**Making a Connection with MongoClient**
-  The link below is an introductory tutorial on working with MongoDB and PyMongo
-  ref: http://api.mongodb.com/python/current/tutorial.html

In [4]:
# Making a Connection with MongoClient
# Import pymongo
import pymongo

# The first step when working with PyMongo is to create a MongoClient to the running mongod instance
from pymongo import MongoClient

# Connect on the default host and port.
client = MongoClient()

# We can also specify the host and port explicitly
# (this rebinds `client`; the instance created just above is no longer referenced).
client = MongoClient('localhost', 27017)

**Getting a Database**
- A single instance of MongoDB can support multiple independent databases.
- When working with PyMongo you access databases using attribute style access on MongoClient instances.
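- A database whose name is not a valid Python attribute name (e.g. "test-data") cannot be reached with attribute-style access; use dictionary-style access instead, e.g. client['test-data']. A name such as "test_data" works either way.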

In [5]:
# Get a handle on the database named "test_database" (attribute-style access on the client)
db = client.test_database

# Get a handle on the collection named "test_collection" inside that database
collection = db.test_collection

# Show the Collection object
collection

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'test_database'), 'test_collection')

#### **Document - example: reinsurance treaty by reinsurer/year/treaty type**
- Data in MongoDB is represented (and stored) using JSON-style documents. 
- In PyMongo we use dictionaries to represent documents.
- As an example, the following dictionary might be used to represent a reinsurance treaty type by reinsurer and year.
- Documents can contain native Python types (like datetime.datetime instances) which will be automatically converted to and from the appropriate BSON types.

In [6]:
import datetime

# First sample document: a reinsurance treaty described by a plain dict.
# The "date" value is a naive datetime holding the current UTC time.
treaty = dict(
    reinsurer="AIG",
    treaty="XOL layer",
    tags=["reinsurer", "treaty", "year"],
    date=datetime.datetime.utcnow(),
)

#### *Inserting a Document*
- When a document is inserted a special key, "_id", is automatically added if the document doesn’t already contain an "_id" key.
- The value of "_id" must be unique across the collection. insert_one() returns an instance of InsertOneResult. 

In [7]:
# To insert a document into a collection we can use the insert_one() method:
treaties = db.treaties
treaty_id = treaties.insert_one(treaty).inserted_id
treaty_id

ObjectId('5b80259a760df7342060e4b8')

**After inserting the first document, the treaties collection has actually been created on the server.**
- We can verify this by listing all of the collections in our database:

In [8]:
db.list_collection_names()

['profiles', 'treaties', 'census', 'sector_data', 'stock_data']

In [9]:
# Second sample treaty, with the same fields as the first one.
treaty2 = dict(
    reinsurer="Swiss Re",
    treaty="Clash Layer",
    tags=["reinsurer", "treaty", "year"],
    date=datetime.datetime.utcnow(),
)

# Store it and show the id of the inserted document.
insert_result = treaties.insert_one(treaty2)
treaty_id = insert_result.inserted_id
treaty_id

ObjectId('5b80259e760df7342060e4b9')

In [10]:
# Third sample treaty, with the same fields as the first one.
treaty3 = dict(
    reinsurer="Parter Re",
    treaty="XOL 2nd Layer",
    tags=["reinsurer", "treaty", "year"],
    date=datetime.datetime.utcnow(),
)

# Store it and show the id of the inserted document.
insert_result = treaties.insert_one(treaty3)
treaty_id = insert_result.inserted_id
treaty_id

ObjectId('5b8025a2760df7342060e4ba')

#### *Getting a Single Document With find_one()*
- The most basic type of query that can be performed in MongoDB is find_one().
- This method returns a single document matching a query (or None if there are no matches).
- It is useful when you know there is only one matching document, or
- are only interested in the first match. Here we use find_one() to get the first document from the treaties collection:

In [11]:
import pprint
pprint.pprint(treaties.find_one())

{'_id': ObjectId('5b7ed794760df705188fc61f'),
 'date': datetime.datetime(2018, 8, 23, 15, 49, 36, 898000),
 'reinsurer': 'AIG',
 'tags': ['reinsurer', 'treaty', 'year'],
 'treaty': 'XOL layer'}


In [12]:
pprint.pprint(treaties.find_one({"reinsurer": "AIG"}))

{'_id': ObjectId('5b7ed794760df705188fc61f'),
 'date': datetime.datetime(2018, 8, 23, 15, 49, 36, 898000),
 'reinsurer': 'AIG',
 'tags': ['reinsurer', 'treaty', 'year'],
 'treaty': 'XOL layer'}


In [13]:
pprint.pprint(treaties.find_one({"reinsurer": "ACE"}))

None


#### *Querying By ObjectId*
- We can also find a treaty by its _id, which in our example is an ObjectId:

In [14]:
treaty_id  ##output is an object

ObjectId('5b8025a2760df7342060e4ba')

In [15]:
pprint.pprint(treaties.find_one({"_id": treaty_id}))

{'_id': ObjectId('5b8025a2760df7342060e4ba'),
 'date': datetime.datetime(2018, 8, 24, 15, 34, 58, 292000),
 'reinsurer': 'Parter Re',
 'tags': ['reinsurer', 'treaty', 'year'],
 'treaty': 'XOL 2nd Layer'}


#### *Note that an ObjectId is not the same as its string representation:*

In [16]:
treaty_id_as_str = str(treaty_id)
treaty_id_as_str  ## output is a string

'5b8025a2760df7342060e4ba'

In [17]:
treaties.find_one({"_id": treaty_id_as_str}) # No result

**Bulk Inserts**
- We can also perform bulk insert operations, by passing a list as the first argument to insert_many(). 
- This will insert each document in the list, sending only a single command to the server.
- The second document in new_treaty has a different “shape” than the other treaties - there is no "tags" field and we’ve added a new field, "retenion".
- This is what we mean when we say that MongoDB is schema-free.

In [18]:
# new_treaty is a list of two documents for a single insert_many() call.
# The second document has a different "shape" than the other treaties:
# it has no "tags" field and carries an extra "retenion" field.
new_treaty = [
    {
        "reinsurer": "AIG",
        "treaty": "XOL Layer 2018",
        "tags": ["bulk", "insert"],
        "date": datetime.datetime(2018, 11, 12, 11, 14),
    },
    {
        "reinsurer": "Munich Re",
        "treaty": "QS 2018",
        "retenion": "QS 20% for US business",
        "date": datetime.datetime(2018, 11, 10, 10, 45),
    },
]

# Insert both documents at once and show the ids of the inserted documents.
result = treaties.insert_many(new_treaty)
result.inserted_ids

[ObjectId('5b8025b7760df7342060e4bb'), ObjectId('5b8025b7760df7342060e4bc')]

*The result from insert_many() now returns two ObjectId instances, one for each inserted document.*

**Query data**
- use find()
- use find_one()

In [19]:
# Print every document in the collection.
# The loop variable is not called `treaty`, so the `treaty` dict defined earlier
# in the notebook is not silently replaced by the last document read back.
for treaty_doc in treaties.find():
     pprint.pprint(treaty_doc)

{'_id': ObjectId('5b7ed794760df705188fc61f'),
 'date': datetime.datetime(2018, 8, 23, 15, 49, 36, 898000),
 'reinsurer': 'AIG',
 'tags': ['reinsurer', 'treaty', 'year'],
 'treaty': 'XOL layer'}
{'_id': ObjectId('5b7ed797760df705188fc620'),
 'date': datetime.datetime(2018, 8, 23, 15, 49, 43, 818000),
 'reinsurer': 'Swiss Re',
 'tags': ['reinsurer', 'treaty', 'year'],
 'treaty': 'Clash Layer'}
{'_id': ObjectId('5b7ed79a760df705188fc621'),
 'date': datetime.datetime(2018, 8, 23, 15, 49, 46, 797000),
 'reinsurer': 'Parter Re',
 'tags': ['reinsurer', 'treaty', 'year'],
 'treaty': 'XOL 2nd Layer'}
{'_id': ObjectId('5b7ed7ad760df705188fc622'),
 'date': datetime.datetime(2018, 11, 12, 11, 14),
 'reinsurer': 'AIG',
 'tags': ['bulk', 'insert'],
 'treaty': 'XOL Layer 2018'}
{'_id': ObjectId('5b7ed7ad760df705188fc623'),
 'date': datetime.datetime(2018, 11, 10, 10, 45),
 'reinsurer': 'Munich Re',
 'retenion': 'QS 20% for US business',
 'treaty': 'QS 2018'}
{'_id': ObjectId('5b80259a760df7342060e4b8

In [20]:
# Use find() for query

# Find all treaty in the collection
# for treaty_doc in treaties.find():
#      pprint.pprint(treaty_doc)

# Find all reinsurer-AIG treaty:
# (the loop variable is not called `treaty`, so the `treaty` dict defined earlier
# in the notebook is not overwritten by the documents read back)
for treaty_doc in treaties.find({"reinsurer": "AIG"}):
    pprint.pprint(treaty_doc)

{'_id': ObjectId('5b7ed794760df705188fc61f'),
 'date': datetime.datetime(2018, 8, 23, 15, 49, 36, 898000),
 'reinsurer': 'AIG',
 'tags': ['reinsurer', 'treaty', 'year'],
 'treaty': 'XOL layer'}
{'_id': ObjectId('5b7ed7ad760df705188fc622'),
 'date': datetime.datetime(2018, 11, 12, 11, 14),
 'reinsurer': 'AIG',
 'tags': ['bulk', 'insert'],
 'treaty': 'XOL Layer 2018'}
{'_id': ObjectId('5b80259a760df7342060e4b8'),
 'date': datetime.datetime(2018, 8, 24, 15, 34, 47, 804000),
 'reinsurer': 'AIG',
 'tags': ['reinsurer', 'treaty', 'year'],
 'treaty': 'XOL layer'}
{'_id': ObjectId('5b8025b7760df7342060e4bb'),
 'date': datetime.datetime(2018, 11, 12, 11, 14),
 'reinsurer': 'AIG',
 'tags': ['bulk', 'insert'],
 'treaty': 'XOL Layer 2018'}


In [21]:
# Find one reinsurer-AIG treaty:
pprint.pprint(treaties.find_one({"reinsurer": "AIG"}))

{'_id': ObjectId('5b7ed794760df705188fc61f'),
 'date': datetime.datetime(2018, 8, 23, 15, 49, 36, 898000),
 'reinsurer': 'AIG',
 'tags': ['reinsurer', 'treaty', 'year'],
 'treaty': 'XOL layer'}


#### *Counting*
- If we just want to know how many documents match a query we can perform a count_documents() operation instead of a full query.
- We can get a count of all of the documents in a collection:

In [22]:
# How many AIG treaty?
treaties.count_documents(
  filter={"reinsurer": "AIG"} 
)


# How many treaties?
treaties.count_documents(
  filter={} 
)

4

10

**Range Queries**
- MongoDB supports many different types of advanced queries.
- As an example, let's perform a query where we limit results to treaties older than a certain date, but also sort the results by reinsurer.

In [23]:
# Upper bound for the "date" field: only documents dated strictly before it match ($lt).
d = datetime.datetime(2018, 11, 12, 12)
# Results are sorted by the "reinsurer" field. The loop variable is not called
# `treaty`, so the `treaty` dict defined earlier in the notebook is not overwritten.
for treaty_doc in treaties.find({"date": {"$lt": d}}).sort("reinsurer"):
    pprint.pprint(treaty_doc)

{'_id': ObjectId('5b7ed794760df705188fc61f'),
 'date': datetime.datetime(2018, 8, 23, 15, 49, 36, 898000),
 'reinsurer': 'AIG',
 'tags': ['reinsurer', 'treaty', 'year'],
 'treaty': 'XOL layer'}
{'_id': ObjectId('5b7ed7ad760df705188fc622'),
 'date': datetime.datetime(2018, 11, 12, 11, 14),
 'reinsurer': 'AIG',
 'tags': ['bulk', 'insert'],
 'treaty': 'XOL Layer 2018'}
{'_id': ObjectId('5b80259a760df7342060e4b8'),
 'date': datetime.datetime(2018, 8, 24, 15, 34, 47, 804000),
 'reinsurer': 'AIG',
 'tags': ['reinsurer', 'treaty', 'year'],
 'treaty': 'XOL layer'}
{'_id': ObjectId('5b8025b7760df7342060e4bb'),
 'date': datetime.datetime(2018, 11, 12, 11, 14),
 'reinsurer': 'AIG',
 'tags': ['bulk', 'insert'],
 'treaty': 'XOL Layer 2018'}
{'_id': ObjectId('5b7ed7ad760df705188fc623'),
 'date': datetime.datetime(2018, 11, 10, 10, 45),
 'reinsurer': 'Munich Re',
 'retenion': 'QS 20% for US business',
 'treaty': 'QS 2018'}
{'_id': ObjectId('5b8025b7760df7342060e4bc'),
 'date': datetime.datetime(2018,

#### *Indexing*
- Adding indexes can help accelerate certain queries and can also add additional functionality to querying and storing documents.
- In this example, we’ll demonstrate how to create a unique index on a key that rejects documents whose value for that key already exists in the index.
- First, we’ll need to create the index:

In [24]:
 # Unique, ascending index on reinsurer_id; the return value is kept in `result`.
 index_spec = [('reinsurer_id', pymongo.ASCENDING)]
 result = db.profiles.create_index(index_spec, unique=True)
 # Iterating index_information() yields the index names, shown here in sorted order.
 sorted(db.profiles.index_information())

['_id_', 'reinsurer_id_1']

**Notice that we have two indexes now**
- one is the index on _id that MongoDB creates automatically,
- and the other is the index on reinsurer_id we just created.

In [26]:
 # set up some user profiles
    
 user_profiles = [
     {'reinsurer_id': 278, 'reinsurer': 'Atlantic Re'},
     {'reinsurer_id': 275, 'reinsurer': 'XL Re'}]
 result = db.profiles.insert_many(user_profiles)

In [None]:
 # The unique index was created on 'reinsurer_id', so the demo documents must use
 # that key (the original 'user_id' key is not covered by the index).
 # 275 already belongs to 'XL Re', so the second insert is rejected with a
 # DuplicateKeyError (E11000 duplicate key error) on the reinsurer_id index.
 from pymongo.errors import DuplicateKeyError

 new_profile = {'reinsurer_id': 213, 'reinsurer': 'XL American'}
 duplicate_profile = {'reinsurer_id': 275, 'reinsurer': 'SCOR S.E'}
 result = db.profiles.insert_one(new_profile)  # This is fine.
 try:
     result = db.profiles.insert_one(duplicate_profile)
 except DuplicateKeyError as exc:
     # The exception is the point of the demo: show it without aborting the run.
     print(exc)

**Create a document style data in MongoDB from Json API format**
- example: realtime intraday MSFT stock price
- example: realtime sector stock return 

In [27]:
realtime_stock_data={
"Meta Data": {
"1 Information": "Intraday (5min) open, high, low, close prices and volume",
"2 Symbol": "MSFT",
"3 Last Refreshed": "2018-08-22 15:55:00",
"4 Interval": "5min",
"5 Output Size": "Compact",
"6 Time Zone": "US/Eastern"
},
"Time Series (5min)": {
"2018-08-22 15:55:00": {
"1 open": "107.1500",
"2 high": "107.2100",
"3 low": "107.0500",
"4 close": "107.0500",
"5 volume": "970838"
}
}
}

In [28]:
stock_data = db.stock_data
stock_data_id = stock_data.insert_one(realtime_stock_data).inserted_id
stock_data_id

ObjectId('5b8025e7760df7342060e4c1')

In [29]:
realtime_sector_data={
    "Meta Data": {
        "Information": "US Sector Performance (realtime & historical)",
        "Last Refreshed": "02:05 PM ET 08/23/2018"
    },
    "Rank A: Real-Time Performance": {
        "Information Technology": "0.51%",
        "Consumer Discretionary": "-0.04%",
        "Utilities": "-0.11%",
        "Telecommunication Services": "-0.13%",
        "Health Care": "-0.20%",
        "Real Estate": "-0.25%",
        "Consumer Staples": "-0.27%",
        "Industrials": "-0.48%",
        "Financials": "-0.54%",
        "Energy": "-0.57%",
        "Materials": "-0.86%"
    },
    "Rank B: 1 Day Performance": {
        "Energy": "1.20%",
        "Information Technology": "0.48%",
        "Consumer Discretionary": "0.12%",
        "Health Care": "0.11%",
        "Financials": "-0.26%",
        "Materials": "-0.45%",
        "Consumer Staples": "-0.63%",
        "Real Estate": "-0.65%",
        "Utilities": "-0.78%",
        "Industrials": "-0.93%",
        "Telecommunication Services": "-2.02%"
    },
    "Rank C: 5 Day Performance": {
        "Energy": "3.27%",
        "Industrials": "2.23%",
        "Consumer Discretionary": "2.21%",
        "Materials": "2.02%",
        "Financials": "1.93%",
        "Health Care": "1.71%",
        "Telecommunication Services": "1.09%",
        "Information Technology": "0.84%",
        "Consumer Staples": "0.76%",
        "Real Estate": "0.16%",
        "Utilities": "-0.35%"
    },
    "Rank D: 1 Month Performance": {
        "Telecommunication Services": "6.53%",
        "Health Care": "5.38%",
        "Industrials": "2.75%",
        "Real Estate": "2.62%",
        "Consumer Staples": "2.57%",
        "Financials": "2.45%",
        "Utilities": "1.92%",
        "Consumer Discretionary": "1.90%",
        "Materials": "0.90%",
        "Information Technology": "0.57%",
        "Energy": "-1.17%"
    },
    "Rank E: 3 Month Performance": {
        "Health Care": "9.80%",
        "Consumer Staples": "9.18%",
        "Utilities": "8.72%",
        "Consumer Discretionary": "8.41%",
        "Real Estate": "8.28%",
        "Telecommunication Services": "6.02%",
        "Information Technology": "5.55%",
        "Financials": "0.49%",
        "Industrials": "0.15%",
        "Materials": "-2.15%",
        "Energy": "-5.60%"
    },
    "Rank F: Year-to-Date (YTD) Performance": {
        "Information Technology": "16.10%",
        "Consumer Discretionary": "15.79%",
        "Health Care": "10.55%",
        "Energy": "2.53%",
        "Financials": "1.05%",
        "Utilities": "1.04%",
        "Real Estate": "0.96%",
        "Industrials": "0.75%",
        "Materials": "-2.75%",
        "Consumer Staples": "-5.33%",
        "Telecommunication Services": "-6.21%"
    },
    "Rank G: 1 Year Performance": {
        "Information Technology": "31.09%",
        "Consumer Discretionary": "29.36%",
        "Energy": "20.32%",
        "Health Care": "16.84%",
        "Financials": "15.29%",
        "Industrials": "12.18%",
        "Materials": "9.50%",
        "Real Estate": "3.09%",
        "Telecommunication Services": "-0.85%",
        "Utilities": "-2.24%",
        "Consumer Staples": "-2.32%"
    },
    "Rank H: 3 Year Performance": {
        "Information Technology": "95.67%",
        "Consumer Discretionary": "54.45%",
        "Financials": "46.80%",
        "Industrials": "45.21%",
        "Materials": "36.15%",
        "Health Care": "27.81%",
        "Utilities": "18.65%",
        "Energy": "18.31%",
        "Consumer Staples": "13.63%",
        "Telecommunication Services": "5.73%"
    },
    "Rank I: 5 Year Performance": {
        "Information Technology": "151.29%",
        "Consumer Discretionary": "98.24%",
        "Health Care": "85.23%",
        "Financials": "75.87%",
        "Industrials": "67.23%",
        "Materials": "45.10%",
        "Utilities": "43.92%",
        "Consumer Staples": "35.93%",
        "Telecommunication Services": "4.47%",
        "Energy": "-6.33%"
    },
    "Rank J: 10 Year Performance": {
        "Consumer Discretionary": "284.77%",
        "Information Technology": "252.24%",
        "Health Care": "176.38%",
        "Industrials": "107.68%",
        "Consumer Staples": "90.16%",
        "Financials": "74.45%",
        "Materials": "50.53%",
        "Utilities": "41.62%",
        "Telecommunication Services": "21.26%",
        "Energy": "-3.80%"
    }
}

In [30]:
sector_data = db.sector_data
sector_data_id = sector_data.insert_one(realtime_sector_data).inserted_id
sector_data_id

ObjectId('5b8025ef760df7342060e4c2')

**Collect data from Twitter into MongoDB**
- ref:  http://pythondata.com/collecting-storing-tweets-with-python-and-mongodb/

In [None]:
# Imports and configuration for the Twitter -> MongoDB streaming example.
from __future__ import print_function
import tweepy
import json
from pymongo import MongoClient
 
# Connection URI for the local MongoDB server.
MONGO_HOST= 'mongodb://localhost/twitterdb'  # assuming you have mongoDB installed locally
                                             # and a database called 'twitterdb'
 
# Hashtags handed to the stream filter at the end of this cell.
WORDS = ['#insurance', '#AI', '#datascience', '#machinelearning', '#auto', '#reinsurance']
 
# The Twitter credentials are read from environment variables through IPython's
# %env line magic, so no secret is written into the notebook itself.
# (Keep comments off the %env lines: the magic would treat them as part of its argument.)
CONSUMER_KEY = %env TWITTER_CONSUMER_KEY
CONSUMER_SECRET = %env TWITTER_CONSUMER_SECRET
ACCESS_TOKEN =%env TWITTER_ACCESS_TOKEN
ACCESS_TOKEN_SECRET = %env TWITTER_ACCESS_TOKEN_SECRET

 
 
class StreamListener(tweepy.StreamListener):
    """Listener for the Twitter Streaming API that saves each message to MongoDB.

    Subclasses the listener class provided by tweepy. Every payload handed to
    ``on_data`` is decoded from JSON and inserted into the ``twitter_search``
    collection of the ``twitterdb`` database reached through ``MONGO_HOST``.
    """

    # Created on first use and then reused, so a new MongoClient (and its
    # connections) is not opened for every single message.
    _mongo_client = None

    def _twitter_search(self):
        """Return the ``twitter_search`` collection, connecting on the first call."""
        if self._mongo_client is None:
            self._mongo_client = MongoClient(MONGO_HOST)
        # Use the twitterdb database and its twitter_search collection.
        # If they don't exist, they will be created.
        return self._mongo_client.twitterdb.twitter_search

    def on_connect(self):
        """Report that the connection to the Streaming API has been made."""
        print("You are now connected to the streaming API.")

    def on_error(self, status_code):
        """Print the error / status code and return False.

        NOTE(review): tweepy is expected to stop the stream when this returns
        False - confirm against the installed tweepy version.
        """
        print('An Error has occured: ' + repr(status_code))
        return False

    def on_data(self, data):
        """Decode one raw JSON payload and store it in MongoDB.

        Any exception (undecodable JSON, a payload without a 'created_at' key,
        a database error) is printed and swallowed, so one bad message does not
        propagate out of the callback; such a message is not stored.
        """
        try:
            collection = self._twitter_search()

            # Decode the JSON from Twitter
            datajson = json.loads(data)

            # Grab the 'created_at' value to use for display
            created_at = datajson['created_at']

            # Print out a message to the screen that we have collected a tweet
            print("Tweet collected at " + str(created_at))

            # insert_one() replaces Collection.insert(), which is deprecated in
            # PyMongo 3 and no longer available in PyMongo 4.
            collection.insert_one(datajson)
        except Exception as e:
            print(e)
 
# Set up the listener. The 'wait_on_rate_limit=True' is needed to help with Twitter API rate limiting.
rate_limited_api = tweepy.API(wait_on_rate_limit=True)
listener = StreamListener(api=rate_limited_api)

# OAuth handler built from the consumer key pair, then given the access token pair.
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

# Create the stream, announce the tracked terms, then start filtering on them.
streamer = tweepy.Stream(auth=auth, listener=listener)
print("Tracking: " + str(WORDS))
streamer.filter(track=WORDS)

  

Tracking: ['#insurance', '#AI', '#datascience', '#machinelearning', '#auto', '#reinsurance']
You are now connected to the streaming API.
Tweet collected at Fri Aug 24 15:36:42 +0000 2018




Tweet collected at Fri Aug 24 15:36:43 +0000 2018
Tweet collected at Fri Aug 24 15:36:43 +0000 2018
Tweet collected at Fri Aug 24 15:36:44 +0000 2018
Tweet collected at Fri Aug 24 15:36:45 +0000 2018
Tweet collected at Fri Aug 24 15:36:49 +0000 2018
Tweet collected at Fri Aug 24 15:36:50 +0000 2018
Tweet collected at Fri Aug 24 15:36:51 +0000 2018
Tweet collected at Fri Aug 24 15:36:51 +0000 2018
