# DA320 Assignment 7: Mongo Charts
Jon Kaimmer  
DA320  
Winter2022


 ### Introduction
Let's import our chirp data from MongoDB and then chart it. 

In [13]:
#IMPORTS

import json as json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import pymongo
import seaborn as sns


# import warnings
# warnings.filterwarnings('ignore') #Ignore the seaborn warnings...

#METHODS
def connectToMongoDB(credentialPath=None):
    """Return the MongoDB connection string stored in a JSON credentials file.

    Parameters
    ----------
    credentialPath : str or path-like, optional
        Location of the credentials file. When omitted, the module-level
        ``credentialLocation`` is used, which is the original zero-argument
        behaviour.

    Returns
    -------
    The value stored under ``['MONGO']['mongo_conn_str']`` in the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    KeyError
        If the ``MONGO`` or ``mongo_conn_str`` key is missing.
    """
    if credentialPath is None:
        # Resolved at call time, so the global may be assigned after this def.
        credentialPath = credentialLocation

    # Explicit encoding so the result does not depend on the platform default.
    with open(credentialPath, 'r', encoding='utf-8') as myFile:
        credentialDict = json.load(myFile)  # parse the JSON file into a python dictionary

    return credentialDict['MONGO']['mongo_conn_str']

#FIELDS
# Location of the JSON credentials file. When the DA320_CREDENTIALS environment
# variable is set it takes precedence, so the notebook can run on another
# machine without editing this cell; otherwise the original hard-coded path is used.
# NOTE: the default is a raw string, so its doubled backslashes are kept
# literally in the path (presumably harmless on Windows -- confirm).
credentialLocation = os.environ.get(
    'DA320_CREDENTIALS',
    r"C:\Users\\jonat\\OneDrive\Documents\GitHub\\DA320\credentials.json",
)

# Default figure size for seaborn/matplotlib figures (width, height).
sns.set(rc = {'figure.figsize':(40,8)})

### Read MongoDB connection string from my credentials.json file

In [14]:
# Build the client from the connection string kept in the credentials file.
MONGOconnectionString = connectToMongoDB()
client = pymongo.MongoClient(MONGOconnectionString)

# Run the 'serverStatus' command against the 'admin' database; the reply is
# kept in serverStatusResult and can be printed to inspect the server.
db = client['admin']
serverStatusResult = db.command('serverStatus')

### Query MongoDB

In [15]:
# 'MoviesDB' is the database; its 'movies' collection is where the chirps are stored.
db = client.MoviesDB
chirpCollection = db.movies

# Sanity check: fetch one document whose comment is exactly this text.
query = dict(comment='I hate ice cream')
print(chirpCollection.find_one(query))

{'_id': ObjectId('6201da43350c4e4f36592a2e'), 'name': 'Kingston Gutierrez', 'date': '2019-04-28T20:16:13.931629-07:00', 'comment': 'I hate ice cream', 'location': {'latitude': 51.185218811035156, 'longitude': -114.47618865966797, 'country': 'CA', 'region': 'AB'}, 'likes': 12, 'responses': 6}


### Create a pipeline: keep chirps with at least 10 likes, derive a subject and a sentiment score from each comment, then group by subject, year, and month.

In [16]:
def _commentContains(phrase, result):
    """Return a ``$switch`` branch that yields ``result`` when ``$comment`` contains ``phrase``.

    ``$indexOfCP`` evaluates to -1 when the substring is absent, so an index
    ``>= 0`` means the phrase was found. The search is case-sensitive.
    """
    return {
        'case': {'$gte': [{'$indexOfCP': ['$comment', phrase]}, 0]},
        'then': result,
    }


# (phrase looked for in the comment, subject label). Order matters: $switch
# returns the first branch whose case is true.
subjectKeywords = [
    ('hiking', 'Hiking'),
    ('camping', 'Camping'),
    ('ice cream', 'Ice cream'),
    ('tacos', 'Tacos'),
    ('walks on the beach', 'Walks on the beach'),
    ('skiing', 'Skiing'),
]

# (phrase looked for in the comment, sentiment score), again first match wins.
sentimentPhrases = [
    ('I love', 1),
    ('Maybe I', 0.3),
    ('I like', 0.6),
    ('I think', 0.1),
    ('I hate', -0.6),
    ('really hate', -1),
]

mongoPipeline = [
    # 1. Keep only chirps with at least 10 likes.
    {
        '$match': {'likes': {'$gte': 10}}
    },
    # 2. 'date' is stored as an ISO-8601 string (e.g. '2019-04-28T20:16:13...'),
    #    so year / month / day are sliced out by position and cast to int.
    {
        '$addFields': {
            'Year': {'$toInt': {'$substr': ['$date', 0, 4]}},
            'Month': {'$toInt': {'$substr': ['$date', 5, 2]}},
            'Day': {'$toInt': {'$substr': ['$date', 8, 2]}},
        }
    },
    # 3. Label each chirp with the subject its comment mentions.
    {
        '$set': {
            'subject': {
                '$switch': {
                    'branches': [_commentContains(phrase, label) for phrase, label in subjectKeywords],
                    'default': 'DID NOT MATCH',
                }
            }
        }
    },
    # 4. Score each chirp's sentiment from the phrase its comment contains.
    #    Unmatched comments get null rather than a string so the field stays
    #    numeric-or-null; $avg skips non-numeric values, so averages are unaffected.
    {
        '$set': {
            'sentiment': {
                '$switch': {
                    'branches': [_commentContains(phrase, score) for phrase, score in sentimentPhrases],
                    'default': None,
                }
            }
        }
    },
    # 5. One output document per (subject, year, month) with the chirp count,
    #    the mean sentiment, and the contributing chirps.
    {
        '$group': {
            '_id': {
                'subject': '$subject',
                'year': '$Year',
                'month': '$Month',
            },
            'chirpCount': {'$sum': 1},
            'averageSentiment': {'$avg': '$sentiment'},
            'chirps': {
                '$push': {
                    'name': '$name',
                    'comment': '$comment',
                    'sentiment': '$sentiment',
                    'location': '$location',
                }
            },
        }
    },
]

# aggregate() returns a cursor that can be iterated only once. Materialize it
# so later cells can be re-run without re-running this one.
results = list(chirpCollection.aggregate(mongoPipeline))



### We then need to clean the data coming out of our data pipeline
- First, let's break out the fields of the '_id' JSON object into their own columns.
- Then we will rename those columns and reorder them. 

In [17]:
### Flatten the aggregation output with pandas
#
# Each result document has a compound '_id' object holding the grouping key
# ['subject', 'year', 'month'] that was built in the MongoDB pipeline.
# pd.json_normalize expands that nested object into one column per key and
# joins parent and child names with the separator, giving '_id>subject', etc.
# Note that '_id' is a JSON object while 'chirps' is a JSON array, so 'chirps'
# stays in a single column.

df = pd.json_normalize(data=results, sep='>')
df

Unnamed: 0,chirpCount,averageSentiment,chirps,_id>subject,_id>year,_id>month
0,517,0.084720,"[{'name': 'Ella Herrera', 'comment': 'I love i...",Ice cream,2019,11
1,1050,0.115714,"[{'name': 'Ariana Barajas', 'comment': 'I real...",Skiing,2019,7
2,220,0.123182,"[{'name': 'Ryan Velasquez', 'comment': 'I like...",Skiing,2020,3
3,558,0.118817,"[{'name': 'Anthony Snyder', 'comment': 'I real...",Ice cream,2019,5
4,748,0.152674,"[{'name': 'Brantley Kerr', 'comment': 'I like ...",Walks on the beach,2019,9
...,...,...,...,...,...,...
73,1101,0.113896,"[{'name': 'Giselle Thompson', 'comment': 'I li...",Hiking,2019,9
74,982,0.107332,"[{'name': 'Jayden Snyder', 'comment': 'I hate ...",Skiing,2019,9
75,512,0.102148,"[{'name': 'Luke Jacobson', 'comment': 'I reall...",Ice cream,2019,12
76,231,0.118615,"[{'name': 'Harlow Murphy', 'comment': 'I like ...",Hiking,2020,3


In [18]:

# Drop the '_id>' prefix left by json_normalize so the grouping fields get
# plain names that are easy to reference (including via dot notation) when charting:
#   _id>subject -> subject
#   _id>year    -> year
#   _id>month   -> month
flatNames = {
    '_id>subject': 'subject',
    '_id>year': 'year',
    '_id>month': 'month',
}
columnOrder = ['subject', 'year', 'month', 'chirpCount', 'averageSentiment', 'chirps']

# Rename first, then use DataFrame.reindex to put the columns in the desired order.
df = df.rename(columns=flatNames).reindex(columns=columnOrder)
df

Unnamed: 0,subject,year,month,chirpCount,averageSentiment,chirps
0,Ice cream,2019,11,517,0.084720,"[{'name': 'Ella Herrera', 'comment': 'I love i..."
1,Skiing,2019,7,1050,0.115714,"[{'name': 'Ariana Barajas', 'comment': 'I real..."
2,Skiing,2020,3,220,0.123182,"[{'name': 'Ryan Velasquez', 'comment': 'I like..."
3,Ice cream,2019,5,558,0.118817,"[{'name': 'Anthony Snyder', 'comment': 'I real..."
4,Walks on the beach,2019,9,748,0.152674,"[{'name': 'Brantley Kerr', 'comment': 'I like ..."
...,...,...,...,...,...,...
73,Hiking,2019,9,1101,0.113896,"[{'name': 'Giselle Thompson', 'comment': 'I li..."
74,Skiing,2019,9,982,0.107332,"[{'name': 'Jayden Snyder', 'comment': 'I hate ..."
75,Ice cream,2019,12,512,0.102148,"[{'name': 'Luke Jacobson', 'comment': 'I reall..."
76,Hiking,2020,3,231,0.118615,"[{'name': 'Harlow Murphy', 'comment': 'I like ..."


### Better. Now we can graph our Data

In [19]:
# One bar-chart panel per subject showing the chirp count for each calendar month.
# NOTE: 'year' is not encoded, so rows that share a month number across years
# land on the same x position.
fig = px.bar(
    df,
    x='month',
    y='chirpCount',
    facet_col='subject',
    title='Chirp count per month, by subject',
    labels={'month': 'Month', 'chirpCount': 'Number of chirps', 'subject': 'Subject'},
)
fig.show()

In [20]:
# Average sentiment of each grouped row plotted against its month, with an
# ordinary-least-squares trendline requested from plotly express.
trendStyle = {'width': 3, 'color': 'gray'}
fig = px.scatter(
    df,
    x='month',
    y='averageSentiment',
    trendline='ols',
    title='Canadians overall sentiment towards things they chose to Chirp about',
)
fig.update_traces(line=trendStyle)  # thick gray line styling
fig.show()