# DA320 Assignment 7: Mongo Charts
Jon Kaimmer  
DA320  
Winter2022


 ### Introduction
Lets import some chirp data and try and get it into a chart.

In [481]:
#IMPORTS
import os
import pymongo
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import json as json
import pprint
import plotly.express as px


# import warnings
# warnings.filterwarnings('ignore') #Ignore the seaborn warnings...

#METHODS
def connectToMongoDB():
    with open(credentialLocation, 'r') as myFile:   #open seperate file that stores passwords in JSON array format
        data = myFile.read()                        #read file into memory
        credentialDict = json.loads(data)           #parse json file into a python dictionary
        
        return(credentialDict['MONGO']['mDBconnectionString'])

#FIELDS
credentialLocation = r"C:\Users\\jonat\\OneDrive\Documents\GitHub\\DA320\credentials.json"

sns.set(rc = {'figure.figsize':(40,8)})

### GET USER INPUT

In [482]:
#GET USER INPUT
# userInput = input('Enter your value: ')
# print(userInput)

### Read MongoDB connection string from my credentials.json file

In [483]:
MONGOconnectionString = connectToMongoDB()
client = pymongo.MongoClient(MONGOconnectionString)
db = client.admin

serverStatusResult=db.command('serverStatus')
print(serverStatusResult)



### Query MongoDB

In [484]:
db = client['MoviesDB'] #<- MoviesDB is the mongoCLUSTER
chirpCollection = db['movies'] # <-movies is the chirps collection within the mongoCluster

query = {'comment' : 'I hate ice cream'}
print(chirpCollection.find_one(query))

{'_id': ObjectId('6201da43350c4e4f36592a2e'), 'name': 'Kingston Gutierrez', 'date': '2019-04-28T20:16:13.931629-07:00', 'comment': 'I hate ice cream', 'location': {'latitude': 51.185218811035156, 'longitude': -114.47618865966797, 'country': 'CA', 'region': 'AB'}, 'likes': 12, 'responses': 6}


### Create a simple pipeline: match to "i hate ice cream" and group on the month field. 

In [485]:
simplePipeline = [
    { 
        '$match': { 'likes': { '$gte': 10 } }
    }, {
        '$addFields': 
            {
                'Year': {'$toInt': {'$substr': ['$date', 0, 4]}}, 
                'Month': {'$toInt': {'$substr': ['$date', 5, 2]}}, 
                'Day': {'$toInt': {'$substr': ['$date', 8, 2]}}
            }
        }, {
        '$set': 
            {
            'subject': {
                '$switch': {
                    'branches': 
                    [
                        {'case': {'$gte': [{ '$indexOfCP': [ '$comment', 'hiking'] }, 0] }, 'then': 'Hiking'}, 
                        {'case': {'$gte': [{'$indexOfCP': [ '$comment', 'camping'] }, 0] }, 'then': 'Camping'}, 
                        {'case': {'$gte': [{'$indexOfCP': ['$comment', 'ice cream']}, 0] }, 'then': 'Ice cream' }, 
                        {'case': {'$gte': [{'$indexOfCP': ['$comment', 'tacos']}, 0] }, 'then': 'Tacos'}, 
                        {'case': {'$gte': [{'$indexOfCP': [ '$comment', 'walks on the beach' ] }, 0]}, 'then': 'Walks on the beach'}, 
                        {'case': { '$gte': [ {'$indexOfCP': [ '$comment', 'skiing'] }, 0]}, 'then': 'Skiing' }
                    ],'default': 'DID NOT MATCH'
                }
            }
        }
    }, {
        '$set': {
            'sentiment': {
                '$switch': {
                    'branches': 
                    [
                        {'case': {'$gte': [{'$indexOfCP': ['$comment', 'I love'] }, 0] }, 'then': 1}, 
                        {'case': {'$gte': [ {'$indexOfCP': ['$comment', 'Maybe I']}, 0 ]}, 'then': 0.3}, 
                        {'case': {'$gte': [{'$indexOfCP': ['$comment', 'I like']}, 0] }, 'then': 0.6}, 
                        {'case': {'$gte': [{'$indexOfCP': ['$comment', 'I think']}, 0]}, 'then': 0.1}, 
                        {'case': {'$gte': [{'$indexOfCP': ['$comment', 'I hate']}, 0]}, 'then': -0.6},
                        {'case': {'$gte': [{'$indexOfCP': ['$comment', 'really hate']}, 0]}, 'then': -1}
                        ],'default': 'DID NOT MATCH'
                }
            }
        }
    }, 
    # Not working as it is.... It is hard to access dictionaries within PANDAS DATAFRAMES...
    {
        '$group': {
            '_id': {
                'subject': '$subject', 
                'year': '$Year', 
                'month': '$Month'
            }, 
            'chirpCount': {'$sum': 1}, 
            'averageSentiment': {'$avg': '$sentiment'}, 
            'chirps': {
                '$push': {
                    'name': '$name', 
                    'comment': '$comment', 
                    'sentiment': '$sentiment', 
                    'location': '$location'
                }
            }
        }
    }  
]

results = chirpCollection.aggregate(simplePipeline)



### My aggregation looks like this prior to being cleaned
Notice that the _id column is a single JSON Object whereas the chirps column is a JSON Array of each of the chrips.

In [486]:
data_df = pd.DataFrame(list(results))
data_df

Unnamed: 0,_id,chirpCount,averageSentiment,chirps
0,"{'subject': 'Ice cream', 'year': 2020, 'month'...",600,0.100000,"[{'name': 'Samara Blake', 'comment': 'I like i..."
1,"{'subject': 'Walks on the beach', 'year': 2019...",731,0.116142,"[{'name': 'Elias Hopkins', 'comment': 'I hate ..."
2,"{'subject': 'Walks on the beach', 'year': 2020...",754,0.114058,"[{'name': 'Melody Gilmore', 'comment': 'Maybe ..."
3,"{'subject': 'Camping', 'year': 2020, 'month': 2}",812,0.102094,"[{'name': 'Parker Solomon', 'comment': 'I real..."
4,"{'subject': 'Skiing', 'year': 2020, 'month': 1}",1028,0.102432,"[{'name': 'Christian Landry', 'comment': 'I li..."
...,...,...,...,...
73,"{'subject': 'Camping', 'year': 2019, 'month': 9}",838,0.119093,"[{'name': 'Cruz Stein', 'comment': 'Maybe I li..."
74,"{'subject': 'Tacos', 'year': 2019, 'month': 7}",685,0.094891,"[{'name': 'Kayla Coffey', 'comment': 'I really..."
75,"{'subject': 'Camping', 'year': 2020, 'month': 1}",850,0.122824,"[{'name': 'Luna Chen', 'comment': 'I hate camp..."
76,"{'subject': 'Ice cream', 'year': 2019, 'month'...",578,0.079585,"[{'name': 'Jordan Esparza', 'comment': 'I hate..."


In [487]:
#You can use a for loop to print each row of data
# for chirp in results:
#     print(f"{chirp['_id']['subject']} {chirp['_id']['month']}/{chirp['_id']['year']}\n  averageSentiment: {chirp['averageSentiment']}\n  chirpCount: {chirp['chirpCount']}\n" )

### So lets clean this data.
- First lets break out the '_id' JSON Object into their own columns.
- Then we will rename those columns and reindex them. 

In [504]:
### Normalize data using pandas
#
# This data has JSON objects nestled within it. To start we will need break out the ['subject', 'year', 'month'] fields that are nestled behind '_.id". Basically i had created a multilayered key for my _id index in MongoDB. I need to now break that out into a long form datastructure.
# We can do that with .json_normalize built in pandas funciton. 
# Note that the _id column is a JSON object while the chirps column is a JSON array.
df = pd.json_normalize(
        data_df["_id","averageSentiment"]
        # ,
        # data_df["_id"],
        # meta=["chirpCount","averageSentiment"]
    )
df

0
1
2
3


In [505]:

#we want to rename these three columns. We are doing this so that when we chart this data downbelow, we will be able to use "dot notation" to access the columns. if there is a period in the name of the column it causes us issues. 
#   _id.subject -> subject
#   _id.year -> year
#   _id.month -> month
df = df.rename( columns = 
    { 
        '_id.subject':'subject',
        '_id.year':'year',
        '_id.month':'month',
    }#, inplace=True #inplace=True means that the original dataframe is being modified, as opposed to a new dataframe being created. 
)
#and now lets reorder our columns useing dataFrame.reindex
df = df.reindex(columns=['subject', 'year', 'month', 'chirpCount', 'averageSentiment', 'chirps'])
df

Unnamed: 0,subject,year,month,chirpCount,averageSentiment,chirps
0,,,,,,
1,,,,,,
2,,,,,,
3,,,,,,


- Finally, lets handle the JSON array and create a new dataframe just for that data.

### This data is in LONG FORMAT
This is not Tidy data, the last line of code ...json_normalize... effectively breaks apart and expands (into new columns) the _id column that we created in our MongoDB aggregation. 

In [490]:
data

Unnamed: 0,subject,year,month,chirpCount,averageSentiment,chirps
0,Ice cream,2019,6,,,
1,Tacos,2020,3,,,
2,Hiking,2020,2,,,
3,Skiing,2019,4,,,
4,Hiking,2019,9,,,
...,...,...,...,...,...,...
73,Tacos,2019,12,,,
74,Skiing,2019,3,,,
75,Camping,2019,8,,,
76,Skiing,2019,5,,,


### and then lets also make a chirps repository of all chirp data

In [491]:
#chirps_df = pd.json_normalize(data, record_path='chirps', meta=[])

### NOT IN USE

In [492]:

# Lets insert our mongoDB aggregation data into a pandas Dataframe called 'monthlyHatesData'.
# Pandas dataframe makes it easier to graph
monthlyHatesData = pd.DataFrame(list(data))

#pandas_df.head() shows us what the data looks like in the pandas dataframe
#monthlyHatesData.head()


### NOT IN USE

### Once its in pandas we can graph it just like we did for Assignment 4: Jupyter Charts

In [493]:
# sns.scatterplot(
#     data=monthlyHatesData,  
#                #X-axis: month
#     y='averageSentiment'     #Y-axis: aggregate of all icecream hates in that month
#     )
# plt.title('Who hates icecream?', size= 24)      #Title
# plt.xlabel('Month', size= 18)                   #X-Label
# #plt.ylabel('Hates per Month',size= 18)          #Y-Label
# #plt.ylim((0,230))                               #Force y-lim to go 0:230
# plt.xticks(fontsize = 15)                       #X-tick size
# #plt.yticks(fontsize = 15)                       #Y-tick size
# plt.show()

### NOT IN USE


In [494]:
data

Unnamed: 0,subject,year,month,chirpCount,averageSentiment,chirps
0,Ice cream,2019,6,,,
1,Tacos,2020,3,,,
2,Hiking,2020,2,,,
3,Skiing,2019,4,,,
4,Hiking,2019,9,,,
...,...,...,...,...,...,...
73,Tacos,2019,12,,,
74,Skiing,2019,3,,,
75,Camping,2019,8,,,
76,Skiing,2019,5,,,


### .... STARTING UP AGAIN
Lets plot in plotly express

In [495]:
#
#
#   GRAPHING OUR DATA
#
#
#   Data
#      DataFrame = 'data'
#
#  Scatterplot does not feel like the right thing to be graphing here...

fig = px.scatter(data, x='_id.month', y='chirpCount', color='_id.subject', size='averageSentiment')
fig.show()

ValueError: Value of 'x' is not the name of a column in 'data_frame'. Expected one of ['subject', 'year', 'month', 'chirpCount', 'averageSentiment', 'chirps'] but received: _id.month

In [None]:
#   Lets try a barplot
#
#   GRAPHING OUR DATA
#
#
#   Data
#      DataFrame = 'data'
#
#

fig = px.bar(data, x='_id.month', y='chirpCount', color='_id.subject')
fig.show()

In [None]:
#   Lets try transposing the data....
#
#   GRAPHING OUR DATA
#
#
#   Data
#      DataFrame = 'data'
#
#

fig = px.bar(data, x=data.index, y='chirpCount', color='_id.subject')
fig.show()

In [None]:
fig = px.bar(data, x='_id.month', y='chirpCount', facet_col='_id.subject')
fig.show()

### POTENTIAL FINAL GRAPH
Overall: Canadian's sentiment towards things 

In [None]:
fig = px.scatter(data, x='_id.month', y='averageSentiment', trendline='ols', title='Canadians overall sentiment towards things they chose to Chirp about')
fig.update_traces(
    line=dict(width=3, color='gray')
)
fig.show()

### potential final graph
only hiking, camping and icecream

In [None]:
(px.scatter(data, x="_id.month", y="averageSentiment", color="_id.subject",
            facet_col="_id.year", trendline="ols",
            title="Things canadians chirp about and how they feel about them")
 .update_layout(title_font_size=24)
 .update_xaxes(showgrid=False)
 .update_traces(
     line=dict(dash="dot", width=4),
     selector=dict(type="scatter", mode="lines"))
).show()

### Filter my returned values (into a new dataframe) only of the requested subjects for this assignment

In [None]:
subjects = ['ice cream', 'hiking', 'camping']
filtered_df = data[data._id.subject.isin(subjects)]
filtered_df