In [1]:
import os

import numpy as np
import pandas as pd
from pymongo import MongoClient

## Connect with MongoDB Atlas

In [2]:
# Read the Atlas connection string from the environment so that credentials
# never live in the notebook (and so never reach version control or a shared
# copy). Export it before starting Jupyter, e.g.
#   export MONGODB_URI="mongodb://<user>:<password>@<host>:27017/test?<options>"
# NOTE(review): the password that used to be hardcoded in this cell has been
# exposed and should be rotated.
mongodb_uri = os.environ.get("MONGODB_URI")
if not mongodb_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string"
    )
client = MongoClient(mongodb_uri)

In [3]:
# Show the database names reported by the server the client is connected to.
client.list_database_names()

['test', 'admin', 'local']

In [4]:
# Handle to the 'test' database; the collection used below is read from it.
db = client['test']

In [5]:
# Show the collection names available in the selected database.
db.list_collection_names()

['datas']

In [6]:
# Handle to the 'datas' collection, which every query below runs against.
collection = db['datas']

#### Add field to the collection

In [None]:
# Intentionally left disabled (presumably a one-off backfill — confirm before
# re-enabling): the empty filter {} makes update_many touch every document and
# set experimentType to "encode", which would also overwrite documents that
# are currently labelled 'social_graph'.
#collection.update_many({},{"$set":{"experimentType" : "encode"}})

### Count ratings by user

In [7]:
# Group every document by userId and count the documents in each group,
# yielding one {'_id': <userId>, 'count': <n>} entry per user.
ratings_per_user_pipeline = [
    {
        "$group": {
            "_id": "$userId",
            "count": {"$sum": 1},
        }
    },
]
list(collection.aggregate(ratings_per_user_pipeline))

[{'_id': '100561984088717929575', 'count': 125},
 {'_id': '111553330425312850168', 'count': 27},
 {'_id': '117735700953378982421', 'count': 101},
 {'_id': '113935743558357074871', 'count': 124},
 {'_id': '103493283851958013338', 'count': 110},
 {'_id': '117136090827957222409', 'count': 100},
 {'_id': '100526816082185541678', 'count': 111},
 {'_id': '117452108939167495023', 'count': 125},
 {'_id': '111357961831498142289', 'count': 43},
 {'_id': '107112297045154154350', 'count': 40},
 {'_id': '117423035042823268423', 'count': 1},
 {'_id': '115125251787499369788', 'count': 11},
 {'_id': '115158042225217374702', 'count': 25},
 {'_id': '102632666783477615198', 'count': 75},
 {'_id': '105929426555581141067', 'count': 25},
 {'_id': '104468069636511146338', 'count': 73},
 {'_id': '106353877872766676678', 'count': 60},
 {'_id': '115019552271918435777', 'count': 25},
 {'_id': '115233687945562551998', 'count': 98},
 {'_id': '101586306503910736613', 'count': 97},
 {'_id': '112434306864764922196', 

### Count ratings by experiment type

In [8]:
# Group every document by experimentType and count the documents per type.
ratings_per_experiment_pipeline = [
    {
        "$group": {
            "_id": "$experimentType",
            "count": {"$sum": 1},
        }
    },
]
list(collection.aggregate(ratings_per_experiment_pipeline))

[{'_id': 'social_graph', 'count': 759}, {'_id': 'encode', 'count': 929}]

### Count ratings by user and reference artwork

In [None]:
# Restrict to a single user first, then count that user's documents per
# (userId, sourceArtworkId, experimentType) combination.
single_user_stage = {"$match": {"userId": "106353877872766676678"}}
count_per_artwork_stage = {
    "$group": {
        "_id": {
            "userId": "$userId",
            "sourceArtworkId": "$sourceArtworkId",
            "experimentType": "$experimentType",
        },
        "count": {"$sum": 1},
    }
}
list(collection.aggregate([single_user_stage, count_per_artwork_stage]))

### Count ratings by reference artwork

In [9]:
# Group every document by sourceArtworkId (the reference artwork) and count
# the documents recorded against each one.
ratings_per_source_artwork_pipeline = [
    {
        "$group": {
            "_id": "$sourceArtworkId",
            "count": {"$sum": 1},
        }
    },
]
list(collection.aggregate(ratings_per_source_artwork_pipeline))

[{'_id': 43809, 'count': 240},
 {'_id': 7066, 'count': 503},
 {'_id': 33965, 'count': 361},
 {'_id': 5629, 'count': 351},
 {'_id': 36457, 'count': 233}]

## Load to Dataframe

In [10]:
# Pull every document of the collection into memory as one DataFrame and
# discard the '__v' column in the same step, then preview the first rows.
df_data = pd.DataFrame(list(collection.find())).drop(columns=['__v'])
df_data.head()

Unnamed: 0,_id,createdAt,experimentType,id,ratedArtworkId,rating,sourceArtworkId,updatedAt,userId
0,5d6d2ffc96c99f0008942167,2019-09-02 15:06:36.784,encode,1.063539e+24,7066,Agree,7066,2019-09-02 15:06:36.784,106353877872766676678
1,5d6d324d96c99f0008942168,2019-09-02 15:16:29.666,encode,1.063539e+24,30499,Agree,7066,2019-09-02 15:16:29.666,106353877872766676678
2,5d6d326596c99f000894216a,2019-09-02 15:16:53.417,encode,1.063539e+24,14825,Agree,7066,2019-09-02 15:16:53.417,106353877872766676678
3,5d6d558396c99f000894216b,2019-09-02 17:46:43.374,encode,1.071123e+24,7066,Agree,7066,2019-09-02 17:46:43.374,107112297045154154350
4,5d6d559396c99f000894216c,2019-09-02 17:46:59.130,encode,1.071123e+24,30499,Agree,7066,2019-09-02 17:46:59.130,107112297045154154350


**Dump database values**

In [11]:
# Write the whole ratings table to ratings.csv (relative path, so it lands in
# the current working directory); index=False leaves the row index out.
# NOTE(review): the dump includes the userId column — confirm the file is not
# committed or shared if those identifiers are sensitive.
df_data.to_csv('ratings.csv', index=False)

## Plot metrics from ratings distribution

In [None]:
import matplotlib.pyplot as plt
import matplotlib.pylab as plb
import seaborn as sns

In [None]:
# Split the ratings into one frame per experiment type using boolean masks
# on the experimentType column, then show the size of the social-graph subset.
df_data_encode = df_data.loc[df_data['experimentType'] == 'encode']
df_data_social_graph = df_data.loc[df_data['experimentType'] == 'social_graph']
df_data_social_graph.shape

### Plot total users per approach

In [None]:
# Number of distinct userId values seen under each approach, listed in the
# same order as the tick labels passed to the bar chart.
users = [
    len(frame['userId'].unique())
    for frame in (df_data_encode, df_data_social_graph)
]
plt.bar(
    [0, 1],
    users,
    tick_label=['encode', 'social graph'],
    color=['C0', 'C1'],
)
plt.title('Total users group by approach')

In [None]:
def group_by_artwork(df_data):
    """Count ratings per (source artwork, rated artwork, rating) combination.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Must contain the columns 'sourceArtworkId', 'ratedArtworkId',
        'rating' and 'id'.

    Returns
    -------
    pandas.DataFrame
        Indexed by (sourceArtworkId, ratedArtworkId, rating) with a single
        'count' column holding the number of non-null 'id' values in each
        group. The input frame is not modified.
    """
    group_keys = ['sourceArtworkId', 'ratedArtworkId', 'rating']
    # Selecting [['id']] (a list) keeps the result a one-column DataFrame.
    id_counts = df_data.groupby(group_keys)[['id']].count()
    return id_counts.rename(columns={'id': 'count'})

In [None]:
# Rating counts per (sourceArtworkId, ratedArtworkId, rating), computed on the
# 'encode' subset only.
df_artworks = group_by_artwork(df_data_encode)

In [None]:
# Select the rows whose outermost index level (sourceArtworkId) equals 7066,
# one of the reference artworks present in the data. Selecting on the outer
# level leaves (ratedArtworkId, rating) as the index of the result.
df_artworks_source_artwork = df_artworks.loc[7066]
df_artworks_source_artwork.head()

In [None]:
# Turn the (ratedArtworkId, rating) index levels into ordinary columns so the
# plotting code can refer to them by name.
# The frame is rebuilt from df_artworks rather than reassigned from itself:
# the self-referential form is not idempotent and inserts a spurious extra
# column when the cell is re-run. The key 7066 must stay in sync with the
# source artwork selected in the previous cell.
df_artworks_source_artwork = df_artworks.loc[7066].reset_index()
df_artworks_source_artwork.head()

In [None]:
# Grouped bar chart: one cluster of bars per rated artwork, one bar per rating
# value, bar height = number of ratings. Tick labels are rotated so the
# artwork ids stay legible.
plt.figure(figsize=(12, 10))
g = sns.barplot(
    data=df_artworks_source_artwork,
    x="ratedArtworkId",
    y="count",
    hue='rating',
)
g.set_xticklabels(g.get_xticklabels(), rotation=90)