In [1]:
from pymongo import MongoClient
from pymongo import errors
from pprint import pprint
import numpy as np
import re

In [2]:
def connect_database(host='localhost', port=27017):
    """Create a MongoDB client for the given server.

    Args:
        host: Hostname or IP address of the MongoDB server. Defaults to
            'localhost', the value this function previously hard-coded.
        port: TCP port of the MongoDB server. Defaults to 27017, the value
            this function previously hard-coded.

    Returns:
        The ``MongoClient`` built from ``host`` and ``port``.
    """
    return MongoClient(host, port)

In [3]:
client = connect_database()

In [4]:
perfume_database = client.Perfume_Database

In [5]:
pprint(perfume_database.list_collection_names())

['Cleared_Crawled_Perfumes',
 'Fragrances_Backup_02/22/2024',
 'Scraped_Perfumes',
 'Extra01',
 'Fragrances_Backup_02/24/2024',
 'Fragrances_Backup_02/16/2024',
 'Extra03',
 'Metadata',
 'Perfumes',
 'Extra02',
 'Fragrances',
 'Fragrances_Quality',
 'Crawled_Perfumes']


In [6]:
fragrances_quality_collection = perfume_database.Fragrances_Quality
fragrances_collection = perfume_database.Fragrances

Load the first analysis, taken before any quality improvement.

In [7]:
# Sorting on "Date" in descending order (-1) makes find_one return the
# document with the latest "Date" value.
# NOTE(review): the markdown above calls this the "first" analysis; that is
# only true while the newest stored analysis is also the oldest one — confirm
# the intended sort direction.
quality_analysis = fragrances_quality_collection.find_one(sort=[("Date", -1)])

In [8]:
pprint(quality_analysis)

{'Date': datetime.datetime(2024, 2, 16, 23, 5, 48, 74000),
 '_id': ObjectId('65cfea4c1d1d2de3ce3f6a5c'),
 'count': 1000,
 'fields': [{'count': 1000,
             'hasDuplicates': False,
             'name': '_id',
             'path': ['_id'],
             'probability': 1,
             'type': 'ObjectId',
             'types': [{'bsonType': 'ObjectId',
                        'count': 1000,
                        'hasDuplicates': False,
                        'name': 'ObjectId',
                        'path': ['_id'],
                        'probability': 1,
                        'unique': 1000,
                        'values': ['65c3d3942be6a0a6d64f654e',
                                   '65c3d3942be6a0a6d64f5fac',
                                   '65c3d3942be6a0a6d64f64e7',
                                   '65c3ecc62be6a0a6d64f676e',
                                   '65a6abcabcd0c58b31680cc5',
                                   '65a5afe6bacca2969b0ed42c',
            

                                   'complemented by delicious notes of '
                                   'caramel. Perfumer Olivier Cresp created a '
                                   'new composition as he added pralines to '
                                   'the fresh, floral-fruity original. Top '
                                   'notes: lemon, raspberry, satsuma mandarin\n'
                                   'Heart: gardenia, jasmine\n'
                                   'Base: praline\n'
                                   '\xa0\n'
                                   'Les Sorbets de Nina is available as a 50 '
                                   'and 80 ml Eau de Toilette.\xa0.',
                                   ' The carnal sensuality of a voluptuous '
                                   'bunch of spices. In the trail of this '
                                   'oriental gourmand, mystery is tinted with '
                                   'eroticism that the mythical year

In [9]:
print(quality_analysis.keys())

dict_keys(['_id', 'count', 'fields', 'Date'])


In [10]:
# Collect the name of every field entry described in the analysis document.
available_fields = [entry["name"] for entry in quality_analysis["fields"]]

A list of all fields present in the dataset

In [11]:
print(available_fields)

['_id', 'base notes', 'company', 'description', 'gender', 'gender_vote', 'image', 'link', 'longevity', 'main accords', 'middle notes', 'name', 'notes', 'number_votes', 'price', 'price value', 'rating', 'sillage', 'top notes', 'url', 'year']


## Completeness

In [12]:
def calculate_field_completeness(field, analysis_document):
    """Return the share of sampled documents in which ``field`` is undefined.

    Despite its name, the returned value is the probability recorded for the
    "Undefined" type, i.e. the *incompleteness* of the field. Callers obtain
    the completeness as ``1 - calculate_field_completeness(...)``.

    Args:
        field: Name of the field to look up in ``analysis_document["fields"]``.
        analysis_document: Analysis document whose "fields" entry is a list of
            dicts, each holding a "name" and a "types" list; every type entry
            holds a "name" and a "probability".

    Returns:
        The "probability" of the type entry named "Undefined", or 0 when the
        field has no such type entry.

    Raises:
        KeyError: If no entry in ``analysis_document["fields"]`` is named
            ``field``.
    """
    field_entry = next(
        (entry for entry in analysis_document["fields"] if entry["name"] == field),
        None,
    )
    if field_entry is None:
        # Fail with a descriptive error instead of the opaque TypeError that
        # indexing the field list with None used to produce.
        raise KeyError(f"Field {field!r} is not present in the analysis document")

    return next(
        (
            type_entry["probability"]
            for type_entry in field_entry["types"]
            if type_entry["name"] == "Undefined"
        ),
        0,
    )

In [13]:
# Report, for every analysed field, the completeness rounded to two decimals.
# calculate_field_completeness yields the "Undefined" probability, so the
# completeness is its complement.
for field_entry in quality_analysis["fields"]:
    field_name = field_entry["name"]
    undefined_share = calculate_field_completeness(field_name, quality_analysis)
    completeness = round(1 - undefined_share, 2)
    print(f"The field \"{field_name}\"'s completeness is: {completeness}")

The field "_id"'s completeness is: 1
The field "base notes"'s completeness is: 0.52
The field "company"'s completeness is: 1
The field "description"'s completeness is: 0.97
The field "gender"'s completeness is: 0.5
The field "gender_vote"'s completeness is: 0.5
The field "image"'s completeness is: 0.97
The field "link"'s completeness is: 0.03
The field "longevity"'s completeness is: 0.5
The field "main accords"'s completeness is: 0.5
The field "middle notes"'s completeness is: 0.52
The field "name"'s completeness is: 1
The field "notes"'s completeness is: 0.47
The field "number_votes"'s completeness is: 0.5
The field "price"'s completeness is: 0.03
The field "price value"'s completeness is: 0.5
The field "rating"'s completeness is: 0.5
The field "sillage"'s completeness is: 0.5
The field "top notes"'s completeness is: 0.52
The field "url"'s completeness is: 0.5
The field "year"'s completeness is: 0.03


## Redundancy

## Consistency

In [14]:
# Map each field name to the "type" value recorded for it in the analysis
# (a single type name or a list of type names).
types_dict = {entry['name']: entry['type'] for entry in quality_analysis["fields"]}

A field can hold values of more than one type; below are the BSON data types observed for each field.

In [15]:
types_dict

{'_id': 'ObjectId',
 'base notes': ['Array', 'Undefined'],
 'company': 'String',
 'description': ['String', 'Undefined'],
 'gender': ['String', 'Undefined'],
 'gender_vote': ['Document', 'Undefined'],
 'image': ['String', 'Undefined'],
 'link': ['Array', 'Undefined'],
 'longevity': ['Document', 'Undefined'],
 'main accords': ['Document', 'Undefined'],
 'middle notes': ['Array', 'Undefined'],
 'name': 'String',
 'notes': ['String', 'Undefined'],
 'number_votes': ['Int32', 'String', 'Undefined'],
 'price': ['String', 'Undefined'],
 'price value': ['Document', 'Undefined'],
 'rating': ['Double', 'String', 'Undefined'],
 'sillage': ['Document', 'Undefined'],
 'top notes': ['Array', 'Undefined'],
 'url': ['String', 'Undefined'],
 'year': ['Int32', 'Undefined']}

### Notes Consistency

In order to assess notes consistency, all notes are extracted using the following pipeline.

In [13]:
# Aggregation pipeline that lists every distinct note literal found in the
# "top notes", "base notes", "middle notes" and "notes" fields, ordered from
# the most to the least frequent. Each result document has a single "note" key.
#
# The former '$set'/'$sort' stages on the array length were removed: the
# following '$unwind' and '$group' discard both that field and the document
# order, so they only added a full sort without affecting the result.
extract_unique_notes = [
    {
        # Merge the four note fields into one array; a missing or null field
        # contributes an empty array.
        '$project': {
            'all_notes': {
                '$concatArrays': [
                    {'$ifNull': ['$top notes', []]},
                    {'$ifNull': ['$base notes', []]},
                    {'$ifNull': ['$middle notes', []]},
                    {'$ifNull': ['$notes', []]},
                ]
            }
        }
    }, {
        # Emit one document per individual note.
        '$unwind': {
            'path': '$all_notes'
        }
    }, {
        # Count the occurrences of each exact note literal. '$sum': 1 is the
        # same counting form used by processed_notes_pipeline_count and does
        # not require the newer '$count' accumulator.
        '$group': {
            '_id': '$all_notes',
            'count': {
                '$sum': 1
            }
        }
    }, {
        # Most frequent notes first.
        '$sort': {
            'count': -1
        }
    }, {
        # Expose the note under "note" and drop the grouping key and count.
        '$project': {
            'note': '$_id',
            '_id': 0
        }
    }
]

In [22]:
notes_result = fragrances_collection.aggregate(extract_unique_notes)

In [23]:
notes_list = list(notes_result)

In [24]:
note_literal_list = [note["note"] for note in notes_list]

In [25]:
print(f"The number of unprocessed unique notes present in Fragrances collection are {len(note_literal_list)}")

The number of unprocessed unique notes present in Fragrances collection are 4580


Check the trimmed and lowercased notes count

In [18]:
# Aggregation pipeline that counts how many distinct notes remain once every
# note from "top notes", "base notes", "middle notes" and "notes" has been
# lowercased and trimmed. It yields a single document with a "count" key.
#
# Only the number of groups reaches the final '$count' stage, so the former
# length '$set'/'$sort' stages and the per-group 'count'/'variants'
# accumulators were removed; 'variants' in particular buffered every note
# occurrence in memory just to be discarded.
processed_notes_pipeline_count = [
    {
        # Merge the four note fields into one array; a missing or null field
        # contributes an empty array.
        '$project': {
            'all_notes': {
                '$concatArrays': [
                    {'$ifNull': ['$top notes', []]},
                    {'$ifNull': ['$base notes', []]},
                    {'$ifNull': ['$middle notes', []]},
                    {'$ifNull': ['$notes', []]},
                ]
            }
        }
    }, {
        # Emit one document per individual note.
        '$unwind': {
            'path': '$all_notes'
        }
    }, {
        # Group on the normalised (lowercased, then trimmed) note text.
        '$group': {
            '_id': {
                '$trim': {
                    'input': {
                        '$toLower': '$all_notes'
                    }
                }
            }
        }
    }, {
        # Number of distinct normalised notes.
        '$count': 'count'
    }
]

In [29]:
print(f"The number of unique lowercased and trimmed notes are: {fragrances_collection.aggregate(processed_notes_pipeline_count).next()['count']}")

The number of unique lowercased and trimmed notes are: 3671


The same analysis is required again after re-parsing the `notes` field.

In [14]:
notes_result = fragrances_collection.aggregate(extract_unique_notes)

In [15]:
notes_list = list(notes_result)

In [16]:
note_literal_list = [note["note"] for note in notes_list]

In [17]:
print(f"The number of unprocessed unique notes present in Fragrances collection are {len(note_literal_list)}")

The number of unprocessed unique notes present in Fragrances collection are 4115


Applying the previous processing of lowercasing and trimming gives us the following result:

In [19]:
print(f"The number of unique lowercased and trimmed notes are: {fragrances_collection.aggregate(processed_notes_pipeline_count).next()['count']}")

The number of unique lowercased and trimmed notes are: 3353


#### Calculate Similarity

In [49]:
import itertools
import pandas as pd
from difflib import SequenceMatcher

In [59]:
# similarity_matrix = pd.DataFrame(index=note_literal_list_lowercase, columns=note_literal_list_lowercase)

In [51]:
def similar(a, b):
    """Return the difflib similarity ratio of the sequences ``a`` and ``b``.

    The comparison uses ``SequenceMatcher`` without a junk filter and returns
    the value of its ``ratio()`` method.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

In [52]:
def calculate_similarity(note1, note2):
    """Score how alike two notes are using difflib's ``SequenceMatcher``.

    Returns ``SequenceMatcher(None, note1, note2).ratio()``. The original
    comments marked this measure as a placeholder that may be replaced by
    another similarity function.
    """
    return SequenceMatcher(None, note1, note2).ratio()

In [53]:
def all_pairs_similarity(notes, similarity_func, similarity_matrix):
    """Fill ``similarity_matrix`` with the score of every unordered note pair.

    Each 2-combination ``(first, second)`` of ``notes`` is scored once with
    ``similarity_func`` and written to ``similarity_matrix.loc[first, second]``.
    Only that cell is written for a pair; the mirrored cell and the diagonal
    are not assigned here.

    Args:
        notes: Iterable of note labels used to address the matrix.
        similarity_func: Callable taking two notes and returning their score.
        similarity_matrix: Object supporting ``.loc[row, column]`` assignment;
            it is modified in place.

    Returns:
        The same ``similarity_matrix`` object that was passed in.
    """
    for first, second in itertools.combinations(notes, 2):
        score = similarity_func(first, second)
        similarity_matrix.loc[first, second] = score
    return similarity_matrix

In [61]:
# all_pairs_similarity(note_literal_list_lowercase, calculate_similarity, similarity_matrix)

Unnamed: 0,musk,bergamot,vanilla,amber,sandalwood,patchouli,jasmine,rose,cedar,musk.1,...,snow flake accord,white currant,pink pepper essence,smoked cedar with a little bit of pollution (benzoin,birch essence,namasoma,cipriol,haitian vetyver root,graphite,nanah mint
musk,1.0,0.166667,0.0,0.222222,0.142857,0.153846,0.181818,0.25,0.0,1.0,...,0.190476,0.117647,0.086957,0.071429,0.117647,0.333333,0.0,0.0,0.0,0.142857
bergamot,0.166667,1.0,0.133333,0.461538,0.222222,0.235294,0.133333,0.166667,0.307692,0.166667,...,0.16,0.380952,0.148148,0.1,0.190476,0.375,0.266667,0.285714,0.125,0.333333
vanilla,0.0,0.133333,1.0,0.166667,0.352941,0.25,0.285714,0.0,0.166667,0.0,...,0.25,0.2,0.076923,0.135593,0.1,0.266667,0.285714,0.148148,0.266667,0.352941
amber,0.222222,0.461538,0.166667,1.0,0.133333,0.142857,0.5,0.222222,0.4,0.222222,...,0.272727,0.111111,0.166667,0.105263,0.222222,0.307692,0.166667,0.24,0.307692,0.266667
sandalwood,0.142857,0.222222,0.352941,0.133333,1.0,0.210526,0.235294,0.142857,0.266667,0.142857,...,0.222222,0.173913,0.137931,0.193548,0.173913,0.222222,0.117647,0.266667,0.111111,0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
namasoma,,,,,,,,,,,...,,,,,,,0.133333,0.142857,0.125,0.333333
cipriol,,,,,,,,,,,...,,,,,,,,0.222222,0.133333,0.117647
haitian vetyver root,,,,,,,,,,,...,,,,,,,,,0.285714,0.266667
graphite,,,,,,,,,,,...,,,,,,,,,,0.444444


In [63]:
# similarity_matrix.to_csv("Similarity_Matrix.csv")