In [1]:
from pymongo import MongoClient
from pymongo import errors
from pprint import pprint
import numpy as np
import re

In [2]:
def connect_database(host='localhost', port=27017):
    """Create a MongoDB client for the given server.

    Args:
        host: Hostname or IP address of the MongoDB server.
            Defaults to ``'localhost'``.
        port: TCP port of the MongoDB server. Defaults to ``27017``.

    Returns:
        The ``MongoClient`` instance created for ``host``/``port``.
    """
    return MongoClient(host, port)

In [3]:
# Client for the local MongoDB server; every later cell goes through it.
client = connect_database()

In [4]:
# List the databases on the server to confirm that Perfume_Database exists.
client.list_database_names()

['Kitchen_test',
 'Perfume_Database',
 'WorldDevelopmentIndicators',
 'admin',
 'config',
 'local']

In [5]:
# Handle to the "Perfume_Database" database.
perfume_database = client.Perfume_Database

In [6]:
# List the collections in Perfume_Database.
perfume_database.list_collection_names()

['Cleared_Crawled_Perfumes',
 'Scraped_Perfumes',
 'Extra01',
 'Fragrances_Backup_02/16/2024',
 'Extra03',
 'Metadata',
 'Perfumes',
 'Extra02',
 'Fragrances',
 'Fragrances_Quality',
 'Crawled_Perfumes']

In [7]:
# Handle to the "Fragrances" collection, the one cleaned in this notebook.
fragrances_collection = perfume_database.Fragrances

## Cleaning the Fragrances Dataset

Trim and lowercase the fragrance names and the company names.

In [35]:
# Start with an empty aggregation pipeline; the cleaning stages are added below.
pipeline = []

In [36]:
# $set stage that normalises `name`: cast to string, trim, then lowercase.
# NOTE(review): a missing/null `name` presumably ends up as "" because
# $toLower maps null to an empty string — confirm that is acceptable.
_name_as_text = {'$toString': '$name'}
_name_trimmed = {'$trim': {'input': _name_as_text}}
clean_name = {'$set': {'name': {'$toLower': _name_trimmed}}}

In [37]:
# $set stage that normalises `company`: cast to string, trim, then lowercase.
# NOTE(review): a missing/null `company` presumably ends up as "" because
# $toLower maps null to an empty string — confirm that is acceptable.
_company_as_text = {'$toString': '$company'}
_company_trimmed = {'$trim': {'input': _company_as_text}}
clean_company = {'$set': {'company': {'$toLower': _company_trimmed}}}

In [38]:
# Final stage: write the pipeline output into the "Fragrances" collection.
_merge_target = "Fragrances"
merge = {"$merge": _merge_target}

In [39]:
# Assign the stages in one shot instead of extending, so re-running this cell
# cannot append a second copy of them after the $merge stage (which has to be
# the last stage of a pipeline).
pipeline[:] = [clean_name, clean_company, merge]

In [40]:
# Display the assembled pipeline for inspection before running it.
pipeline

[{'$set': {'name': {'$toLower': {'$trim': {'input': {'$toString': '$name'}}}}}},
 {'$set': {'company': {'$toLower': {'$trim': {'input': {'$toString': '$company'}}}}}},
 {'$merge': 'Fragrances'}]

In [41]:
# Document count before the cleaning pipeline runs (baseline for comparison).
fragrances_collection.count_documents({})

4563

In [42]:
# Run the cleaning pipeline; its final $merge stage targets "Fragrances".
fragrances_collection.aggregate(pipeline)

<pymongo.command_cursor.CommandCursor at 0x273ff8901d0>

In [43]:
# Count again after the pipeline ran; compare with the count taken before it.
fragrances_collection.count_documents({})

4563

## Detect Duplicates

In [8]:
# Empty pipeline that will hold the duplicate-detection stages defined below.
duplicate_detection_pipeline = []

In [9]:
# Group documents that share the same (name, company) pair, collecting the
# _id of every member and the number of members in each group.
_duplicate_key = {'name': '$name', 'company': '$company'}
group_by_name_company = {
    '$group': {
        '_id': _duplicate_key,
        'ids': {'$push': '$_id'},
        'count': {'$count': {}},
    }
}

In [10]:
# Keep only the groups that contain more than one document.
_more_than_one = {'$gt': 1}
find_duplicates = {'$match': {'count': _more_than_one}}

In [11]:
# Assign the stages in one shot instead of extending, so re-running this cell
# cannot stack a second $group/$match pair onto the pipeline.
duplicate_detection_pipeline[:] = [group_by_name_company, find_duplicates]

In [27]:
# Run duplicate detection; each result document describes one duplicate group.
result = fragrances_collection.aggregate(duplicate_detection_pipeline)

In [28]:
# Accumulates the _id values of the surplus documents to delete.
to_delete_ids = []

In [29]:
for duplicate_group in result:
    # Keep the first _id of each group; every later one is a surplus copy.
    to_delete_ids += duplicate_group['ids'][1:]

In [30]:
# Display the collected ids for a manual check before deleting anything.
to_delete_ids

[ObjectId('65c3d3942be6a0a6d64f648e'),
 ObjectId('65a6842180d8b716415b35cf'),
 ObjectId('65a6842780d8b716415b35d2'),
 ObjectId('65a6842d80d8b716415b35d5'),
 ObjectId('65c3d3942be6a0a6d64f61d2'),
 ObjectId('65c3d3942be6a0a6d64f6338'),
 ObjectId('65c3d3942be6a0a6d64f618a'),
 ObjectId('65a75c9a96d56388983d0998')]

In [31]:
# Filter matching every document whose _id is listed in to_delete_ids.
delete_query = {'_id': {'$in': to_delete_ids}}

In [32]:
# Dry run: fetch the documents the delete filter matches, without deleting.
test_res = fragrances_collection.find(delete_query)

In [33]:
# Pretty-print each document matched by the delete filter for review.
for matched_doc in test_res:
    pprint(matched_doc)

{'_id': ObjectId('65a6842180d8b716415b35cf'),
 'base notes': ['Oakmoss', 'Patchouli', 'Amber'],
 'company': 'frapin',
 'description': 'The Orchid Man by Frapin is a Aromatic fragrance for women '
                'and men. The Orchid Man was launched in 2015. The nose behind '
                'this fragrance is Jérôme Epinette. Top notes are Bergamot and '
                'Black Pepper; middle notes are Leather and Jasmine; base '
                'notes are Oakmoss, Patchouli and Amber. The Orchid Man was a '
                'nickname of the French boxer Georges Carpentier, who was a '
                'multi-talented person. After the boxing ring, he took to the '
                'stage. Carpentier the showman went from Paris to Hollywood. '
                'He experienced the heights of Wall Street and then its '
                'collapse. A modern man who was always ready for new '
                'adventures, the Orchid Man opened one of the first cocktail '
                'bars in 

In [34]:
# Delete the surplus duplicates selected by delete_query (destructive).
fragrances_collection.delete_many(delete_query)

DeleteResult({'n': 8, 'ok': 1.0}, acknowledged=True)

The deletion was applied. Double-check that no duplicates remain.

In [None]:
# Append the counting stage only once, so re-running this cell does not keep
# adding further $count stages to the shared pipeline.
count_duplicate_groups = {'$count': 'count'}
if count_duplicate_groups not in duplicate_detection_pipeline:
    duplicate_detection_pipeline.append(count_duplicate_groups)

In [45]:
# Print the first document the pipeline returns, or None when it returns
# nothing. The previous version discarded the document inside `try`, so it
# showed no output at all when duplicates were still present.
remaining_duplicates = next(
    fragrances_collection.aggregate(duplicate_detection_pipeline), None
)
print(remaining_duplicates)

None


## Address Consistency Issues

### Rating Field 
Some documents have a rating of "NA". Since no rating is available for them, the field is removed from those documents using `update_many` with the `$unset` operator.

In [28]:
# Filter selecting the documents whose rating is the placeholder string "NA".
rating_toBeRemoved = dict(rating="NA")

In [30]:
# Update document that removes the `rating` field from matched documents.
_fields_to_drop = {"rating": {}}
remove_rating_field = {'$unset': _fields_to_drop}

In [31]:
# How many documents currently carry the "NA" rating placeholder.
fragrances_collection.count_documents(rating_toBeRemoved)

12

In [32]:
# Apply the $unset update to every document matched by rating_toBeRemoved.
# Note: this rebinds `result`, which earlier held the duplicate-detection cursor.
result = fragrances_collection.update_many(rating_toBeRemoved, remove_rating_field)

In [33]:
# Show the update summary returned by update_many.
pprint(result)

UpdateResult({'n': 12, 'nModified': 12, 'ok': 1.0, 'updatedExisting': True}, acknowledged=True)


In [34]:
# Re-count the "NA" ratings after the update; 0 means all were removed.
fragrances_collection.count_documents(rating_toBeRemoved)

0

### Link and URL
Since they are the same fields, the values of url are added to an array and renamed to 