In [1]:
from datetime import datetime

from pymongo import MongoClient, InsertOne, UpdateOne
import mlflow
from omegaconf import OmegaConf



import urllib.request
import tempfile
from gcsfs import GCSFileSystem



In [2]:

def mongo_client(config_file_path):
    """Create a MongoClient from the settings stored in a config file.

    The file is loaded with OmegaConf and the following keys are read from
    its ``main`` section: ``server_ip``, ``port``, ``login`` and, only when
    ``login`` is truthy, ``user`` and ``passwd``.

    Args:
        config_file_path: Path of the configuration file to load.

    Returns:
        A ``MongoClient`` pointing at ``mongodb://<server_ip>:<port>``,
        authenticated with ``user``/``passwd`` when ``login`` is enabled.
    """
    settings = OmegaConf.load(config_file_path).main
    uri = f'mongodb://{settings.server_ip}:{settings.port}'

    # Anonymous connection: no credentials are read from the config at all.
    if not settings.login:
        return MongoClient(uri)

    return MongoClient(uri,
                       username=settings.user,
                       password=settings.passwd)



class FileHandler():
    """Download remote files and store them in Google Cloud Storage.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, gc_project):
        """Open a GCS filesystem handle for the given project.

        Args:
            gc_project: Project identifier passed to ``GCSFileSystem``.
        """
        # Filesystem object used for every write and metadata lookup.
        self.file_system = GCSFileSystem(project=gc_project)

    def spot_download_helper(self, url, dest_name):
        """Fetch ``url`` and store its content under ``tv_spots_store/``.

        The download is completed into a local temporary file *before* the
        destination object is opened, so a failed download no longer leaves
        an empty object behind in the bucket.

        Args:
            url: Location of the file to download (anything
                ``urllib.request.urlretrieve`` accepts).
            dest_name: Object name to use below ``tv_spots_store/``.

        Returns:
            ``(dest_file, info)`` on success, where ``dest_file`` is the full
            destination path and ``info`` is the filesystem's metadata for
            the stored object. If the download or the upload fails, the
            error message is returned as a plain ``str`` instead (kept for
            backward compatibility with existing callers).
        """
        # Local import: keeps this block self-contained without touching the
        # notebook's import cell.
        import shutil

        dest_file = 'tv_spots_store/' + dest_name
        print(f"FILENAME:  {dest_file}")

        with tempfile.NamedTemporaryFile() as bin_file:
            try:
                urllib.request.urlretrieve(url, bin_file.name)
                # urlretrieve wrote through its own handle; make sure ours
                # reads from the very beginning of the file.
                bin_file.seek(0)
                with self.file_system.open(dest_file, mode='wb') as file:
                    # Stream in chunks instead of loading the whole payload
                    # into memory.
                    shutil.copyfileobj(bin_file, file)
            except Exception as e:
                # Best-effort contract: report the failure as text.
                return str(e)

        # Query the metadata only after the write handle has been closed, so
        # it describes the stored object rather than an in-progress upload.
        file_url = self.file_system.info(dest_file)
        return (dest_file, file_url)

In [3]:
# Connect to MongoDB with the settings from conf/mongo-config.yaml.
# If no server is running yet, start one first, e.g.:
#   docker run --name testing-mongo -p 27017:27017 mongo:latest
server = mongo_client('conf/mongo-config.yaml')
db = server['TV_scan']


In [4]:
# Optional step: run the 'spotreg_update' MLflow project.
# TODO: decide whether this update is needed on every run.
# NOTE(review): the saved log of this cell echoes the --wandblogin value;
# clear the cell output before sharing the notebook.
mlflow.run(uri='components/spotreg_update')


2023/02/07 17:10:26 INFO mlflow.projects.docker: === Building docker image register_update:9513e08 ===
2023/02/07 17:10:26 INFO mlflow.projects.utils: === Created directory /tmp/tmp30x_b1nr for downloading remote URIs passed to arguments of type 'path' ===
2023/02/07 17:10:26 INFO mlflow.projects.backend.local: === Running command 'docker run --rm -v /home/laertio/projects/dev/tv_scan/mlruns:/mlflow/tmp/mlruns -v /home/laertio/projects/dev/tv_scan/mlruns/0/c93f455ed5bb4f78ab145e6fdc003a4f/artifacts:/home/laertio/projects/dev/tv_scan/mlruns/0/c93f455ed5bb4f78ab145e6fdc003a4f/artifacts -v /home/laertio/projects/dev/tv_scan/components/spotreg_update/data_in:/data_in -v /home/laertio/projects/dev/tv_scan/conf:/conf -e MLFLOW_RUN_ID=c93f455ed5bb4f78ab145e6fdc003a4f -e MLFLOW_TRACKING_URI=file:///mlflow/tmp/mlruns -e MLFLOW_EXPERIMENT_ID=0 register_update:9513e08 python run.py tv_scan_test input_sample.xlsx /conf/mongo-config.yaml spots_inventory --wandblogin=ea7d4ed782356f66850586612f115565

<mlflow.projects.submitted_run.LocalSubmittedRun at 0x7fb088f1abf0>

In [None]:
# Below are the names of the collections:
#db['spots_inventory']
#db['Proc_Jobs']

In [5]:
# MongoDB filter cheat-sheet
# (https://www.programmingfunda.com/filter-records-in-mongodb-using-python/):
#   exact match        {'field1': 'value', 'field2': 'value'}
#   number comparison  {"field": {"$gt": 25}} OR {"field": {"$lt": 25}}
#                      OR {"field": {"$lt": 25, "$gt": 25}}
#   in list of values  {'field': {'$in': ['item1', 'item2']}}
#   regex              {"field": {"$regex": "^M"}}
# More operators: https://www.mongodb.com/docs/manual/reference/operator/query/

# Filter applied to the spots inventory in the cells below.
filtering = dict(BrandDesc='PEUGEOT')

In [11]:
# Query the 'spots_inventory' collection for documents matching `filtering`.
cursor = db.spots_inventory.find(filter=filtering)

In [7]:
# INSPECT CURSOR
# Slicing a pymongo cursor applies skip/limit to that same cursor object, and
# iterating it consumes it. Sample from a clone instead so `cursor` stays
# untouched for later cells and this cell can be re-run safely.
docs_toinspect = list(cursor.clone()[1:5])



<pymongo.cursor.Cursor at 0x7fb088f1b010>

In [8]:
# Show the sampled documents (rendered as this cell's output).
docs_toinspect

[{'_id': ObjectId('63e2860c6b85a157d7697c01'),
  'SpotCode': 417291,
  'SpotDesc': 'LIDER DE VENDAS EM PT 2021.MELHOR MARCA AUTOMOVEL 9\\ANO',
  'VehicleDesc': 'SIC',
  'BrandDesc': 'PEUGEOT',
  'SubBrandDesc': nan,
  'SectorDesc': 'INDUSTRIA AUTOMOVEL',
  'CategoryDesc': 'VEICULOS A MOTOR',
  'ClassDesc': 'AUTOMOVEIS',
  'SubClassDesc': 'OUTROS OU DIVERSOS',
  'AdvertiserDesc': 'PEUGEOT PORTUGAL AUTOMOVEIS SA',
  'MediaFile': 417291,
  'MediaFileOldUrl': 'http://e-sol.mediamonitor.pt/SOLFILES/TELEVISAO/20220103/417291.wmv'},
 {'_id': ObjectId('63e2860c6b85a157d7697c02'),
  'SpotCode': 417315,
  'SpotDesc': 'LIDER DE VENDAS PT 2021.MELHOR MARCA AUTO 9\\ANO-SH SICNOT',
  'VehicleDesc': 'SIC Noticias',
  'BrandDesc': 'PEUGEOT',
  'SubBrandDesc': nan,
  'SectorDesc': 'INDUSTRIA AUTOMOVEL',
  'CategoryDesc': 'VEICULOS A MOTOR',
  'ClassDesc': 'AUTOMOVEIS',
  'SubClassDesc': 'OUTROS OU DIVERSOS',
  'AdvertiserDesc': 'PEUGEOT PORTUGAL AUTOMOVEIS SA',
  'MediaFile': 417315,
  'MediaFileOldUrl

In [12]:
# Collect the _id of every document matching `filtering`.
# A fresh query is issued here instead of iterating the shared `cursor`:
# that cursor may already have been sliced or exhausted by an earlier cell,
# which would silently produce an empty or partial job list.
# Only the _id field is requested because nothing else is used.
docs_toproc = [
    doc['_id']
    for doc in db['spots_inventory'].find(filtering, {'_id': 1})
]
    

In [14]:
# Register a processing job that lists the selected documents.
job_reference = 'test_peugeot'
# Insertion timestamp, stored as a "YYYY-MM-DD HH:MM:SS" string (local time).
now = f"{datetime.now():%Y-%m-%d %H:%M:%S}"

db['Proc_Jobs'].insert_one(
    dict(
        doc_list=docs_toproc,
        j_ref=job_reference,
        status='Pending Processing.',
        ins_timestamp=now,
    )
)

<pymongo.results.InsertOneResult at 0x7fb08aa64760>

In [15]:
# GET DATA
# Run the 'get_data' MLflow project for the job registered above.
# NOTE(review): the saved log of this cell echoes the --wandblogin value;
# clear the cell output before sharing the notebook.
run_parameters = dict(job_reference=job_reference)

mlflow.run(uri='components/get_data', parameters=run_parameters)

2023/02/07 17:13:24 INFO mlflow.projects.docker: === Building docker image get_data:9513e08 ===
2023/02/07 17:13:25 INFO mlflow.projects.utils: === Created directory /tmp/tmpgdm64ioe for downloading remote URIs passed to arguments of type 'path' ===
2023/02/07 17:13:25 INFO mlflow.projects.backend.local: === Running command 'docker run --rm -v /home/laertio/projects/dev/tv_scan/mlruns:/mlflow/tmp/mlruns -v /home/laertio/projects/dev/tv_scan/mlruns/0/7b961d90746c4b2ca50810aa1e0f4b84/artifacts:/home/laertio/projects/dev/tv_scan/mlruns/0/7b961d90746c4b2ca50810aa1e0f4b84/artifacts -v /home/laertio/projects/dev/tv_scan/conf:/conf -e MLFLOW_RUN_ID=7b961d90746c4b2ca50810aa1e0f4b84 -e MLFLOW_TRACKING_URI=file:///mlflow/tmp/mlruns -e MLFLOW_EXPERIMENT_ID=0 get_data:9513e08 python run.py tv_scan_test spots_inventory /conf/mongo-config.yaml test_peugeot --wandblogin ea7d4ed782356f66850586612f115565b6a8a0c3' in run with ID '7b961d90746c4b2ca50810aa1e0f4b84' === 
wandb: W&B API key is configured. U

<mlflow.projects.submitted_run.LocalSubmittedRun at 0x7fb088dc9900>

In [17]:
# FEATURE EXTRACTION
# Run the 'feature_extraction' MLflow project for the same job reference.
# NOTE(review): the saved log of this cell echoes the --wandblogin value;
# clear the cell output before sharing the notebook.
run_parameters = dict(job_reference=job_reference)

mlflow.run(uri='components/feature_extraction', parameters=run_parameters)


2023/02/07 17:21:12 INFO mlflow.projects.docker: === Building docker image feature_extraction:9513e08 ===
2023/02/07 17:21:13 INFO mlflow.projects.utils: === Created directory /tmp/tmpjup9avb4 for downloading remote URIs passed to arguments of type 'path' ===
2023/02/07 17:21:13 INFO mlflow.projects.backend.local: === Running command 'docker run --rm -v /home/laertio/projects/dev/tv_scan/mlruns:/mlflow/tmp/mlruns -v /home/laertio/projects/dev/tv_scan/mlruns/0/95ab41bbdd8a42499b2c3ff97c1c17f6/artifacts:/home/laertio/projects/dev/tv_scan/mlruns/0/95ab41bbdd8a42499b2c3ff97c1c17f6/artifacts -v /home/laertio/projects/dev/tv_scan/conf:/conf -e MLFLOW_RUN_ID=95ab41bbdd8a42499b2c3ff97c1c17f6 -e MLFLOW_TRACKING_URI=file:///mlflow/tmp/mlruns -e MLFLOW_EXPERIMENT_ID=0 feature_extraction:9513e08 python run.py tv_scan_test spots_inventory /conf/mongo-config.yaml test_peugeot tiny --wandblogin ea7d4ed782356f66850586612f115565b6a8a0c3' in run with ID '95ab41bbdd8a42499b2c3ff97c1c17f6' === 
wandb: W&B

<mlflow.projects.submitted_run.LocalSubmittedRun at 0x7fb088260d60>

In [None]:
# CHECK - 