<a href="https://colab.research.google.com/github/hjtb/Data-Validation/blob/main/Product_Validation_Script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PRODUCT VALIDATION SCRIPT
# Authors - David Leon (@Dleon) and William Holton (@Wholton)
## Intro
### The following manual validation needs to be added to the products publication process to avoid failing tests:

1.   Product URL needs to start with https:// or http://
2.   Company URL starts with https://www.linkedin.com/company/
3.   Company URL needs to be in the format of https://www.linkedin.com/company/<vanity_name> instead of [..]<company_id>
4.   Showcase page URL needs to start with https://www.linkedin.com/showcase/<vanity_name>
5.   Product Category ID corresponds to category/categories assigned to product
6.   Ensure Product URL is NOT a PDF (i.e. does not end with ‘.pdf’)
**New Validations/Validations to be updated following on from 1st publication**
7.   Add validation to ensure product categories assigned to products are NOT Group Representatives
8.   Identify required fields and add a check to ensure all of those fields are filled
9.   Validation that isActive = T, and isDeprecated = F for New Products, and find errors where we have T/T or F/F. (Possible check for deprecations)

10.  Ensure all text is correctly encoded (i.e. no garbled character sequences such as `Äô`, which is what a mis-decoded `’` looks like)
11.  Ensure Product Skill ID is valid and not 0

## Install and import relevant packages, and create main Data Objects

In [None]:
# Install pygsheets to be able to connect to the spreadsheet:
!pip install pygsheets &> /dev/null 
!pip install validators
!pip install pandas==1.3  # We will need this version to use explode function on multiple columns



In [None]:
import pygsheets
import numpy as np
import pandas as pd
from pygsheets.datarange import DataRange
import pprint
import validators

In [None]:
# Get connection through Service Account credentials (Google APIs):
google_credentials = pygsheets.authorize(service_account_file=
                         './product-data-validation-5a7794651578.json')

In [None]:
# Will Credentials
# google_credentials = pygsheets.authorize(client_secret='./client_secret_will.json')

In [None]:
# Open staging sheet (publication days):
# pipeline_spreadsheet = google_credentials.open('Product Publication Staging Sheet')
# specific_pipeline_tab = pipeline_spreadsheet.worksheet_by_title('New Products')
# specific_pipeline_tab = pipeline_spreadsheet.worksheet_by_title('Product Changes')  
# for now uncomment this one and comment the one above depending on which tab you want to validate for

In [None]:
# Open selected spreadsheet:
pipeline_spreadsheet = google_credentials.open('Catalog + Product Pipeline')
# Actual current publication date sheet to process (comment if just testing/developing):
# specific_pipeline_tab = pipeline_spreadsheet.worksheet_by_title('FY22Q2 Review Period 2 (08/11-)')
# Testing/Dev tab (comment if just testing actual publication):
specific_pipeline_tab = pipeline_spreadsheet.worksheet_by_title('Will & David (FY22Q1 Review Period 2 (08/06-09/09)')

In [None]:
specific_pipeline_tab.rows

In [None]:
# Convert Pipeline sheet into pandas dataframe:
pipeline_sheet_dataframe = specific_pipeline_tab.get_as_df()
pipeline_sheet_dataframe.index += 2  # Shift indexes to match the original doc
pipeline_sheet_dataframe.head(2)

In [None]:
# Open Golden Category Status spreadsheet:
golden_spreadsheet = google_credentials.open('Golden Category Status ')
specific_golden_tab = golden_spreadsheet.worksheet_by_title('Golden Product Categories')

In [None]:
# Convert Golden sheet into pandas dataframe:
golden_sheet_dataframe = specific_golden_tab.get_as_df()
golden_sheet_dataframe.head(2)

## 1.  Product URL needs to start with https:// or http://



In [None]:
# First get the column of product URLs
product_urls = pipeline_sheet_dataframe['Product URLS']
product_urls_dataframe = product_urls.to_frame()

#### Use Validators method to validate URLs

In [None]:
# Use validators package to validate urls and assign true and false values in new column called isURLValid
def isUrlValid(url):
    """Return True when the validators package accepts `url`, False otherwise."""
    # Collapse whatever validators.url() returns into a strict bool through
    # its truthiness.
    return bool(validators.url(url))
product_urls_dataframe['isURLValid'] = product_urls_dataframe['Product URLS'].apply(isUrlValid)

In [None]:
# Get rows where url is not deemed valid
product_urls_dataframe.loc[product_urls_dataframe['isURLValid'] == False]

#### [WILL] Highlight the cells with errors in red


In [None]:
# Highlight in the Google Sheet every product URL that failed validation.
# The dataframe index was shifted by 2 when the sheet was loaded, so each
# index value is used directly as the sheet row number.
error_row_indexes = product_urls_dataframe.loc[product_urls_dataframe['isURLValid'] == False].index
for row in error_row_indexes:
  # Select the cell we wish to change the color of using the row index and the column value O for product Urls
  # NOTE(review): the column letter is hard-coded. The column listing printed
  # for this dataframe further down shows 'Product URLS' as the 10th column,
  # which would be J if the sheet data starts at column A - confirm.
  model_cell = specific_pipeline_tab.cell(f'O{row}')
  #model_row = specific_pipeline_tab.fields(f'A{row}',f'O{row}') Unable to reference the full row but will get there
  # Change the selected cells to red (presumably a red/green/blue/alpha tuple - confirm against pygsheets)
  model_cell.color = (1, 0.1, 0.1, 0.5)
  # model_row.color = (0.5, 0.1, 0.1, 0.5)

#### Use pandas method to validate URLs

In [None]:
# Get the rows that don't pass the 'http://' / 'https://' validation criteria.
# The second positional argument of Series.str.startswith is `na` (the value
# used for missing entries), not an alternative prefix, so the two accepted
# schemes are expressed as a single regex anchored at the start of the string.
# astype(str) makes non-text cells fail the check instead of being skipped.
pipeline_sheet_dataframe.loc[~product_urls.astype(str).str.match(r'https?://'), :]

In [None]:
# Collect the rows whose product URL does not start with 'http://' or
# 'https://'. Series.str.startswith('http', 'https') would treat 'https' as
# the `na` fill value rather than as a second prefix, so the schemes are
# matched with one regex anchored at the start of the string instead.
prod_url_errs = pipeline_sheet_dataframe.loc[~product_urls.astype(str).str.match(r'https?://'), :]
prod_url_errs

In [None]:
# Mark in red such cells (through indexing by product name):

## 2. Company URL starts with "https://www.linkedin.com/company/" and is deemed a valid url by the Validators package


In [None]:
# Get the company urls column
company_urls = pipeline_sheet_dataframe.loc[:, 'Comp URL']
company_urls.head()

In [None]:
# get all company urls that don't start with https://www.linkedin.com/company/
comp_url_errs = pipeline_sheet_dataframe.loc[company_urls.str.startswith('https://www.linkedin.com/company/') == False, :]
# count how many urls are incorrect
comp_url_errs

#### Use Validators method to validate URLs

In [None]:
# Use validators package to validate urls and assign true and false values in new column called isURLValid
company_urls_dataframe = company_urls.to_frame()
def isUrlValid(url):
    """Tell whether `url` passes the validators package URL check."""
    # The truthiness of validators.url()'s result is the verdict; bool()
    # turns it into a plain True/False.
    return bool(validators.url(url))
company_urls_dataframe['isURLValid'] = company_urls_dataframe['Comp URL'].apply(isUrlValid)

In [None]:
# Get rows where url is not deemed valid
company_urls_dataframe.loc[company_urls_dataframe['isURLValid'] == False]

## 3. Company URL needs to be in the format of `https://www.linkedin.com/company/'vanity_name'` instead of `[..]'company_id'`


In [None]:
# Get the part of the url that comes after the 'company/' address:
vanity_name_bit = company_urls.str.split('https://www.linkedin.com/company/', expand=True)
vanity_name_bit.head()

In [None]:
# Keep just the vanity name part and turn to series object:
vanity_name_bit = vanity_name_bit.pop(1).squeeze()  
# In case someone has added the url with the id both from admin view, or for member view:
vanity_name_bit = vanity_name_bit.str.split('/admin', expand=True).pop(0).squeeze()
vanity_name_bit = vanity_name_bit.str.split('/mycompany', expand=True).pop(0).squeeze()
vanity_name_bit.head()

In [None]:
# Get rid of forward slash or potential white spaces at the end so that it doesn't 
# interfere with isdigit() in checking if the string is only numbers:
vanity_name_bit = vanity_name_bit.str.rstrip('/ ')
vanity_name_bit.head()

In [None]:
# Check if the url corresponding with the vanity name is only numbers (i.e. company ID instead of vanity name):
vanity_name_errs = pipeline_sheet_dataframe.loc[vanity_name_bit.str.isdigit() == True, :]
vanity_name_errs

## 4. Showcase page URL needs to start with https://www.linkedin.com/showcase/<vanity_name>  and is deemed a valid url by the Validators package

In [None]:
# get the showcase urls
showcase_urls = pipeline_sheet_dataframe.loc[:, 'Showcase URL']
showcase_urls.head()

2    
3    
4    
5    
6    
Name: Showcase URL, dtype: object

In [None]:
# get non-empty showcase urls by checking if their string length is greater than 0
showcase_col_not_empty = pipeline_sheet_dataframe.loc[showcase_urls.str.len().gt(0)]
non_empty_showcase_urls = showcase_col_not_empty['Showcase URL']
non_empty_showcase_urls

27    https://www.linkedin.com/showcase/zucchettiwor...
Name: Showcase URL, dtype: object

In [None]:
# find showcase url errors where they don't begin with 'https://www.linkedin.com/showcase/'
showcase_url_errs = showcase_col_not_empty.loc[non_empty_showcase_urls.str.startswith('https://www.linkedin.com/showcase/') == False]
showcase_url_errs = pipeline_sheet_dataframe.loc[showcase_url_errs.index]
showcase_url_errs

Unnamed: 0,Category ID,Product Category,Product ID,Product name,Product Aliases,Product Description,Active,Deprecated,Product Skill ID,Product URLS,Description Locale,Company ID - OWNER,Comp URL,Company Name,Showcase ID,Showcase URL,Company/product page for display,Customer Organizations IDs,Customer Organizations Company URLs,Source of Product,In V1.1,In MVP,Has ingested IMAGES - 2020Dec09,Has ingested VIDEOS - 2020Dec09,Product notes,New Product?,Notify Pages?


#### Use Validators method to validate URLs

In [None]:
# Use validators package to validate urls and assign true and false values in new column called isURLValid
non_empty_showcase_urls_dataframe = non_empty_showcase_urls.to_frame()
def isUrlValid(url):
    """Report as a strict bool whether validators.url() accepts `url`."""
    # Only the truthiness of the validators result matters here.
    return bool(validators.url(url))
non_empty_showcase_urls_dataframe['isURLValid'] = non_empty_showcase_urls_dataframe['Showcase URL'].apply(isUrlValid)

In [None]:
# Get rows where url is not deemed valid
non_empty_showcase_urls_dataframe.loc[non_empty_showcase_urls_dataframe['isURLValid'] == False]

Unnamed: 0,Showcase URL,isURLValid


## 5. Product Category ID corresponds to category/categories assigned to product

### To-do

*   **[DONE]** Find where columns don't match in merged dataframe 
*   **[DONE]** Separate double coded products into two lines in dataframe/find another way to check these individually
*   **[DONE]** Find corresponding rows in original pipeline dataframe to highlight the errors



### Single Coded Products:

In [None]:
# This will set pandas to display as many rows of the dataframe if you need to inspect the complete set
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [None]:
# Get a Dataframe of just the Category Ids and Category names of the products in the pipeline sheet
pipeline_sheet_category_IDs_and_names = pipeline_sheet_dataframe.loc[:, "Category ID":"Product Category"]

print(pipeline_sheet_category_IDs_and_names.head())

# Rename the Cat ID column of pipeline sheet to 'URN' to use as a common value for the merge of pipeline and golden dataframes
pipeline_sheet_category_IDs_and_names_renamed = pipeline_sheet_category_IDs_and_names.rename(columns={'Category ID': 'URN'})
pipeline_sheet_category_IDs_and_names_renamed.head()

In [None]:
# Get a Dataframe of just the Category Ids and Category names from the Golden Sheet.
# Take an explicit copy: the 'URN' column of this frame is reassigned in later
# cells, and assigning into a bare .loc slice is what triggers pandas'
# "A value is trying to be set on a copy of a slice" warning.
golden_sheet_category_IDs_and_names = golden_sheet_dataframe.loc[:, "URN":"Category Name"].copy()
golden_sheet_category_IDs_and_names.head()

In [None]:
# Cast merging columns to same dtype so that the merge is effective:
golden_sheet_category_IDs_and_names['URN'] = golden_sheet_category_IDs_and_names['URN'].astype(str)
pipeline_sheet_category_IDs_and_names_renamed['URN'] = pipeline_sheet_category_IDs_and_names_renamed['URN'].astype(str)

# Move index to the dataframe to preserve it after merging dataframes (otherwise would lose index):
pipeline_sheet_category_IDs_and_names_renamed.reset_index(inplace=True)
pipeline_sheet_category_IDs_and_names_renamed.head()

In [None]:
# Merge both dataframes using an inner join on the common URN column.
# The merged frame gets a fresh index; the pipeline sheet row number travels
# in the 'index' column created by reset_index() in the previous cell.
# NOTE - multi-coded URNs (e.g. '1317,1605') find no match and are not returned here.
# NOTE - because the join is inner, any pipeline row whose URN is absent from
# the golden sheet is dropped as well, so it is not reported by this check.
golden_and_pipeline_sheets_merged = pd.merge(golden_sheet_category_IDs_and_names, pipeline_sheet_category_IDs_and_names_renamed, how="inner", on=["URN"])

# Get rid of surrounding whitespaces that could alter the comparison:
golden_and_pipeline_sheets_merged['Category Name'] = golden_and_pipeline_sheets_merged['Category Name'].str.strip()
golden_and_pipeline_sheets_merged['Product Category'] = golden_and_pipeline_sheets_merged['Product Category'].str.strip()

golden_and_pipeline_sheets_merged.head()

In [None]:
# Using the merged dataframe locate the rows where the Product Category doesn't match the value of the Category name in the golden sheet 
id_category_mismatch_errs = golden_and_pipeline_sheets_merged.loc[
          ~golden_and_pipeline_sheets_merged.apply(
                          lambda x: x['Category Name'] in x['Product Category'], axis=1)]

id_category_mismatch_errs = id_category_mismatch_errs.set_index('index').sort_index()

id_category_mismatch_errs

In [None]:
# Get all row numbers that contain a single coded category name- ID error and look them up on the original dataframe:
id_category_mismatch_errs = pipeline_sheet_dataframe.loc[id_category_mismatch_errs.index]
id_category_mismatch_errs.head()

### Dealing with multi-coded products

##### Preprocessing of the multi-coded columns:

In [None]:

def preprocess_multi_coded_df(df):

  """Isolate the multi-coded rows of the pipeline dataframe and split their
  category cells into lists, ahead of the error validations performed by
  "extract_multi_coded_errors".

  Arguments:
    - df: pipeline sheet dataframe holding at least the columns
    "Category ID" and "Product Category". It is not modified.
  Returns:
    A two-element list:
    - the untouched "Category ID" column of df (a pandas Series), used to
    look for formatting errors;
    - multi_coded_categories: a copy of the rows of df whose "Category ID"
    contains a comma, where "Category ID" has been split on ',' (each ID
    stripped of surrounding whitespace) and "Product Category" has been
    split on 'AND', both into Python lists."""

  # Get the column with the category IDs from the original Catalog + Products pipeline sheet:
  pipeline_sheet_prod_cat_id_column = df.loc[:, "Category ID"]

  # A comma in the ID cell is what marks double and triple coded cases:
  is_multi_coded = pipeline_sheet_prod_cat_id_column.astype(str).str.contains(',')

  # Work on an explicit copy so the column assignments below neither raise
  # SettingWithCopyWarning nor write through to the caller's dataframe:
  multi_coded_categories = df.loc[is_multi_coded].copy()

  # astype(str) keeps the .str accessor usable even when no row is multi-coded
  # and the remaining (empty) column has a numeric dtype.
  split_ids = multi_coded_categories['Category ID'].astype(str).str.split(',', expand=False)
  # Strip every ID: '1317, 1605' must yield '1605', not ' 1605', otherwise the
  # ID can never equal a URN when the exploded rows are merged later on.
  multi_coded_categories['Category ID'] = split_ids.map(
      lambda ids: [single_id.strip() for single_id in ids])

  # Category names are left unstripped on purpose: they are only ever compared
  # by substring containment further down the notebook.
  multi_coded_categories['Product Category'] = (
      multi_coded_categories['Product Category'].astype(str).str.split('AND', expand=False))

  return [pipeline_sheet_prod_cat_id_column, multi_coded_categories]



def extract_multi_coded_errors(df):

  """Collect the rows whose category IDs are badly formatted or whose number
  of category IDs differs from their number of category names.

  Arguments:
    - df: pipeline sheet dataframe holding at least the columns
    "Category ID" and "Product Category".
  Returns:
    - format_and_num_elemen_errs: dataframe with the formatting-error rows
    (taken from df as-is) followed by the element-count-mismatch rows (taken
    from the preprocessed multi-coded dataframe, so their two category columns
    hold lists). A row index appears only once; the first occurrence is kept.
    """

  # Get column with cat IDs from original df and the preprocessed multi-coded df:
  original_df_IDs_column, multi_coded_df = preprocess_multi_coded_df(df)

  # Count elements on the two list-valued columns only. Measuring every column
  # would fail on cells without a length (numbers, booleans) as soon as a
  # dataframe with more than these two columns is passed in.
  num_category_ids = multi_coded_df['Category ID'].map(len)
  num_category_names = multi_coded_df['Product Category'].map(len)

  # Get the actual rows with a mismatch of num elements:
  nonmatching_num_elements_original_rows = multi_coded_df.loc[
      num_category_ids != num_category_names]

  # [WIP] Find different double coding potential errors in format when inputing
  # the categories and category IDs:
  ids_as_text = original_df_IDs_column.astype(str)
  is_long = ids_as_text.str.len() >= 8
  has_comma = ids_as_text.str.contains(',')
  has_space = ids_as_text.str.contains(' ')
  format_errs = df.loc[
      is_long & (
          # Pick up cases such as '12341348' (no comma):
          ~has_comma |
          # Pick up cases such as '1234 1348' or '1643 1642,1042' (a missing comma and a whitespace):
          has_space)

      # Pick up cases incorrectly formatted by Google Sheets as '1002,1234' into '10,021,234':
      # ---- TODO ----
      ]

  # Concat format_errs and errs from non matching num elements in 'Category ID' and 'Product Category':
  format_and_num_elemen_errs = pd.concat([format_errs, nonmatching_num_elements_original_rows])
  # Get rid of duplicate rows for errors:
  format_and_num_elemen_errs = format_and_num_elemen_errs[~format_and_num_elemen_errs.index.duplicated(keep="first")]

  return format_and_num_elemen_errs

In [None]:

def extract_multi_coded_correct_and_errs(df):
  """
  Umbrella function that preprocesses, detects formatting errors, and separates
  them from rows that don't have formatting errors, returning both in different
  dataframes for further validation processing (checking that ID and category name match).

  Arguments:
    - df: pipeline sheet dataframe holding at least the columns
    "Category ID" and "Product Category".
  Returns:
    A two-element list:
    - multi_coded_df: the preprocessed multi-coded rows (multiple
    IDs/category names) minus every row reported as an error.
    - format_and_num_elemen_errs: the rows reported by
    "extract_multi_coded_errors", indexed as in df."""

  # Get the preprocessed multi-coded df (the ID column is not needed here):
  _, multi_coded_df = preprocess_multi_coded_df(df)

  format_and_num_elemen_errs = extract_multi_coded_errors(df)

  # Keep only the correctly encoded rows. Some error rows are not multi-coded
  # (e.g. a missing comma) and so are absent from multi_coded_df;
  # errors="ignore" skips those labels without the bare try/except that would
  # also have hidden any unrelated failure.
  multi_coded_df = multi_coded_df.drop(
      index=format_and_num_elemen_errs.index, errors="ignore")

  return [multi_coded_df, format_and_num_elemen_errs]


correctly_multi_coded_categories, format_multi_coded_errs = extract_multi_coded_correct_and_errs(
    pipeline_sheet_category_IDs_and_names)

format_multi_coded_errs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0,Category ID,Product Category


In [None]:
# Move index to the dataframe to preserve it after merging dataframes (otherwise would lose index):
correctly_multi_coded_categories.reset_index(inplace=True)
correctly_multi_coded_categories.head()

Unnamed: 0,index,Category ID,Product Category
0,2,"[1317, 1605]","[Telemedicine Software , Patient Engagement S..."
1,19,"[1167, 1607]","[Property Management Software , Hotel Managem..."
2,28,"[1110, 1168]","[Real Estate Marketing Software , Multiple Li..."
3,35,"[1549, 1049]","[Meeting Management Software , Video Conferen..."
4,41,"[1068, 1106]","[Local Listing Management Software , Reputati..."


In [None]:
# Separate rows that have several coded IDs/Categories into several rows with 
# sharing indexes (**explode is list dependent**):
correctly_multi_coded_categories = correctly_multi_coded_categories.explode(['Category ID', 'Product Category'])
correctly_multi_coded_categories.head()

Unnamed: 0,index,Category ID,Product Category
0,2,1317,Telemedicine Software
0,2,1605,Patient Engagement Software
1,19,1167,Property Management Software
1,19,1607,Hotel Management Software
2,28,1110,Real Estate Marketing Software


In [None]:
# Rename the Cat ID column to URN to use as a common value for the merge of both dataframes
correctly_multi_coded_categories_renamed = correctly_multi_coded_categories.rename(columns={'Category ID': 'URN'})
correctly_multi_coded_categories_renamed.head()

Unnamed: 0,index,URN,Product Category
0,2,1317,Telemedicine Software
0,2,1605,Patient Engagement Software
1,19,1167,Property Management Software
1,19,1607,Hotel Management Software
2,28,1110,Real Estate Marketing Software


##### Final merging and checking of multi-coded:

In [None]:
# Cast merging columns to same dtype so that the merge is effective:
golden_sheet_category_IDs_and_names['URN'] = golden_sheet_category_IDs_and_names['URN'].astype(str)
correctly_multi_coded_categories_renamed['URN'] = correctly_multi_coded_categories_renamed['URN'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [None]:
# Merge both dataframes using inner join on the common URN Column. The index corresponds to the row on the pipeline sheet. 
golden_and_pipeline_multi_coded_merged = pd.merge(golden_sheet_category_IDs_and_names, 
                                      correctly_multi_coded_categories_renamed, 
                                      how="inner", on="URN")

golden_and_pipeline_multi_coded_merged.head()

Unnamed: 0,URN,Category Name,index,Product Category
0,1049,Video Conferencing Software,35,Video Conferencing Software
1,1068,Local Listing Management Software,41,Local Listing Management Software
2,1074,"Governance, Risk Management, and Compliance (G...",75,"Governance, Risk Management, and Compliance (G..."
3,1106,Reputation Management Software,41,Reputation Management Software
4,1110,Real Estate Marketing Software,28,Real Estate Marketing Software


In [None]:
# Using the multi-coded dataframe to locate the rows where one or more of the Product Category doesn't match the value of the Category name in the golden sheet 
multi_id_category_mismatch_errs = golden_and_pipeline_multi_coded_merged.loc[
                      ~golden_and_pipeline_multi_coded_merged.apply(
                          lambda x: x['Category Name'] in x['Product Category'], axis=1)]

# Set the index to the original values and sort them in ascending order:
multi_id_category_mismatch_errs = multi_id_category_mismatch_errs.set_index('index').sort_index()

multi_id_category_mismatch_errs

Unnamed: 0_level_0,URN,Category Name,Product Category
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


##### Gather all errors for Category Names and Category IDs:

In [None]:
# Take the row numbers flagged with a multi-coded format error or a category mismatch error and look them up on the original dataframe (for consistency):
multi_category_format_errs = pipeline_sheet_dataframe.loc[format_multi_coded_errs.index]
multi_id_category_mismatch_errs = pipeline_sheet_dataframe.loc[multi_id_category_mismatch_errs.index]

# Concatenate both types of errors for multi-coded products:
multi_coded_errs = pd.concat([multi_category_format_errs, multi_id_category_mismatch_errs])
multi_coded_errs.head()

In [None]:
# Concatenate all errors for category names/category IDs (single and multi-coded):
category_IDs_and_names_errs = pd.concat([id_category_mismatch_errs, multi_coded_errs])
category_IDs_and_names_errs

## 6. Ensure Product URL is NOT a PDF (i.e. does not end with ‘.pdf’)


In [None]:
# Get the company urls column
company_urls = pipeline_sheet_dataframe.loc[:, 'Comp URL']

In [None]:
# Get all company urls that end with .pdf. The comparison runs on a trimmed,
# lower-cased copy of the text so that '.PDF' or a trailing space does not
# slip through:
comp_url_pdf_errs = pipeline_sheet_dataframe.loc[
    company_urls.astype(str).str.strip().str.lower().str.endswith('.pdf'), :]
# Display the offending rows
comp_url_pdf_errs

Unnamed: 0,Category ID,Product Category,Product ID,Product name,Product Aliases,Product Description,Active,Deprecated,Product Skill ID,Product URLS,Description Locale,Company ID - OWNER,Comp URL,Company Name,Showcase ID,Showcase URL,Company/product page for display,Customer Organizations IDs,Customer Organizations Company URLs,Source of Product,In V1.1,In MVP,Has ingested IMAGES - 2020Dec09,Has ingested VIDEOS - 2020Dec09,Product notes,New Product?,Notify Pages?


In [None]:
# Get the product urls column
product_urls = pipeline_sheet_dataframe.loc[:, 'Product URLS']

In [None]:
# Get all product urls that end with .pdf. The comparison runs on a trimmed,
# lower-cased copy of the text so that '.PDF' or a trailing space does not
# slip through:
product_url_pdf_errs = pipeline_sheet_dataframe.loc[
    product_urls.astype(str).str.strip().str.lower().str.endswith('.pdf'), :]
product_url_pdf_errs

Unnamed: 0,Category ID,Product Category,Product ID,Product name,Product Aliases,Product Description,Active,Deprecated,Product Skill ID,Product URLS,Description Locale,Company ID - OWNER,Comp URL,Company Name,Showcase ID,Showcase URL,Company/product page for display,Customer Organizations IDs,Customer Organizations Company URLs,Source of Product,In V1.1,In MVP,Has ingested IMAGES - 2020Dec09,Has ingested VIDEOS - 2020Dec09,Product notes,New Product?,Notify Pages?


## 7. Add validation to ensure product categories assigned to products are NOT Group Representatives **(Multi-Coded still to be added here)


In [None]:
# Get Is group representative column
group_rep_column = golden_sheet_dataframe.loc[:, 'Is group representative']

In [None]:
# Get all rows where 'is group representative' is set to 'YES'
group_representatives = golden_sheet_dataframe.loc[group_rep_column.str.contains('YES') == True, :]

In [None]:
# Sanity check that we've got the correct amount of group representatives
len(group_representatives)

60

In [None]:
# Get the urns from the rows of group representatives and convert them to a numpy array
group_representative_urns = group_representatives.loc[:, 'URN']
group_representative_urns.to_numpy()

array(['1002', '1031', '1053', '1127', '1128', '1314', '1315', '1394',
       '1395', '1396', '1397', '1398', '1399', '1400', '1401', '1402',
       '1403', '1496', '1497', '1498', '1499', '1500', '1501', '1502',
       '1503', '1504', '1505', '1506', '1507', '1508', '1509', '1510',
       '1511', '1512', '1513', '1514', '1515', '1516', '1651', '1652',
       '1653', '1654', '1736', '1737', '1738', '1739', '1740', '1741',
       '1742', '1743', '1744', '1745', '1746', '1747', '1748', '1749',
       '1750', '1751', '1752', '1753'], dtype=object)

In [None]:
# Get the Category IDs of the collected products in the pipeline sheet
category_IDs = pipeline_sheet_dataframe.loc[:, "Category ID"]

In [None]:
# Check if these category IDs are among the group representative URNs.
# Both sides are compared as stripped text, mirroring the cast done before the
# category merges (the URNs are strings, while pipeline cells may not be).
# Multi-coded cells are split on ',' so that every individual ID is checked;
# a row is flagged as soon as any one of its IDs is a group representative.
group_rep_urn_set = set(group_representative_urns.astype(str).str.strip())
individual_category_IDs = category_IDs.astype(str).str.split(',').explode().str.strip()
has_group_rep_ID = individual_category_IDs.isin(group_rep_urn_set).groupby(level=0).any()
group_rep_category_id_errs = pipeline_sheet_dataframe.loc[has_group_rep_ID]
group_rep_category_id_errs

Unnamed: 0,Category ID,Product Category,Product ID,Product name,Product Aliases,Product Description,Active,Deprecated,Product Skill ID,Product URLS,Description Locale,Company ID - OWNER,Comp URL,Company Name,Showcase ID,Showcase URL,Company/product page for display,Customer Organizations IDs,Customer Organizations Company URLs,Source of Product,In V1.1,In MVP,Has ingested IMAGES - 2020Dec09,Has ingested VIDEOS - 2020Dec09,Product notes,New Product?,Notify Pages?


## 8. Identify required fields and add a check to ensure all of those fields are filled (except for Product ID, which should be left empty) **(Needs to be adjusted for publication sheets / CHECK WEIRD PRINTING)**

<ins>Required fields</ins>: Category ID, Product Name, isActive, isDeprecated, Product URL, LinkedIn Company URL, Company Name, Source of Product, Locale. Product ID is the exception: it is checked too, but it must be left empty.

The solution will be to create a dataframe of errors for each field. Then we should be able to index into each of the fields and mark the errors independently.

In [None]:
# Create variables with the list of the names of the columns that will be accessed
# according to the need to have the field filled-in or left empty:
to_be_filled = ['Category ID', 'Product name', 'Active', 
 'Deprecated', 'Product URLS', 'Comp URL', 'Company Name', 
 'Source of Product', 'Description Locale']
to_leave_empty = 'Product ID'

# Create empty dictionary to store the results:
mandatory_fields = {}

In [None]:
import pprint

# Required columns: store, per column, the rows whose cell is blank.
# Whitespace-only cells count as blank too, since they look empty in the
# sheet but would slip past a plain == '' comparison.
for field in to_be_filled:
  accessed_column = pipeline_sheet_dataframe.loc[:, field]
  is_blank = accessed_column.astype(str).str.strip().eq('')
  mandatory_fields[field + '_Blanks(Err)'] = pipeline_sheet_dataframe.loc[is_blank, :]

# Column that must stay empty: store the rows where it has been filled in.
# This runs once after the loop (instead of inside it behind a "last element"
# test), so it is recorded even if `to_be_filled` is empty.
accessed_column = pipeline_sheet_dataframe.loc[:, to_leave_empty]
is_blank = accessed_column.astype(str).str.strip().eq('')
mandatory_fields[to_leave_empty + '_Filled(Err)'] = pipeline_sheet_dataframe.loc[~is_blank, :]

# Pretty-printer for dumping the resulting dict in a legible way:
pp = pprint.PrettyPrinter(indent=4)
# pp.pprint(mandatory_fields)

In [None]:
# Check one of the errors for a field gathered:
# rows whose 'Product ID' cell is not blank, although this column must be left empty.
mandatory_fields['Product ID_Filled(Err)']

Unnamed: 0,Category ID,Product Category,Product ID,Product name,Product Aliases,Product Description,Active,Deprecated,Product Skill ID,Product URLS,Description Locale,Company ID - OWNER,Comp URL,Company Name,Showcase ID,Showcase URL,Company/product page for display,Customer Organizations IDs,Customer Organizations Company URLs,Source of Product,In V1.1,In MVP,Has ingested IMAGES - 2020Dec09,Has ingested VIDEOS - 2020Dec09,Product notes,New Product?,Notify Pages?
2,13171605,Telemedicine Software AND Patient Engagement S...,11192,Klara,,"Easy communication, seamless telemedicine, and...",T,F,,https://www.klara.com/,en_US,5015134,https://www.linkedin.com/company/getklara/,Klara,,,,,,,YES,YES,False,True,,NO,
3,1319,Personalization Software,35051,BrainSINS,,BrainSINS transforms your online store into an...,T,F,,https://www.brainsins.com/en/,,1514855,,,,,,,,,,,,,,NO,
4,1871,Cryptocurrency Wallets,30873,BitPay,,"Download the BitPay App to securely send, rece...",T,F,,https://bitpay.com/business,,2903458,,,,,,,,,,,,,,NO,
5,1570,Mobile Development Platforms,39459,AppsGeyser,,Develop mobile apps with Android App Templates...,T,F,,https://www.appsgeyser.com/create/start/,,1889694,,,,,,,,VENDOR_GLANOS,,,,,,NO,
6,1659,Video Marketing Software,50049,Viloud,,Create online TV channel: Create linear and on...,T,F,,http://viloud.tv,,5017005,,,,,,,,VENDOR_GLANOS,,,,,,NO,
7,1569,Appointment Scheduling Software,30489,Regiondo,,Online booking system for tour and activity pr...,T,F,,http://pro.regiondo.com/,,2571085,,,,,,,,,,,,,,NO,
8,1037,Marketing Automation Software,4780,Gravitec,,"Send important news, latest updates, special o...",T,F,,https://gravitec.net/,en_US,10649970,,Gravitec.net,,,,,,Category Completion,YES,YES,False,True,,NO,
9,1792,Pop-Up Builder Software,46960,Socital,,Socital is on-site campaign toolkit for e-comm...,T,F,,http://www.socital.com/,,9356674,,,,,,,,,,,,,,NO,
10,1790,Space Management Software,5527,Cobot,,Cobot is the leading management software for c...,T,F,,https://www.cobot.me/en,en_US,2991831,,Cobot,,,,,,Category Completion,YES,YES,False,True,,NO,
11,1152,Geographic Information System (GIS) Software,1417,ArcGIS Collector,,,T,F,,https://www.esri.com/en-us/arcgis/products/arc...,en_US,5311,https://www.linkedin.com/company/esri/,Esri,,,,,,LSS Top Companies,YES,YES,,,,NO,


In [None]:
# Rows where the required 'Category ID' column is blank:
mandatory_fields['Category ID_Blanks(Err)']

Unnamed: 0,Category ID,Product Category,Product ID,Product name,Product Aliases,Product Description,Active,Deprecated,Product Skill ID,Product URLS,Description Locale,Company ID - OWNER,Comp URL,Company Name,Showcase ID,Showcase URL,Company/product page for display,Customer Organizations IDs,Customer Organizations Company URLs,Source of Product,In V1.1,In MVP,Has ingested IMAGES - 2020Dec09,Has ingested VIDEOS - 2020Dec09,Product notes,New Product?,Notify Pages?


In [None]:
# Rows where the required 'Product name' column is blank:
mandatory_fields['Product name_Blanks(Err)']

Unnamed: 0,Category ID,Product Category,Product ID,Product name,Product Aliases,Product Description,Active,Deprecated,Product Skill ID,Product URLS,Description Locale,Company ID - OWNER,Comp URL,Company Name,Showcase ID,Showcase URL,Company/product page for display,Customer Organizations IDs,Customer Organizations Company URLs,Source of Product,In V1.1,In MVP,Has ingested IMAGES - 2020Dec09,Has ingested VIDEOS - 2020Dec09,Product notes,New Product?,Notify Pages?


In [None]:
# Rows where the required 'Active' column is blank:
mandatory_fields['Active_Blanks(Err)']

Unnamed: 0,Category ID,Product Category,Product ID,Product name,Product Aliases,Product Description,Active,Deprecated,Product Skill ID,Product URLS,Description Locale,Company ID - OWNER,Comp URL,Company Name,Showcase ID,Showcase URL,Company/product page for display,Customer Organizations IDs,Customer Organizations Company URLs,Source of Product,In V1.1,In MVP,Has ingested IMAGES - 2020Dec09,Has ingested VIDEOS - 2020Dec09,Product notes,New Product?,Notify Pages?


In [None]:
# Rows where the required 'Deprecated' column is blank:
mandatory_fields['Deprecated_Blanks(Err)']

Unnamed: 0,Category ID,Product Category,Product ID,Product name,Product Aliases,Product Description,Active,Deprecated,Product Skill ID,Product URLS,Description Locale,Company ID - OWNER,Comp URL,Company Name,Showcase ID,Showcase URL,Company/product page for display,Customer Organizations IDs,Customer Organizations Company URLs,Source of Product,In V1.1,In MVP,Has ingested IMAGES - 2020Dec09,Has ingested VIDEOS - 2020Dec09,Product notes,New Product?,Notify Pages?


In [None]:
# Rows where the required 'Product URLS' column is blank:
mandatory_fields['Product URLS_Blanks(Err)']

Unnamed: 0,Category ID,Product Category,Product ID,Product name,Product Aliases,Product Description,Active,Deprecated,Product Skill ID,Product URLS,Description Locale,Company ID - OWNER,Comp URL,Company Name,Showcase ID,Showcase URL,Company/product page for display,Customer Organizations IDs,Customer Organizations Company URLs,Source of Product,In V1.1,In MVP,Has ingested IMAGES - 2020Dec09,Has ingested VIDEOS - 2020Dec09,Product notes,New Product?,Notify Pages?


In [None]:
# Rows where the required 'Comp URL' column is blank:
mandatory_fields['Comp URL_Blanks(Err)']

Unnamed: 0,Category ID,Product Category,Product ID,Product name,Product Aliases,Product Description,Active,Deprecated,Product Skill ID,Product URLS,Description Locale,Company ID - OWNER,Comp URL,Company Name,Showcase ID,Showcase URL,Company/product page for display,Customer Organizations IDs,Customer Organizations Company URLs,Source of Product,In V1.1,In MVP,Has ingested IMAGES - 2020Dec09,Has ingested VIDEOS - 2020Dec09,Product notes,New Product?,Notify Pages?
3,1319,Personalization Software,35051,BrainSINS,,BrainSINS transforms your online store into an...,T,F,,https://www.brainsins.com/en/,,1514855,,,,,,,,,,,,,,NO,
4,1871,Cryptocurrency Wallets,30873,BitPay,,"Download the BitPay App to securely send, rece...",T,F,,https://bitpay.com/business,,2903458,,,,,,,,,,,,,,NO,
5,1570,Mobile Development Platforms,39459,AppsGeyser,,Develop mobile apps with Android App Templates...,T,F,,https://www.appsgeyser.com/create/start/,,1889694,,,,,,,,VENDOR_GLANOS,,,,,,NO,
6,1659,Video Marketing Software,50049,Viloud,,Create online TV channel: Create linear and on...,T,F,,http://viloud.tv,,5017005,,,,,,,,VENDOR_GLANOS,,,,,,NO,
7,1569,Appointment Scheduling Software,30489,Regiondo,,Online booking system for tour and activity pr...,T,F,,http://pro.regiondo.com/,,2571085,,,,,,,,,,,,,,NO,
8,1037,Marketing Automation Software,4780,Gravitec,,"Send important news, latest updates, special o...",T,F,,https://gravitec.net/,en_US,10649970,,Gravitec.net,,,,,,Category Completion,YES,YES,False,True,,NO,
9,1792,Pop-Up Builder Software,46960,Socital,,Socital is on-site campaign toolkit for e-comm...,T,F,,http://www.socital.com/,,9356674,,,,,,,,,,,,,,NO,
10,1790,Space Management Software,5527,Cobot,,Cobot is the leading management software for c...,T,F,,https://www.cobot.me/en,en_US,2991831,,Cobot,,,,,,Category Completion,YES,YES,False,True,,NO,
12,1833,Supervisory Control and Data Acquisition (SCAD...,1010538,Topkapi,,"Solution multi-sectorielle, Topkapi est une pl...",T,F,,https://www.areal-topkapi.com/topkapi/presenta...,,110693,,,,,,,,,,,,,,NO,
13,1011,Application Performance Monitoring (APM) Software,33841,Aternity Application Performance Monitoring,Aternity APM,Microservices deployed in containers across dy...,T,F,,https://www.aternity.com/application-performan...,,25350,,,,,,,,VENDOR_GLANOS,,,,,,NO,


In [None]:
# Rows where the required 'Company Name' column is blank:
mandatory_fields['Company Name_Blanks(Err)']

Unnamed: 0,Category ID,Product Category,Product ID,Product name,Product Aliases,Product Description,Active,Deprecated,Product Skill ID,Product URLS,Description Locale,Company ID - OWNER,Comp URL,Company Name,Showcase ID,Showcase URL,Company/product page for display,Customer Organizations IDs,Customer Organizations Company URLs,Source of Product,In V1.1,In MVP,Has ingested IMAGES - 2020Dec09,Has ingested VIDEOS - 2020Dec09,Product notes,New Product?,Notify Pages?
3,1319,Personalization Software,35051,BrainSINS,,BrainSINS transforms your online store into an...,T,F,,https://www.brainsins.com/en/,,1514855,,,,,,,,,,,,,,NO,
4,1871,Cryptocurrency Wallets,30873,BitPay,,"Download the BitPay App to securely send, rece...",T,F,,https://bitpay.com/business,,2903458,,,,,,,,,,,,,,NO,
5,1570,Mobile Development Platforms,39459,AppsGeyser,,Develop mobile apps with Android App Templates...,T,F,,https://www.appsgeyser.com/create/start/,,1889694,,,,,,,,VENDOR_GLANOS,,,,,,NO,
6,1659,Video Marketing Software,50049,Viloud,,Create online TV channel: Create linear and on...,T,F,,http://viloud.tv,,5017005,,,,,,,,VENDOR_GLANOS,,,,,,NO,
7,1569,Appointment Scheduling Software,30489,Regiondo,,Online booking system for tour and activity pr...,T,F,,http://pro.regiondo.com/,,2571085,,,,,,,,,,,,,,NO,
9,1792,Pop-Up Builder Software,46960,Socital,,Socital is on-site campaign toolkit for e-comm...,T,F,,http://www.socital.com/,,9356674,,,,,,,,,,,,,,NO,
12,1833,Supervisory Control and Data Acquisition (SCAD...,1010538,Topkapi,,"Solution multi-sectorielle, Topkapi est une pl...",T,F,,https://www.areal-topkapi.com/topkapi/presenta...,,110693,,,,,,,,,,,,,,NO,
13,1011,Application Performance Monitoring (APM) Software,33841,Aternity Application Performance Monitoring,Aternity APM,Microservices deployed in containers across dy...,T,F,,https://www.aternity.com/application-performan...,,25350,,,,,,,,VENDOR_GLANOS,,,,,,NO,
14,1072,Electronic Data Interchange (EDI) Software,38089,EXIM EDI,,MicroD has ecommerce solutions to help the fur...,T,F,,https://www.microdinc.com/electronic-data-inte...,,939541,,,,,,,,VENDOR_GLANOS,,,,,,NO,
15,1551,Online Community Software,36376,ProBoards,,Each ProBoards free forum comes with hundreds ...,T,F,,https://www.proboards.com/,,7693670,,,,,,,,VENDOR_GLANOS,,,,,,NO,


In [None]:
# Rows where the required 'Source of Product' column is blank:
mandatory_fields['Source of Product_Blanks(Err)']

Unnamed: 0,Category ID,Product Category,Product ID,Product name,Product Aliases,Product Description,Active,Deprecated,Product Skill ID,Product URLS,Description Locale,Company ID - OWNER,Comp URL,Company Name,Showcase ID,Showcase URL,Company/product page for display,Customer Organizations IDs,Customer Organizations Company URLs,Source of Product,In V1.1,In MVP,Has ingested IMAGES - 2020Dec09,Has ingested VIDEOS - 2020Dec09,Product notes,New Product?,Notify Pages?
2,13171605,Telemedicine Software AND Patient Engagement S...,11192,Klara,,"Easy communication, seamless telemedicine, and...",T,F,,https://www.klara.com/,en_US,5015134,https://www.linkedin.com/company/getklara/,Klara,,,,,,,YES,YES,False,True,,NO,
3,1319,Personalization Software,35051,BrainSINS,,BrainSINS transforms your online store into an...,T,F,,https://www.brainsins.com/en/,,1514855,,,,,,,,,,,,,,NO,
4,1871,Cryptocurrency Wallets,30873,BitPay,,"Download the BitPay App to securely send, rece...",T,F,,https://bitpay.com/business,,2903458,,,,,,,,,,,,,,NO,
7,1569,Appointment Scheduling Software,30489,Regiondo,,Online booking system for tour and activity pr...,T,F,,http://pro.regiondo.com/,,2571085,,,,,,,,,,,,,,NO,
9,1792,Pop-Up Builder Software,46960,Socital,,Socital is on-site campaign toolkit for e-comm...,T,F,,http://www.socital.com/,,9356674,,,,,,,,,,,,,,NO,
12,1833,Supervisory Control and Data Acquisition (SCAD...,1010538,Topkapi,,"Solution multi-sectorielle, Topkapi est une pl...",T,F,,https://www.areal-topkapi.com/topkapi/presenta...,,110693,,,,,,,,,,,,,,NO,
17,1026,E-Commerce Platforms,1666400,Pollen,,Pollen is the first solution to make the consu...,T,F,,www.pollenreturns.com,,66608850,,,,,,,,,,,,,,NO,
23,1067,Job Boards Software,37040,Adicio,,"White-label platforms for job boards, real est...",T,F,,https://www.adicio.com/how-we-help-you/careerc...,,34496,,,,,,,,,,,,,,NO,
24,1440,Email Tracking Software,43922,ZeroBounce,,Our Email Validation Service quickly and accur...,T,F,,https://www.zerobounce.net,,18249430,,,,,,,,,,,,,,NO,
26,1081,Conversational AI Software,30601,Solvemate,,Solvemate enables brands to deliver quality cu...,T,F,,https://www.solvemate.com/,,10497605,,,,,,,,,,,,,,NO,


In [None]:
# Rows where the required 'Description Locale' column is blank:
mandatory_fields['Description Locale_Blanks(Err)']

Unnamed: 0,Category ID,Product Category,Product ID,Product name,Product Aliases,Product Description,Active,Deprecated,Product Skill ID,Product URLS,Description Locale,Company ID - OWNER,Comp URL,Company Name,Showcase ID,Showcase URL,Company/product page for display,Customer Organizations IDs,Customer Organizations Company URLs,Source of Product,In V1.1,In MVP,Has ingested IMAGES - 2020Dec09,Has ingested VIDEOS - 2020Dec09,Product notes,New Product?,Notify Pages?
3,1319,Personalization Software,35051,BrainSINS,,BrainSINS transforms your online store into an...,T,F,,https://www.brainsins.com/en/,,1514855,,,,,,,,,,,,,,NO,
4,1871,Cryptocurrency Wallets,30873,BitPay,,"Download the BitPay App to securely send, rece...",T,F,,https://bitpay.com/business,,2903458,,,,,,,,,,,,,,NO,
5,1570,Mobile Development Platforms,39459,AppsGeyser,,Develop mobile apps with Android App Templates...,T,F,,https://www.appsgeyser.com/create/start/,,1889694,,,,,,,,VENDOR_GLANOS,,,,,,NO,
6,1659,Video Marketing Software,50049,Viloud,,Create online TV channel: Create linear and on...,T,F,,http://viloud.tv,,5017005,,,,,,,,VENDOR_GLANOS,,,,,,NO,
7,1569,Appointment Scheduling Software,30489,Regiondo,,Online booking system for tour and activity pr...,T,F,,http://pro.regiondo.com/,,2571085,,,,,,,,,,,,,,NO,
9,1792,Pop-Up Builder Software,46960,Socital,,Socital is on-site campaign toolkit for e-comm...,T,F,,http://www.socital.com/,,9356674,,,,,,,,,,,,,,NO,
12,1833,Supervisory Control and Data Acquisition (SCAD...,1010538,Topkapi,,"Solution multi-sectorielle, Topkapi est une pl...",T,F,,https://www.areal-topkapi.com/topkapi/presenta...,,110693,,,,,,,,,,,,,,NO,
13,1011,Application Performance Monitoring (APM) Software,33841,Aternity Application Performance Monitoring,Aternity APM,Microservices deployed in containers across dy...,T,F,,https://www.aternity.com/application-performan...,,25350,,,,,,,,VENDOR_GLANOS,,,,,,NO,
14,1072,Electronic Data Interchange (EDI) Software,38089,EXIM EDI,,MicroD has ecommerce solutions to help the fur...,T,F,,https://www.microdinc.com/electronic-data-inte...,,939541,,,,,,,,VENDOR_GLANOS,,,,,,NO,
15,1551,Online Community Software,36376,ProBoards,,Each ProBoards free forum comes with hundreds ...,T,F,,https://www.proboards.com/,,7693670,,,,,,,,VENDOR_GLANOS,,,,,,NO,


## 9. Validation that isActive = T and isDeprecated = F for New Products. **Find errors where we have T/T or F/F. (Possible check for product deprecations using product change notes)**


In [None]:
# Pull the "Active" (isActive) column of the collected products and preview its first rows
is_active_values = pipeline_sheet_dataframe["Active"]
is_active_values.head(5)

2    T
3    T
4    T
5    T
6    T
Name: Active, dtype: object

In [None]:
# Show the rows whose Active value does not contain a 'T'
active_lacks_t = is_active_values.str.contains('T').eq(False)
pipeline_sheet_dataframe.loc[active_lacks_t]

Unnamed: 0,Category ID,Product Category,Product ID,Product name,Product Aliases,Product Description,Active,Deprecated,Product Skill ID,Product URLS,Description Locale,Company ID - OWNER,Comp URL,Company Name,Showcase ID,Showcase URL,Company/product page for display,Customer Organizations IDs,Customer Organizations Company URLs,Source of Product,In V1.1,In MVP,Has ingested IMAGES - 2020Dec09,Has ingested VIDEOS - 2020Dec09,Product notes,New Product?,Notify Pages?


In [None]:
# Pull the "Deprecated" (isDeprecated) column of the collected products and preview its first rows
is_deprecated_values = pipeline_sheet_dataframe["Deprecated"]
is_deprecated_values.head(5)

2    F
3    F
4    F
5    F
6    F
Name: Deprecated, dtype: object

In [None]:
# Show the rows whose Deprecated value does not contain an 'F'
deprecated_lacks_f = is_deprecated_values.str.contains('F').eq(False)
pipeline_sheet_dataframe.loc[deprecated_lacks_f]

Unnamed: 0,Category ID,Product Category,Product ID,Product name,Product Aliases,Product Description,Active,Deprecated,Product Skill ID,Product URLS,Description Locale,Company ID - OWNER,Comp URL,Company Name,Showcase ID,Showcase URL,Company/product page for display,Customer Organizations IDs,Customer Organizations Company URLs,Source of Product,In V1.1,In MVP,Has ingested IMAGES - 2020Dec09,Has ingested VIDEOS - 2020Dec09,Product notes,New Product?,Notify Pages?


In [None]:
# Get the rows where isActive and isDeprecated carry the same value
# (both 'T'/'True' or both 'F'/'False').
# Each flag is normalised (trimmed, upper-cased) and matched exactly. The
# previous substring test (str.contains) made the 'True'/'False' clauses
# redundant and would also accept any other text containing a 'T' or an 'F'.
flag_meanings = {'T': True, 'TRUE': True, 'F': False, 'FALSE': False}
active_flags = is_active_values.astype(str).str.strip().str.upper().map(flag_meanings)
deprecated_flags = is_deprecated_values.astype(str).str.strip().str.upper().map(flag_meanings)
# Blank or unrecognised values map to NaN and are never reported as a T/T or F/F pair.
conditions = active_flags.notna() & deprecated_flags.notna() & (active_flags == deprecated_flags)
pipeline_sheet_dataframe.loc[conditions]

Unnamed: 0,Category ID,Product Category,Product ID,Product name,Product Aliases,Product Description,Active,Deprecated,Product Skill ID,Product URLS,Description Locale,Company ID - OWNER,Comp URL,Company Name,Showcase ID,Showcase URL,Company/product page for display,Customer Organizations IDs,Customer Organizations Company URLs,Source of Product,In V1.1,In MVP,Has ingested IMAGES - 2020Dec09,Has ingested VIDEOS - 2020Dec09,Product notes,New Product?,Notify Pages?


## 10.  **Ensure all text is correctly encoded Unicode (i.e. no garbled characters like Äô)**


## 11.  **Ensure Product Skill ID is valid and not 0**



In [None]:
# Validation 11 works on the "Product Skill ID" column of the pipeline sheet:
product_skills = pipeline_sheet_dataframe.loc[:, 'Product Skill ID']
# One-column DataFrame built from the Skill IDs.
# NOTE(review): the name says "urls" but this holds Skill IDs; it looks like a
# copy-paste leftover. Confirm nothing relies on it before renaming.
product_urls_dataframe = product_skills.to_frame()

In [None]:
# Work on a string version of the column so that numbers and blank cells are
# handled uniformly:
product_skills = product_skills.astype(str)

# Catch cases in which Skill ID is 0.
# The value is parsed as a number and compared with 0 exactly; a substring
# test such as str.contains('0') would also flag valid IDs that merely contain
# a zero digit (e.g. 1630). Blank or non-numeric cells become NaN and are not
# flagged by this check.
skill_id_is_zero = pd.to_numeric(product_skills.str.strip(), errors='coerce').eq(0)
product_skills_errs = pipeline_sheet_dataframe.loc[skill_id_is_zero]
product_skills_errs

Unnamed: 0,Category ID,Product Category,Product ID,Product name,Product Aliases,Product Description,Active,Deprecated,Product Skill ID,Product URLS,Description Locale,Company ID - OWNER,Comp URL,Company Name,Showcase ID,Showcase URL,Company/product page for display,Customer Organizations IDs,Customer Organizations Company URLs,Source of Product,In V1.1,In MVP,Has ingested IMAGES - 2020Dec09,Has ingested VIDEOS - 2020Dec09,Product notes,New Product?,Notify Pages?
62,1003,Desktop Publishing Software,1027,QuarkXPress,,QuarkXPress is the Best leading desktop publis...,T,F,1630,http://www.quark.com/Products/QuarkXPress/,en_US,4238,,,,,,,,Pages Team,YES,YES,False,True,,NO,


## [WIP] (Last step) Mark in <span style="color:red">red</span> all errors for all validations:


In [None]:
# DataRange is used for storing/manipulating a range of data in a worksheet
# Using DataRange we will change the color of the selected range on the sheet
from pygsheets.datarange import DataRange

In [None]:
# Select the cell we wish to change the color of
# (cell A2 of the pipeline tab; it is later passed to apply_format as the formatting model):
model_cell = specific_pipeline_tab.cell('A2')

In [None]:
# Change the selected cell to white.
# NOTE(review): the tuple is presumably (red, green, blue, alpha) with each
# component in the 0-1 range — confirm against the pygsheets Cell.color docs.
model_cell.color = (1,1,1, 0.5)

In [None]:

# Change the selected cell to red.
# NOTE(review): the tuple is presumably (red, green, blue, alpha) with each
# component in the 0-1 range — confirm against the pygsheets Cell.color docs.
model_cell.color = (1, 0.1, 0.1, 0.5)

In [None]:
# Apply the format changes to the sheet:
# the range A2:A2 of the pipeline tab is formatted using model_cell as the model.
DataRange('A2','A2', worksheet=specific_pipeline_tab).apply_format(model_cell)

In [None]:
#  David's attempt -------------------------------------------------------------
# Read row 1 of the worksheet (the column headers), keeping trailing empty
# cells, and pair each header with its 1-based position:
column_headers = specific_pipeline_tab.get_row(1, include_tailing_empty=True)
col_enumerated_headers = [
    (position, header) for position, header in enumerate(column_headers, start=1)
]

# Open question: should the numeric positions be replaced by column letters? Is it necessary?

In [None]:
# Keep only the (position, header) pairs whose header is not an empty string.
# (This may be the place to switch to an alphabetic column naming.)
col_enumerated_headers = list(filter(lambda pair: pair[1], col_enumerated_headers))

In [None]:
# Build a look-up table from column header to its position in the sheet.
# First map position -> header, then flip keys and values:
position_to_header = dict(col_enumerated_headers)
lookup_table = {header: position for position, header in position_to_header.items()}
# NOTE(review): the dict itself keeps insertion order; the sorted keys seen in
# the output presumably come from the notebook's display — confirm.
lookup_table

{'Active': 7,
 'Category ID': 1,
 'Comp URL': 13,
 'Company ID - OWNER': 12,
 'Company Name': 14,
 'Company/product page for display': 17,
 'Customer Organizations Company URLs': 19,
 'Customer Organizations IDs': 18,
 'Deprecated': 8,
 'Description Locale': 11,
 'Has ingested IMAGES - 2020Dec09': 23,
 'Has ingested VIDEOS - 2020Dec09': 24,
 'In MVP ': 22,
 'In V1.1': 21,
 'New Product?': 26,
 'Notify Pages?': 27,
 'Product Aliases': 5,
 'Product Category': 2,
 'Product Description': 6,
 'Product ID': 3,
 'Product Skill ID': 9,
 'Product URLS': 10,
 'Product name': 4,
 'Product notes': 25,
 'Showcase ID': 15,
 'Showcase URL': 16,
 'Source of Product': 20}