In [None]:
import pandas as pd


def process_data(data):
    """Turn the raw notice table into a feature frame and a target series.

    The function drops the columns that are not used as features, splits
    ``cpvCode`` on ``'-'`` into a category part (before the hyphen) and a
    subcategory part (after the hyphen), one-hot encodes both parts and
    removes ``cpvCode`` itself.

    The input frame is never modified: every step works on a new frame, so
    the function can safely be called again on the same input (for example
    when a notebook cell is re-run).

    Args:
        data: DataFrame containing every column named in the drop list
            below, plus ``cpvCode`` (strings) and ``organizationId``.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the processed frame (remaining
        columns followed by the ``cpvCategory_*`` and ``cpvSubcategory_*``
        indicator columns) and ``y`` is the ``organizationId`` column.

    Raises:
        KeyError: If one of the expected columns is missing from ``data``.
    """
    irrelevant_columns = [
        'index',
        'noticeNo',
        'title',
        'countyCode',
        'contractValue',
        'cpvCodeType',
        'cNoticeEstimatedContractValue',
        'currencyCode',
        'publicationDate',
        'contractingAuthorityName',
        'cpvCodeName',
        'cNoticeNo',
        'cNoticePublicationDate',
        'cNoticeTitle',
        'organizationName',
    ]
    # Non-inplace drop returns a new frame, leaving the caller's data intact.
    data = data.drop(columns=irrelevant_columns)

    # Split once; reindex guarantees that both parts exist as columns even
    # when no code contains a hyphen (the second part is then all-NaN and
    # simply yields no subcategory indicator columns).
    cpv_parts = data['cpvCode'].str.split('-', expand=True).reindex(columns=[0, 1])

    # One-hot encode the category and subcategory parts separately.
    cpvCategory_dummies = pd.get_dummies(cpv_parts[0], prefix='cpvCategory')
    cpvSubcategory_dummies = pd.get_dummies(cpv_parts[1], prefix='cpvSubcategory')

    # Replace the raw cpvCode column with its one-hot encoded parts.
    data = pd.concat(
        [data.drop(columns=['cpvCode']), cpvCategory_dummies, cpvSubcategory_dummies],
        axis=1,
    )

    # NOTE(review): X still contains the target column 'organizationId';
    # confirm this is intended before fitting a model on X, otherwise use
    # data.drop('organizationId', axis=1) here.
    X = data
    y = data['organizationId']

    return X, y


In [None]:
import pandas as pd


def process_data(data):
    """Turn the raw notice table into a feature frame and a target series.

    The function drops the columns that are not used as features, derives
    several string columns from ``cpvCode`` and one-hot encodes the full
    ``cpvCode`` value:

    * ``cpv``            - the part of ``cpvCode`` before the ``'-'``
    * ``cpvCheckDigit``  - the part of ``cpvCode`` after the ``'-'``
    * ``cpvDivision``    - first 2 characters of ``cpv``
    * ``cpvGroup``       - first 3 characters of ``cpv``
    * ``cpvClass``       - first 4 characters of ``cpv``
    * ``cpvCategory``    - first 5 characters of ``cpv``

    The input frame is never modified: every step works on a new frame, so
    the function can safely be called again on the same input (for example
    when a notebook cell is re-run).

    Args:
        data: DataFrame containing every column named in the drop list
            below, plus ``cpvCode`` (strings) and ``organizationId``.

    Returns:
        A tuple ``(X, y)`` where ``X`` holds the remaining columns, the
        derived cpv columns and the ``cpvCode_*`` indicator columns (without
        ``cpvCode`` and ``organizationId``), and ``y`` is the
        ``organizationId`` column.

    Raises:
        KeyError: If one of the expected columns is missing from ``data``.
    """
    irrelevant_columns = [
        'noticeNo',
        'title',
        'countyCode',
        'currencyCode',
        'publicationDate',
        'contractingAuthorityName',
        'cpvCodeName',
        'cNoticeNo',
        'cNoticePublicationDate',
        'cNoticeTitle',
        'organizationName',
    ]
    # Non-inplace drop returns a new frame, leaving the caller's data intact.
    data = data.drop(columns=irrelevant_columns)

    # Split once; reindex guarantees that both parts exist as columns even
    # when no code contains a hyphen (the check digit is then all-NaN).
    cpv_parts = data['cpvCode'].str.split('-', expand=True).reindex(columns=[0, 1])
    cpv = cpv_parts[0]

    # NOTE(review): the derived columns below stay in X as raw strings; only
    # the full cpvCode is one-hot encoded. Confirm this is what the model
    # consuming X expects.
    data = data.assign(
        cpv=cpv,
        cpvCheckDigit=cpv_parts[1],
        cpvDivision=cpv.str.slice(stop=2),
        cpvGroup=cpv.str.slice(stop=3),
        cpvClass=cpv.str.slice(stop=4),
        cpvCategory=cpv.str.slice(stop=5),
    )

    # One-hot encode the full cpvCode and replace the raw column with it.
    cpvCode_dummies = pd.get_dummies(data['cpvCode'], prefix="cpvCode")
    data = pd.concat([data, cpvCode_dummies], axis=1).drop(columns=['cpvCode'])

    # Split the data into features (X) and target (y).
    X = data.drop('organizationId', axis=1)
    y = data['organizationId']

    return X, y