In [None]:
# Import all libraries and modules, and define the variables and objects needed to run the notebook

import pandas as pd
import numpy as np

# For transferring data files to/from Google Colab or Google Drive
import gdown
import shutil

# Load progress indicator tools
!pip install tqdm
from tqdm import tqdm

# For cleaning text (stopwords, CamelCase, etc.) and converting object (string) columns to numeric
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
# NOTE: the next line rebinds the name `stopwords` from the imported NLTK corpus reader
# to a plain Python list of English stopwords; the rest of the notebook uses the list.
stopwords = nltk.corpus.stopwords.words('english')
stopwords.append('foam')  # extra stopword added on top of the NLTK English list

# For label encoding
from sklearn.preprocessing import LabelEncoder

# For vectorization with spaCy
!pip install spacy
!python -m spacy download en_core_web_lg
import spacy
nlp = spacy.load('en_core_web_lg') # pre-trained pipeline used for vectorization

# For vectorization with BERT
# NOTE(review): `tokenizer` and `model` are not referenced in the visible cells of this notebook.
from transformers import BertModel, BertTokenizer
import torch
# Load pre-trained model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

##**1. Load and read the initial CSV file with data**

In [None]:
# Optional: mount Google Drive when it is used as storage for data and results.
# After mounting, the drive is reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Download the data file into the Colab working directory
# ('...' is presumably a placeholder for the Google Drive file id)
file_id = '...'
gdown.download(id=file_id)

# Read the CSV file into a pandas DataFrame
# ('...' is presumably a placeholder for the path of the downloaded file).
# keep_default_na=False: pandas' default NA markers (e.g. empty fields) are NOT converted to NaN
# while parsing, so such fields stay as strings in df0.
df_path = '...'
df0 = pd.read_csv(df_path, delimiter = ",", keep_default_na=False)
df0

##**2. Split the main dataframe to train, validation and test dataframes**

In [None]:
# a) Convert _TRANSDATE to a datetime column (in place on df0) so that the date-based
#    train/validation/test split in step b) compares timestamps rather than raw text
df0['_TRANSDATE'] = pd.to_datetime(df0['_TRANSDATE'])

In [None]:
# b) Split the main dataframe (df0) into 3 parts by calendar day - training (from 2023-06-01 to 2023-06-17),
# validation (from 2023-06-18 to 2023-06-24) and testing (2023-06-25 to 2023-06-30)

date1 = '2023-06-17'  # last day of the training period (inclusive)
date2 = '2023-06-24'  # last day of the validation period (inclusive)

# Compare on the calendar day only: a bare date string is interpreted as midnight, so comparing
# the raw timestamps would push e.g. '2023-06-17 10:00' out of the training period even though
# 2023-06-17 is documented as part of it. normalize() sets every timestamp to 00:00:00.
trans_day = df0['_TRANSDATE'].dt.normalize()

# .copy() makes every split an independent frame, so the in-place column assignments and
# row drops performed later in the notebook do not trigger SettingWithCopyWarning.
df1_train = df0.loc[trans_day <= date1].copy()
df1_val = df0.loc[(trans_day > date1) & (trans_day <= date2)].copy()
df1_test = df0.loc[trans_day > date2].copy()

# c) save the splits as csv for further processing on demand
df1_train.to_csv('df1_train.csv', index=False)
df1_val.to_csv('df1_val.csv', index=False)
df1_test.to_csv('df1_test.csv', index=False)

In [None]:
# Sanity check of the split: number of rows per _TRANSDATE value in each frame, in date order
for split_header, split_df in (
    ('df1_train:\n', df1_train),
    ('df1_val:\n', df1_val),
    ('df1_test:\n', df1_test),
):
    rows_per_date = split_df['_TRANSDATE'].value_counts().sort_index()
    print(split_header, rows_per_date)

##**3.1 Functions for preprocessing: language detection and dropping NaN for some variables**

In [None]:
def drop_nan(df, list_variables_dna):
  """Drop, in place, every row of ``df`` that has NaN in any of the given columns.

  Args:
    df: pandas DataFrame to filter. It is modified in place.
    list_variables_dna: list of column names that must not contain NaN.

  Returns:
    The same ``df`` object, after the rows were removed. (Returning the result of
    ``dropna(..., inplace=True)`` directly would always yield None.)

  Raises:
    KeyError: if a name in ``list_variables_dna`` is not a column of ``df``.
  """
  df.dropna(subset=list_variables_dna, inplace=True)
  return df

In [None]:
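# Any language detection library can be used to determine the language of item names.
# Define `lang_detector(item_name)` in this cell: step 4.3.1 b) applies it to every ITEM_NAME to fill the ENG column.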

##**3.2 Functions for preprocessing: ITEM_NAME cleaning function**

In [None]:
# Cleaning function: removes non-alphanumeric symbols, splits CamelCase, and deletes the words found in the stopwords list

def clean_ITEM(df, stopwords):
    """Clean the ``ITEM_NAME`` column of ``df`` in place.

    Every item name goes through these steps:
      1. each character other than A-Z, a-z, 0-9 is replaced by a space;
      2. digits that directly follow a lowercase letter are split off ("item12" -> "item 12");
      3. a letter that directly follows digits is split off ("3kg" -> "3 kg");
      4. CamelCase is split ("HTMLParser" -> "HTML Parser");
      5. words whose lowercase form is in ``stopwords`` are removed and the remaining
         words are re-joined with single spaces.

    Args:
        df: pandas DataFrame with a string column ``ITEM_NAME``. Modified in place.
        stopwords: iterable of stopwords; words are lowercased before the lookup,
            the stopwords themselves are used as given.

    Returns:
        The same ``df`` with the cleaned column, or the string
        ``'No ITEM_NAME column found'`` when the column is absent.
    """
    # Guard clause: nothing to clean without the column (string return kept for callers)
    if 'ITEM_NAME' not in df.columns:
        return 'No ITEM_NAME column found'

    # Build the lookup set once; testing membership in a list would rescan the whole
    # stopword list for every word of every row.
    stopword_set = frozenset(stopwords)

    def symbol_remover_camel_case_split(phrase):
        # Remove all characters besides A-Za-z0-9
        phrase = re.sub("[^A-Za-z0-9]", " ", phrase)
        # Separate numbers from words at the end
        phrase = re.sub('([a-z])([0-9]+)', r'\1 \2', phrase)
        # Separate numbers from words at the beginning
        phrase = re.sub('([0-9]+)([a-zA-Z])', r'\1 \2', phrase)
        # Split camel case: first put a space before each run of capitals, then before
        # each Capitalized word, so that "HTMLParser" becomes " HTML Parser"
        phrase = re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', phrase))
        return phrase

    def remove_stopwords(item_name):
        # split() also collapses the extra spaces introduced by the substitutions above
        kept_words = [word for word in item_name.split() if word.lower() not in stopword_set]
        return ' '.join(kept_words)

    # Apply symbol removal and camel case splitting
    df['ITEM_NAME'] = df['ITEM_NAME'].apply(symbol_remover_camel_case_split)
    # Apply stopwords removal
    df['ITEM_NAME'] = df['ITEM_NAME'].apply(remove_stopwords)

    return df

##3.3 Function to change the obj variables to numeric

In [None]:
# To change type of variable to numeric

def convert_to_float(value):
    """Return the first number found in the text form of ``value`` as a float.

    ``value`` is converted with ``str()`` and searched for the first run of digits,
    optionally followed by a dot and more digits. A sign or thousands separator is
    not part of the match ("-4.2" gives 4.2, "1,234.5" gives 1.0).

    Returns:
        The matched number as ``float``, or ``None`` when the text contains no digit.
    """
    found = re.search(r'\d+(\.\d+)?', str(value))
    return float(found.group()) if found else None

In [None]:
##3.4 Function to create labels on selection

In [None]:
# The function to create labels on selection

def labels_create(conditions, choices):
    """Build a label array from parallel lists of boolean masks and label values.

    Args:
        conditions: list of boolean arrays/Series, one per label.
        choices: list of label values, in the same order as ``conditions``.

    Returns:
        numpy array holding, for each position, the choice of the first condition
        that is True there, or NaN where no condition is True.
    """
    labels = np.select(condlist=conditions, choicelist=choices, default=np.nan)
    return labels

##**3.5 Function for embeddings: ITEM_NAME vectorization with spaCy**

In [None]:
# The function to transform ITEM_NAME to vector with Spacy with batches
def batches_to_vectors_spacy(phrases, batch_size):
    """Turn a sequence of phrases into spaCy document vectors, batch by batch.

    The phrases are sliced into consecutive chunks of ``batch_size`` items, each chunk
    is run through the module-level ``nlp`` pipeline, and ``doc.vector`` of every
    resulting document is collected. A tqdm progress bar counts the chunks.

    Args:
        phrases: sliceable sequence of texts (e.g. a pandas Series of item names).
        batch_size: number of phrases per chunk.

    Returns:
        numpy array built from the collected document vectors, in input order.
    """
    collected = []
    chunk_starts = range(0, len(phrases), batch_size)
    for start in tqdm(chunk_starts):
        chunk = phrases[start:start + batch_size]
        for doc in nlp.pipe(chunk):
            collected.append(doc.vector)
    return np.array(collected)

##**4. Preprocessing, embeddings, and labels**

In [None]:
# 4.3.1 a) Dropping NaN in Price and Quantity (it can be also done before training with matching masks)
list_variables_dna = ['PRICE_num', 'QUANTITY_num']

# PRICE_num / QUANTITY_num are only created in step 4.3.3 a), which runs after this cell, so on a
# fresh kernel dropna() would fail with a KeyError. Derive any missing numeric column here first,
# with the same conversion that step 4.3.3 a) uses (convert_to_float, then float32).
numeric_source_columns = {'PRICE_num': 'PRICE', 'QUANTITY_num': 'QUANTITY'}

for split_df in (df1_train, df1_val, df1_test):
    for numeric_col, raw_col in numeric_source_columns.items():
        if numeric_col not in split_df.columns:
            split_df[numeric_col] = split_df[raw_col].apply(convert_to_float).astype(np.float32)
    # drop_nan removes the rows in place, so the three frames are updated without reassignment
    drop_nan(split_df, list_variables_dna)

In [None]:
# 4.3.1 b) Language detection: store the result of lang_detector for every ITEM_NAME in column ENG.
# NOTE(review): `lang_detector` is not defined in the visible part of this notebook; it has to be
# defined (see section 3.1) before this cell is run.

tqdm.pandas()  # enables .progress_apply on pandas objects
for split_df in (df1_train, df1_val, df1_test):
    split_df['ENG'] = split_df['ITEM_NAME'].progress_apply(lang_detector)

In [None]:
# 4.3.2 Clean ITEM_NAME in the train, validation and test frames
# (clean_ITEM edits the column in place and returns the frame it was given)

df1_train, df1_val, df1_test = (
    clean_ITEM(split_df, stopwords)
    for split_df in (df1_train, df1_val, df1_test)
)

In [None]:
# 4.3.3 a) Convert PRICE and QUANTITY to numeric (float32) columns PRICE_num and QUANTITY_num

numeric_targets = (('PRICE', 'PRICE_num'), ('QUANTITY', 'QUANTITY_num'))

for split_df in (df1_train, df1_val, df1_test):
    for raw_col, numeric_col in numeric_targets:
        # values without any digit come back as None and end up as NaN after the cast
        parsed = split_df[raw_col].apply(convert_to_float)
        split_df[numeric_col] = parsed.astype(np.float32)

In [None]:
# 4.3.3 b) Encode HASHED_SITEID as integer codes in a new column SITEID_num

# Stack HASHED_SITEID of all three frames so that one encoder sees every site id
siteid_cc = pd.concat(
    [df1_train['HASHED_SITEID'], df1_val['HASHED_SITEID'], df1_test['HASHED_SITEID']],
    axis=0,
)

# One LabelEncoder fitted on the stacked values gives identical codes across the frames
labeler = LabelEncoder()
siteid_lab = labeler.fit_transform(siteid_cc)

# Cut the encoded array back into train / val / test by position
train_end = len(df1_train)
val_end = train_end + len(df1_val)
df1_train['SITEID_num'] = siteid_lab[:train_end]
df1_val['SITEID_num'] = siteid_lab[train_end:val_end]
df1_test['SITEID_num'] = siteid_lab[val_end:]

**4.3.4 Embedding of item names**

In [None]:
# Embedding of ITEM_NAME with spaCy.
# Each frame is embedded and saved right away, so a finished .npy file already exists
# if a later frame fails.

batch_size = 1000  # number of item names passed to batches_to_vectors_spacy per batch

train_embed = batches_to_vectors_spacy(df1_train['ITEM_NAME'], batch_size)
np.save('train_embed.npy', train_embed) # saved in case the embeddings are reused later

val_embed = batches_to_vectors_spacy(df1_val['ITEM_NAME'], batch_size)
np.save('val_embed.npy', val_embed) # saved in case the embeddings are reused later

test_embed = batches_to_vectors_spacy(df1_test['ITEM_NAME'], batch_size)
np.save('test_embed.npy', test_embed) # saved in case the embeddings are reused later


**4.3.5 Create labels: non-fraud (-1), regular fraud (1), fraud rings (2)**

In [None]:
# 4.3.5 a) Labels for df1_train

# Numeric form of HASHED_FRAUDRINGNAME: NaN wherever convert_to_float finds no number in the value
train_ring_num = df1_train['HASHED_FRAUDRINGNAME'].apply(convert_to_float).astype(np.float32)
df1_train['HASHED_FRAUDRINGNAME_num'] = train_ring_num

train_is_fraud = df1_train['IF_FRAUD']

# Masks, in priority order (labels_create takes the first mask that is True for a row)
conditions_1 = [
    train_is_fraud == False,
    (train_is_fraud == True) & train_ring_num.isna(),
    train_ring_num.notna(),
]

# Label values matching the masks: non-fraud, regular fraud, fraud ring
choices_1 = [-1, 1, 2]

# Rows matching none of the masks get NaN
df1_train['Label1'] = labels_create(conditions_1, choices_1)

# save results if needed


In [None]:
# 4.3.5 b) Labels for df1_val

# Numeric form of HASHED_FRAUDRINGNAME: NaN wherever convert_to_float finds no number in the value
val_ring_num = df1_val['HASHED_FRAUDRINGNAME'].apply(convert_to_float).astype(np.float32)
df1_val['HASHED_FRAUDRINGNAME_num'] = val_ring_num

val_is_fraud = df1_val['IF_FRAUD']

# Masks, in priority order (labels_create takes the first mask that is True for a row)
conditions_1 = [
    val_is_fraud == False,
    (val_is_fraud == True) & val_ring_num.isna(),
    val_ring_num.notna(),
]

# Label values matching the masks: non-fraud, regular fraud, fraud ring
choices_1 = [-1, 1, 2]

# Rows matching none of the masks get NaN
df1_val['Label1'] = labels_create(conditions_1, choices_1)

# save results if needed


In [None]:
# 4.3.5 c) Labels for df1_test

# Numeric form of HASHED_FRAUDRINGNAME: NaN wherever convert_to_float finds no number in the value
test_ring_num = df1_test['HASHED_FRAUDRINGNAME'].apply(convert_to_float).astype(np.float32)
df1_test['HASHED_FRAUDRINGNAME_num'] = test_ring_num

test_is_fraud = df1_test['IF_FRAUD']

# Masks, in priority order (labels_create takes the first mask that is True for a row)
conditions_1 = [
    test_is_fraud == False,
    (test_is_fraud == True) & test_ring_num.isna(),
    test_ring_num.notna(),
]

# Label values matching the masks: non-fraud, regular fraud, fraud ring
choices_1 = [-1, 1, 2]

# Rows matching none of the masks get NaN
df1_test['Label1'] = labels_create(conditions_1, choices_1)

# save results if needed


All preprocessing results are usually saved as 3 CSV and 3 NPY files for further work. Google Drive can be used as storage, and code like the following can be used to copy files from Colab to Google Drive.

In [None]:
# Copy a result file from the Colab working directory to its destination
# (the path strings are presumably placeholders to be filled in with real paths)
source_path = '...'
destination_path = '....'
shutil.copy(source_path, destination_path)

# Same pattern for the next file; repeat once per file that has to be copied
source_path = '...'
destination_path = '...'
shutil.copy(source_path, destination_path)