In [54]:
import numpy as np 
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score

from imblearn.over_sampling import SMOTE

In [4]:
# Notebook-wide switches, both handed to Mode_selection further down:
#   SAMPLE_NUM - number of rows requested when subsampling
#   TRAIN_MODE - when True, the loaded frame is subsampled
SAMPLE_NUM = 7_000
TRAIN_MODE = True

In [5]:
# Load the preprocessed company table from the Kaggle input directory.
mapped_df = pd.read_csv(
    '/kaggle/input/preprocessedcompany/mapped_company',
)

In [48]:
# Print the frame's index range, column names, dtypes and memory usage.
mapped_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18272251 entries, 0 to 18272250
Data columns (total 6 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   Unnamed: 0      int64 
 1   NAME            object
 2   INDUSTRY        object
 3   SIZE            object
 4   INDUSTRY_GROUP  object
 5   NAME_CLEANED    object
dtypes: int64(1), object(5)
memory usage: 836.4+ MB


In [6]:
def Mode_selection(data, train_mode = None, sampling_num = None):
    """Return a reproducible random subsample in train mode, else the input.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to (optionally) subsample.
    train_mode : bool, optional
        When truthy, ``data`` is subsampled; otherwise it is returned as-is.
    sampling_num : int, optional
        Number of rows to draw in train mode. Values larger than
        ``len(data)`` are capped at ``len(data)`` instead of raising.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself when ``train_mode`` is falsy, otherwise a sample
        drawn without replacement using ``random_state=42``.

    Raises
    ------
    ValueError
        If ``train_mode`` is truthy and ``sampling_num`` is ``None``
        (pandas would otherwise silently fall back to a single row).
    """
    if not train_mode:
        return data

    if sampling_num is None:
        raise ValueError("sampling_num must be provided when train_mode is enabled")

    # Sampling without replacement cannot exceed the population size.
    n_rows = min(sampling_num, len(data))
    return data.sample(n=n_rows, random_state=42)

In [7]:
# Working frame: subsampled or full, depending on the TRAIN_MODE switch.
df = Mode_selection(
    mapped_df,
    train_mode=TRAIN_MODE,
    sampling_num=SAMPLE_NUM,
)

In [8]:
# Step 1: TF-IDF features from NAME_CLEANED.
# Missing names are replaced by empty strings, the vocabulary is capped at
# 1000 terms, and the sparse result is densified into a NumPy array.
name_corpus = df['NAME_CLEANED'].fillna('')
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(name_corpus).toarray()

In [9]:
# Step 2: integer-encode the target column (INDUSTRY_GROUP).
industry_labels = df['INDUSTRY_GROUP']
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(industry_labels)

In [10]:
# Step 3: stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [11]:
# Oversample the training split only; the test split is left untouched.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(
    X_train,
    y_train,
)

In [18]:
import cupy as cp

# Copy the resampled training data and the test data into CuPy arrays,
# in the same order as the names on the left-hand side.
X_resampled_gpu, y_resampled_gpu, X_test_gpu, y_test_gpu = (
    cp.array(host_array)
    for host_array in (X_resampled, y_resampled, X_test, y_test)
)

## Trial_1: Default model

In [45]:
# Step 4: XGBoost classifier with no tuned hyperparameters, trained on CUDA.
xgb_model = XGBClassifier(
    objective='multi:softmax',
    eval_metric=['mlogloss', 'merror'],
    device='cuda',
    random_state=42,
)

# The test split is supplied as eval_set so both metrics are logged per round.
xgb_model.fit(
    X_resampled_gpu,
    y_resampled_gpu,
    eval_set=[(X_test_gpu, y_test_gpu)],
)

[0]	validation_0-mlogloss:2.66082	validation_0-merror:0.82037
[1]	validation_0-mlogloss:2.60146	validation_0-merror:0.78563
[2]	validation_0-mlogloss:2.56612	validation_0-merror:0.77212
[3]	validation_0-mlogloss:2.53978	validation_0-merror:0.76225
[4]	validation_0-mlogloss:2.52011	validation_0-merror:0.75575
[5]	validation_0-mlogloss:2.50385	validation_0-merror:0.74875
[6]	validation_0-mlogloss:2.49210	validation_0-merror:0.74513
[7]	validation_0-mlogloss:2.47839	validation_0-merror:0.74138
[8]	validation_0-mlogloss:2.46907	validation_0-merror:0.74062
[9]	validation_0-mlogloss:2.46060	validation_0-merror:0.73562
[10]	validation_0-mlogloss:2.45298	validation_0-merror:0.73313
[11]	validation_0-mlogloss:2.44622	validation_0-merror:0.73313
[12]	validation_0-mlogloss:2.43992	validation_0-merror:0.73200
[13]	validation_0-mlogloss:2.43396	validation_0-merror:0.72937
[14]	validation_0-mlogloss:2.42752	validation_0-merror:0.72650
[15]	validation_0-mlogloss:2.42402	validation_0-merror:0.72588
[1

In [46]:
# Evaluate the default model on the held-out split.
# accuracy_score / f1_score come from the notebook's top import cell, so they
# are not re-imported here.

# Predict from the GPU-resident copy of the test features so the input sits on
# the same device as the CUDA-trained booster (the same pattern is used for
# the Optuna-tuned model's evaluation).
y_pred = xgb_model.predict(X_test_gpu)

# Metrics are computed against the host-side labels.
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')  # support-weighted mean of per-class F1

print("============Default Model============")
print(f"Accuracy: {acc}")
print(f"F1 Score: {f1}")

Accuracy: 0.27975
F1 Score: 0.30600562219551103


In [47]:
# Per-class precision / recall / F1, labelled with the encoder's class names.
print('Classification report:')
class_names = label_encoder.classes_
print(classification_report(y_test, y_pred, target_names=class_names))

Classification report:
                                      precision    recall  f1-score   support

                   Business Services       0.34      0.23      0.27       472
          Construction & Real Estate       0.70      0.30      0.42       913
                Education & Training       0.42      0.32      0.36       337
                              Energy       0.23      0.23      0.23       115
Environmental Services & Agriculture       0.01      0.02      0.01        55
                             Finance       0.34      0.31      0.32       221
                  Food & Hospitality       0.61      0.36      0.45       627
             Government & Non-Profit       0.23      0.22      0.23       258
               Healthcare & Wellness       0.59      0.45      0.51       504
                               Legal       0.55      0.46      0.50       145
         Manufacturing & Engineering       0.29      0.21      0.24       287
         Media, Entertainment & Arts    

## Trial_2: GridSearch

In [13]:
from sklearn.model_selection import GridSearchCV

In [14]:
# Base estimator handed to the grid search; only the searched
# hyperparameters vary between candidates.
xgb_model_2 = XGBClassifier(
    objective='multi:softmax',
    eval_metric=['mlogloss', 'merror'],
    device='cuda',
    random_state=42,
)

In [None]:
# Define the hyperparameter grid.
# `subsample` is the fraction of training rows drawn for each tree and must
# lie in (0, 1]; XGBoost rejects values above 1, so only valid fractions are
# listed here.
param_grid = {
    'max_depth': [30, 50, 100],
    'learning_rate': [0.1, 0.01],
    'subsample': [0.6, 0.8, 1.0]
}


# Exhaustive search over the grid with 5-fold cross-validation, scored on accuracy.
grid_search = GridSearchCV(xgb_model_2, param_grid, cv=5, scoring='accuracy')

# NOTE(review): X_resampled was oversampled with SMOTE before the folds are
# cut, so validation folds can contain points synthesised from training-fold
# rows and the CV score is likely optimistic; consider resampling inside each
# fold instead.
grid_search.fit(X_resampled, y_resampled)

# Report the best candidate and its mean cross-validated accuracy.
print("Best set of hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

In [None]:
# Hand-copied grid-search results kept as a plain string for reference.
# NOTE(review): presumably from three earlier runs with different max_depth
# grids (7 is not in the current grid) — confirm before relying on them.
"""
Best set of hyperparameters:  {'learning_rate': 0.1, 'max_depth': 7, 'subsample': 1}
Best score:  0.4164925208368039

Best set of hyperparameters:  {'learning_rate': 0.1, 'max_depth': 30, 'subsample': 1}
Best score:  0.4326516468062371

Best set of hyperparameters:  {'learning_rate': 0.1, 'max_depth': 100, 'subsample': 1}
Best score:  0.4370304028942969
"""

## Trial_3: Optuna

In [47]:
import optuna

In [60]:
# 1. Define an objective function to be maximized.
def objective(trial):
    """Optuna objective: mean 4-fold CV accuracy of an XGBoost candidate.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw one value for each searched hyperparameter.

    Returns
    -------
    float
        Mean accuracy over 4 cross-validation folds on the resampled
        training data. Higher is better.
    """
    # 2. Suggest values of the hyperparameters using a trial object.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 2000, 3000),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'min_child_weight': trial.suggest_float('min_child_weight', 2, 4),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.2),
        'subsample': trial.suggest_float('subsample', 0.2, 1),
        'gamma': trial.suggest_float('gamma', 1e-4, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.2, 1),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.2, 1),
    }

    xgbmodel_optuna = XGBClassifier(
        **params,
        objective='multi:softmax',
        eval_metric=['mlogloss', 'merror'],
        device='cuda',
        random_state=42,
    )

    # cross_val_score clones and refits the estimator for every fold, so a
    # separate full fit beforehand would not influence the score; it would only
    # spend the study's time budget and expose the test split to the search.
    fold_scores = cross_val_score(
        xgbmodel_optuna, X_resampled, y_resampled, cv=4, scoring='accuracy'
    )
    return fold_scores.mean()
    

In [None]:
# 3. Create a study object and optimize the objective function.
# The objective returns mean cross-validated accuracy, so the study must
# maximize it; minimizing would steer the search toward the worst candidates.
study = optuna.create_study(direction='maximize')

# Stop after 100 trials or 1200 seconds, whichever comes first.
study.optimize(objective, n_trials=100, timeout=1200)

In [62]:
# Hard-coded hyperparameters for the third model.
# NOTE(review): presumably copied from the best trial of an earlier Optuna
# run — confirm the provenance.
params = dict(
    n_estimators=2744,
    max_depth=6,
    min_child_weight=3.9426060282299655,
    learning_rate=0.162900223477397,
    subsample=0.7744903608428784,
    gamma=0.014822453886960114,
    colsample_bytree=0.33619569069290856,
    colsample_bylevel=0.5940538354478573,
    colsample_bynode=0.6331592624322864,
)

In [66]:
# Step 4: XGBoost classifier configured with the hard-coded `params` above.
xgb_model_3 = XGBClassifier(
    **params,
    objective='multi:softmax',
    eval_metric=['mlogloss', 'merror'],
    device='cuda',
    random_state=42,
)

In [67]:
# Fit on the GPU arrays; metrics are logged for the training data
# (validation_0) and the test split (validation_1) after every round.
xgb_model_3.fit(
    X_resampled_gpu,
    y_resampled_gpu,
    eval_set=[
        (X_resampled_gpu, y_resampled_gpu),
        (X_test_gpu, y_test_gpu),
    ],
)

[0]	validation_0-mlogloss:2.78049	validation_0-merror:0.86898	validation_1-mlogloss:2.81371	validation_1-merror:0.89143
[1]	validation_0-mlogloss:2.74115	validation_0-merror:0.83050	validation_1-mlogloss:2.79269	validation_1-merror:0.72714
[2]	validation_0-mlogloss:2.71542	validation_0-merror:0.80702	validation_1-mlogloss:2.78083	validation_1-merror:0.88071
[3]	validation_0-mlogloss:2.67826	validation_0-merror:0.78951	validation_1-mlogloss:2.76753	validation_1-merror:0.72286
[4]	validation_0-mlogloss:2.65509	validation_0-merror:0.77563	validation_1-mlogloss:2.75219	validation_1-merror:0.86714
[5]	validation_0-mlogloss:2.62865	validation_0-merror:0.76750	validation_1-mlogloss:2.74085	validation_1-merror:0.86071
[6]	validation_0-mlogloss:2.60487	validation_0-merror:0.76273	validation_1-mlogloss:2.72829	validation_1-merror:0.85357
[7]	validation_0-mlogloss:2.58313	validation_0-merror:0.75711	validation_1-mlogloss:2.71911	validation_1-merror:0.85214
[8]	validation_0-mlogloss:2.56600	valida

In [68]:
# Evaluate the model built from the hard-coded tuned hyperparameters.
# Predictions are made from the GPU-resident test features.
y_pred = xgb_model_3.predict(X_test_gpu)

# Metrics are computed against the host-side labels.
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')  # support-weighted mean of per-class F1

# The banner names this trial so its scores are not mistaken for Trial 1's.
print("============Optuna-tuned Model============")
print(f"Accuracy: {acc}")
print(f"F1 Score: {f1}")

Accuracy: 0.30428571428571427
F1 Score: 0.2637123932606561
