In [1]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder

In [2]:
class CustomLabelEncoder(LabelEncoder):
    """LabelEncoder that fits itself on construction and adds lookup helpers.

    Parameters
    ----------
    labels
        Label values handed to ``fit`` inside ``__init__``.

    Attributes
    ----------
    labels
        The object passed to the constructor, kept under the same name so
        scikit-learn's parameter introspection can read it back.
    n_classes : int
        ``len(self.classes_)`` as measured right after the fit in ``__init__``.
    """

    def __init__(self, labels):
        # Zero-argument super() walks the full MRO; super(LabelEncoder, self)
        # started the lookup *after* LabelEncoder and so skipped that level.
        super().__init__()
        # scikit-learn reads constructor parameters back from same-named
        # attributes (get_params, and the repr/clone built on top of it).
        self.labels = labels
        self.fit(labels)
        self.n_classes = len(self.classes_)

    def int2label(self, int_array):
        """Map integer ids back to labels via ``inverse_transform``."""
        return self.inverse_transform(int_array)

    def label2int(self, str_array):
        """Map labels to integer ids via ``transform``."""
        return self.transform(str_array)

    def get_int2label_dict(self):
        """Return ``{integer id: label}`` built from ``classes_``."""
        # Enumerating classes_ directly keeps the mapping correct even if the
        # encoder is re-fitted after construction and n_classes is stale.
        return dict(enumerate(self.classes_))

    def get_label2int_dict(self):
        """Return ``{label: integer id}`` built from ``classes_``."""
        return {label: idx for idx, label in enumerate(self.classes_)}

In [3]:
def encode_labels(
    label_encoder: "CustomLabelEncoder",
    df: pd.DataFrame,
    label_col: str) -> pd.DataFrame:
    """
    Return a copy of ``df`` whose ``label_col`` holds integer labels.

    Parameters
    ----------
    label_encoder
        Object exposing ``label2int``; it is called with ``df[label_col]``.
    df
        Input frame. It is left untouched, so the calling cell can be
        re-run without feeding already-encoded integers back to the encoder.
    label_col
        Name of the column to encode. A missing column raises ``KeyError``.

    Returns
    -------
    pd.DataFrame
        New frame with the same columns, ``label_col`` replaced by the
        encoder's output.
    """
    encoded = label_encoder.label2int(df[label_col])
    # assign() builds a new frame instead of overwriting the caller's column.
    return df.assign(**{label_col: encoded})

In [4]:
# Name of the DataFrame column that holds the class labels to encode.
LABEL_COL = 'category'

In [5]:
# Load bbc-text.csv (path is relative to the working directory) and preview
# the first rows.
df = pd.read_csv('bbc-text.csv')
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [6]:
# Constructing the encoder also fits it on the label column.
label_encoder = CustomLabelEncoder(df[LABEL_COL])

In [7]:
# Swap the string categories for their integer ids and preview the result.
df = encode_labels(label_encoder=label_encoder, df=df, label_col=LABEL_COL)
df.head()

Unnamed: 0,category,text
0,4,tv future in the hands of viewers with home th...
1,0,worldcom boss left books alone former worldc...
2,3,tigers wary of farrell gamble leicester say ...
3,3,yeading face newcastle in fa cup premiership s...
4,1,ocean s twelve raids box office ocean s twelve...


In [8]:
# Number of distinct labels the encoder found when it was constructed.
label_encoder.n_classes

5

In [9]:
# Lookup table: integer id -> label.
label_encoder.get_int2label_dict()

{0: 'business', 1: 'entertainment', 2: 'politics', 3: 'sport', 4: 'tech'}

In [10]:
# Lookup table: label -> integer id.
label_encoder.get_label2int_dict()

{'business': 0, 'entertainment': 1, 'politics': 2, 'sport': 3, 'tech': 4}