# Load Data

In [63]:
import pandas as pd

In [64]:
df = pd.read_csv("data/data_train.tsv", sep='\t', names=['text', 'label'] )
df

Unnamed: 0,text,label
0,mvbe,ID_681575024
1,sanax,ID_628496423
2,gas lighter,ID_765419922
3,stopper,ID_930030564
4,5 minute timer,ID_337225844
...,...,...
6128527,hat cooling pad,ID_746285313
6128528,miniature 120mb,ID_286749025
6128529,v belts 212 inches,ID_350489365
6128530,ee2602dfx16,ID_432431119


In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6128532 entries, 0 to 6128531
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   text    object
 1   label   object
dtypes: object(2)
memory usage: 93.5+ MB


In [66]:
df.label.nunique()

485

In [67]:
df.label.value_counts()

ID_699068447    283833
ID_946477099    240845
ID_459833412    191248
ID_40650829     126986
ID_699945089    124288
                 ...  
ID_911331170         3
ID_971210579         1
ID_223719075         1
ID_817600387         1
ID_82032374          1
Name: label, Length: 485, dtype: int64

In [68]:
df.label.isna().sum(), df.text.isna().sum()

(0, 4)

In [69]:
# Drop nan as count is very low

df = df.dropna()
df.shape

(6128528, 2)

## Text cleaning

In [70]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    # Remove special characters, whitespace, newline, and tab
    re.sub(r'\W+', ' ', text.lower())
    
    # remov dig
    #text = re.sub(r'\d+', '', text)
    
    # Tokenize text
    #tokens = word_tokenize(text)
    
    # Remove stopwords
    #stop_words = set(stopwords.words('english'))
    #tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatize tokens
    #lemmatizer = WordNetLemmatizer()
    #tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join tokens back into a single string
    #cleaned_text = ' '.join(tokens)
    
    return text

In [71]:
df.label = df.label.apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [72]:
# df_filtered = df.groupby('label').filter(lambda x: len(x) >= 100)
# df_filtered.shape, df_filtered.label.nunique()

# Create sample dataframes

In [73]:
def select_samples_per_label(df, label_column, n):
     #n: Number of samples to select per label
    selected_df = df.groupby(label_column).apply(lambda x: x.sample(n) if len(x) >= n else x)
    selected_df.reset_index(drop=True, inplace=True)
    return selected_df

In [74]:
df_100 = df.groupby('label').filter(lambda x: len(x) >= 100000).sample(100)
print(df_100.shape, df_100.label.nunique())
df_100.label.value_counts()

(100, 2) 8


id_946477099    20
id_699068447    19
id_699945089    13
id_459833412    11
id_40650829     11
id_776333775    10
id_337225844    10
id_835979401     6
Name: label, dtype: int64

In [75]:
df_1K = df.groupby('label').filter(lambda x: len(x) >= 100000).sample(1000)
print(df_1K.shape, df_1K.label.nunique())
df_1K.label.value_counts()

(1000, 2) 8


id_699068447    214
id_946477099    186
id_459833412    159
id_40650829     120
id_699945089     90
id_835979401     88
id_776333775     76
id_337225844     67
Name: label, dtype: int64

In [76]:
10000/df.label.nunique()

20.61855670103093

In [90]:
df_10K = df.groupby('label').filter(lambda x: len(x) >= 100000).sample(10000)

# df_10K= select_samples_per_label(df, 'label', 20)
print(df_10K.shape, df_10K.label.nunique())
df_10K.label.value_counts()

(10000, 2) 8


id_699068447    2205
id_946477099    1847
id_459833412    1476
id_699945089     975
id_40650829      932
id_835979401     876
id_337225844     874
id_776333775     815
Name: label, dtype: int64

In [79]:
1000000/df.label.nunique()

2061.855670103093

In [80]:
#df_1M = df.groupby('label').filter(lambda x: len(x) >= 100).sample(1000000)

df_1M = select_samples_per_label(df, 'label', 2850)
print(df_1M.shape, df_1M.label.nunique())
df_1M.label.value_counts()

(1012477, 2) 485


id_101371589    2850
id_579223708    2850
id_635210945    2850
id_633338916    2850
id_633057679    2850
                ... 
id_911331170       3
id_817600387       1
id_82032374        1
id_223719075       1
id_971210579       1
Name: label, Length: 485, dtype: int64

In [81]:
df_1M.shape[0]/df.shape[0]

0.16520720799513358

# Model Selection

## Using tfidf & LogisticRegression - Baseline Model

In [82]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import ClassifierChain
import lightgbm as lgb

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_1M["text"], df_1M["label"], test_size=0.2, random_state=42)

# TF-IDF vectorization
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Convert labels to binary format for multilabel classification
mlb = MultiLabelBinarizer()
y_train_binary = mlb.fit_transform(y_train)
y_test_binary = mlb.transform(y_test)

# Train the classifier chain
classifier = ClassifierChain(lgb.LGBMClassifier())
classifier.fit(X_train_tfidf, y_train_binary)

# Predict on the test set
y_pred = classifier.predict(X_test_tfidf)

# Model performance evaluation
accuracy = accuracy_score(y_test_binary, y_pred)
precision = precision_score(y_test_binary, y_pred, average="micro")
recall = recall_score(y_test_binary, y_pred, average="micro")
f1 = f1_score(y_test_binary, y_pred, average="micro")

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)



Accuracy: 0.2331354693426043
Precision: 0.8162960520367406
Recall: 0.8319869901070606
F1 Score: 0.8240668356170268


## Using LGBM & tfidf

In [83]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_1M["text"], df_1M["label"], test_size=0.2, random_state=42)

# TF-IDF vectorization
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Convert labels to binary format for multiclass classification
y_train = pd.factorize(y_train)[0]
y_test = pd.factorize(y_test)[0]

# Create LightGBM dataset
train_data = lgb.Dataset(X_train_tfidf, label=y_train)

# Set LightGBM parameters
params = {
    'objective': 'multiclass',
    'num_class': len(df_1M['label'].unique())
}

# Train the classifier
classifier = lgb.train(params, train_data)

# Predict on the test set
y_pred = classifier.predict(X_test_tfidf)
y_pred = y_pred.argmax(axis=1)  # Get the predicted class with highest probability

# Model performance evaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 196855
[LightGBM] [Info] Number of data points in the train set: 809981, number of used features: 11187
[LightGBM] [Info] Start training from score -5.857169
[LightGBM] [Info] Start training from score -5.872397
[LightGBM] [Info] Start training from score -5.891875
[LightGBM] [Info] Start training from score -5.854582
[LightGBM] [Info] Start training from score -5.864102
[LightGBM] [Info] Start training from score -5.890535
[LightGBM] [Info] Start training from score -5.867150
[LightGBM] [Info] Start training from score -5.873713
[LightGBM] [Info] Start training from score -5.874152
[LightGBM] [Info] Start training from score -6.493254
[LightGBM] [Info] Start training from score -6.152944
[LightGBM] [Info] Start training from score -6.745151
[LightGBM] [Info] Start training from score -5.868896
[LightGBM] [Info] Start training from score -5.859330
[LightGBM] [Info] Start training from score -6.283578

[LightGBM] [Info] Start training from score -5.855013
[LightGBM] [Info] Start training from score -7.761222
[LightGBM] [Info] Start training from score -6.202315
[LightGBM] [Info] Start training from score -6.795727
[LightGBM] [Info] Start training from score -6.922657
[LightGBM] [Info] Start training from score -6.504739
[LightGBM] [Info] Start training from score -6.307675
[LightGBM] [Info] Start training from score -5.868022
[LightGBM] [Info] Start training from score -5.871082
[LightGBM] [Info] Start training from score -5.885192
[LightGBM] [Info] Start training from score -5.865407
[LightGBM] [Info] Start training from score -5.867586
[LightGBM] [Info] Start training from score -5.888751
[LightGBM] [Info] Start training from score -6.149468
[LightGBM] [Info] Start training from score -5.864102
[LightGBM] [Info] Start training from score -7.858563
[LightGBM] [Info] Start training from score -6.649173
[LightGBM] [Info] Start training from score -5.870207
[LightGBM] [Info] Start trai









































































































































































































































































































































































































































































































































































































































































































































































































































  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.0025284450063211127
Precision: 6.6992792146526335e-06
Recall: 0.0025284450063211127
F1 Score: 1.3363151890709461e-05


## Using SentenceTransformer-LLM Embedding & LGBM classification

In [95]:
   
sentence_transformers_models = ['all-mpnet-base-v2', 'all-distilroberta-v1', 'all-MiniLM-L12-v2', 'all-MiniLM-L6-v2', 'multi-qa-mpnet-base-dot-v1',
         'multi-qa-distilbert-cos-v1', 'paraphrase-multilingual-mpnet-base-v2', 'paraphrase-albert-small-v2', 'multi-qa-MiniLM-L6-cos-v1',
         'paraphrase-multilingual-MiniLM-L12-v2', 'paraphrase-MiniLM-L3-v2', 'distiluse-base-multilingual-cased-v1',
         'distiluse-base-multilingual-cased-v2']

![image.png](attachment:image.png)

#https://www.sbert.net/docs/pretrained_models.html


In [96]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sentence_transformers import SentenceTransformer
import lightgbm as lgb
import joblib

def train_model(df, model_name, save_model_path):
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

    # Load a pre-trained Sentence Transformer model
    model = SentenceTransformer(model_name)

    # Convert text data to embeddings using the Sentence Transformer model
    X_train_embeddings = model.encode(X_train.tolist())
    X_test_embeddings = model.encode(X_test.tolist())

    # Train a LightGBM classifier on the embeddings
    classifier = lgb.LGBMClassifier()
    classifier.fit(X_train_embeddings, y_train)

    # Save the trained model
    joblib.dump(classifier, save_model_path)

    # Predict on the test set
    y_pred = classifier.predict(X_test_embeddings)

    # Evaluate the model performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    f1 = f1_score(y_test, y_pred, average='micro')

    return accuracy, precision, recall, f1

def load_and_predict(test_data, model_name, model_path):
    # Load the saved model
    classifier = joblib.load(model_path)

    # Load a pre-trained Sentence Transformer model
    #model_name = 'bert-base-nli-mean-tokens'  # Replace with the desired pre-trained model
    model = SentenceTransformer(model_name)

    # Convert text data to embeddings using the Sentence Transformer model
    test_embeddings = model.encode(test_data.tolist())

    # Predict using the loaded model
    y_pred = classifier.predict(test_embeddings)

    return y_pred


def run_lgbm(df, model_name, model_string):
    # Define the model name and save path
    #model_name = 'bert-base-nli-mean-tokens'  # Replace with the desired pre-trained model
    save_model_path = f'{model_name}_{model_string}.pkl'

    # Train the model
    accuracy, precision, recall, f1 = train_model(df, model_name, save_model_path)

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)

#     # Test data for prediction
#     test_data = df['text'].head(100).tolist()

#     # Load the saved model and make predictions
#     y_pred = load_and_predict(test_data, model_name, save_model_path)

#     print("Predictions:", y_pred)

    return accuracy, precision, recall, f1



In [97]:
import time
df_sent_trans_models = pd.DataFrame(columns=['Model', 'accuracy', 'precision', 'recall', 'f1', 'Time (min)'])

for model_name in sentence_transformers_models:
    print("Model: ", model_name)
    start_time = time.time()
    accuracy, precision, recall, f1 = run_lgbm(df_10K, model_name, model_string='lgbm_10k')
    end_time = time.time()
    time_taken = (end_time - start_time) / 60  # Convert time to minutes
    df_sent_trans_models = df_sent_trans_models.append({'Model': model_name, 'accuracy': accuracy, 
                                                        'precision': precision, 'recall': recall, 
                                                        'f1': f1, 'Time (min)': time_taken}, ignore_index=True)
    print('-' * 80)
    
df_sent_trans_models = df_sent_trans_models.set_index('Model')

df_sent_trans_models

Model:  all-mpnet-base-v2
Accuracy: 0.7475
Precision: 0.7475
Recall: 0.7475
F1 Score: 0.7475
--------------------------------------------------------------------------------
Model:  all-distilroberta-v1
Accuracy: 0.737
Precision: 0.737
Recall: 0.737
F1 Score: 0.737
--------------------------------------------------------------------------------
Model:  all-MiniLM-L12-v2
Accuracy: 0.7525
Precision: 0.7525
Recall: 0.7525
F1 Score: 0.7525
--------------------------------------------------------------------------------
Model:  all-MiniLM-L6-v2
Accuracy: 0.747
Precision: 0.747
Recall: 0.747
F1 Score: 0.747
--------------------------------------------------------------------------------
Model:  multi-qa-mpnet-base-dot-v1
Accuracy: 0.755
Precision: 0.755
Recall: 0.755
F1 Score: 0.755
--------------------------------------------------------------------------------
Model:  multi-qa-distilbert-cos-v1
Accuracy: 0.7635
Precision: 0.7635
Recall: 0.7635
F1 Score: 0.7635
-----------------------------

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/402 [00:00<?, ?B/s]

Accuracy: 0.736
Precision: 0.736
Recall: 0.736
F1 Score: 0.736
--------------------------------------------------------------------------------
Model:  paraphrase-albert-small-v2


Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.03k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/827 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/245 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/465 [00:00<?, ?B/s]

Accuracy: 0.725
Precision: 0.725
Recall: 0.725
F1 Score: 0.7250000000000001
--------------------------------------------------------------------------------
Model:  multi-qa-MiniLM-L6-cos-v1


Downloading:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Accuracy: 0.7385
Precision: 0.7385
Recall: 0.7385
F1 Score: 0.7385
--------------------------------------------------------------------------------
Model:  paraphrase-multilingual-MiniLM-L12-v2


Downloading:   0%|          | 0.00/968 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Accuracy: 0.705
Precision: 0.705
Recall: 0.705
F1 Score: 0.705
--------------------------------------------------------------------------------
Model:  paraphrase-MiniLM-L3-v2


Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.01k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Accuracy: 0.724
Precision: 0.724
Recall: 0.724
F1 Score: 0.724
--------------------------------------------------------------------------------
Model:  distiluse-base-multilingual-cased-v1


Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/556 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/539M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/452 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Accuracy: 0.6985
Precision: 0.6985
Recall: 0.6985
F1 Score: 0.6985
--------------------------------------------------------------------------------
Model:  distiluse-base-multilingual-cased-v2


Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/610 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/539M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/531 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Accuracy: 0.7095
Precision: 0.7095
Recall: 0.7095
F1 Score: 0.7095
--------------------------------------------------------------------------------


Unnamed: 0_level_0,accuracy,precision,recall,f1,Time (min)
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
all-mpnet-base-v2,0.7475,0.7475,0.7475,0.7475,3.382869
all-distilroberta-v1,0.737,0.737,0.737,0.737,1.895457
all-MiniLM-L12-v2,0.7525,0.7525,0.7525,0.7525,1.168197
all-MiniLM-L6-v2,0.747,0.747,0.747,0.747,0.656865
multi-qa-mpnet-base-dot-v1,0.755,0.755,0.755,0.755,3.380424
multi-qa-distilbert-cos-v1,0.7635,0.7635,0.7635,0.7635,1.910675
paraphrase-multilingual-mpnet-base-v2,0.736,0.736,0.736,0.736,7.530364
paraphrase-albert-small-v2,0.725,0.725,0.725,0.725,2.319291
multi-qa-MiniLM-L6-cos-v1,0.7385,0.7385,0.7385,0.7385,0.997075
paraphrase-multilingual-MiniLM-L12-v2,0.705,0.705,0.705,0.705,3.975916


In [99]:
import time
df_sent_trans_models_1m = pd.DataFrame(columns=['Model', 'accuracy', 'precision', 'recall', 'f1', 'Time (min)'])

for model_name in ['multi-qa-distilbert-cos-v1', 'multi-qa-mpnet-base-dot-v1', 'all-MiniLM-L12-v2']:
    print("Model: ", model_name)
    start_time = time.time()
    accuracy, precision, recall, f1 = run_lgbm(df_1M, model_name, model_string='lgbm_1M')
    end_time = time.time()
    time_taken = (end_time - start_time) / 60  # Convert time to minutes
    df_sent_trans_models_1m = df_sent_trans_models_1m.append({'Model': model_name, 'accuracy': accuracy, 
                                                        'precision': precision, 'recall': recall, 
                                                        'f1': f1, 'Time (min)': time_taken}, ignore_index=True)
    print('-' * 80)
    
df_sent_trans_models_1m = df_sent_trans_models_1m.set_index('Model')

df_sent_trans_models_1m

Model:  multi-qa-distilbert-cos-v1
Accuracy: 0.026844974715549936
Precision: 0.026844974715549936
Recall: 0.026844974715549936
F1 Score: 0.026844974715549936
--------------------------------------------------------------------------------
Model:  multi-qa-mpnet-base-dot-v1
Accuracy: 0.02002014854614412
Precision: 0.02002014854614412
Recall: 0.02002014854614412
F1 Score: 0.02002014854614412
--------------------------------------------------------------------------------
Model:  all-MiniLM-L12-v2


KeyboardInterrupt: 

## Using FastText

In [86]:
import time 
import pandas as pd
import numpy as np
import fasttext
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def save_fasttext_data(df, output_file):
    fasttext_data = df.apply(lambda row: '__label__' + row['label'] + ' ' + row['text'], axis=1)
    fasttext_data.to_csv(output_file, index=False, header=False)
    
def save_fasttext_data_util(df, model_string):
    # Save DataFrame in FastText format
    save_fasttext_data(df, f'{model_string}.txt')

    # Split the dataset into training and testing sets
    train_file = f'{model_string}_train.txt'
    test_file = f'{model_string}_test.txt'
    train_data, test_data = train_test_split(df, test_size=0.1, random_state=24)

    # Save training and testing data into separate files
    save_fasttext_data(train_data, train_file)
    save_fasttext_data(test_data, test_file)
    
    return train_data, test_data, train_file, test_file


def train_fasttext_model(train_file, output_model):
    model = fasttext.train_supervised(input=train_file)
    model.save_model(output_model)

def load_fasttext_model(model_file):
    return fasttext.load_model(model_file)

def predict_fasttext(test_data, model_string):
    # Load the trained model for prediction
    loaded_model = load_fasttext_model(f'{model_string}.bin')
    
    y_pred = [loaded_model.predict(text.strip().replace('\n', ''))[0][0].replace('__label__', '') for text in test_data.tolist()]
    return y_pred

def evaluate_classification(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='micro')
    recall = recall_score(y_true, y_pred, average='micro')
    f1 = f1_score(y_true, y_pred, average='micro')
    
    return accuracy, precision, recall, f1
    
    
def run(df, model_string):
    start_time = time.time()
    train_data, test_data, train_file, test_file = save_fasttext_data_util(df, model_string)

    # Train the FastText classifier
    train_fasttext_model(train_file, f'{model_string}.bin')

    # Predict on the test set
    y_pred = predict_fasttext(test_data['text'], model_string)

    # Model performance evaluation
    accuracy, precision, recall, f1= evaluate_classification(y_true= test_data['label'], y_pred=y_pred)
    
    end_time = time.time()
    time_taken = (end_time - start_time) / 60  # Convert time to minutes
    
    # Print evaluation metrics
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("Runtime: ", time_taken)

In [91]:
run(df_10K, model_string='fasttext_data_10k')

Accuracy: 0.779
Precision: 0.779
Recall: 0.779
F1 Score: 0.779
Runtime:  0.012664679686228435




In [88]:
run(df_1M, model_string='fasttext_data_1m')



Accuracy: 0.6668674936788875
Precision: 0.6668674936788875
Recall: 0.6668674936788875
F1 Score: 0.6668674936788875
Runtime:  2.8764015793800355


In [92]:
run(df, model_string='fasttext_data_all')



Accuracy: 0.7209722396724826
Precision: 0.7209722396724826
Recall: 0.7209722396724826
F1 Score: 0.7209722396724826
Runtime:  19.804026015599568


FastText is a popular text classification algorithm developed by Facebook AI Research. It is known for its efficiency in handling large-scale text datasets and its ability to capture subword information, which makes it effective for scenarios where the vocabulary includes rare or out-of-vocabulary words.

FastText utilizes a bag-of-words approach and character n-grams to represent text features. The architecture consists of the following key components:

1. **Input Representation**: FastText represents each word as a bag of character n-grams. These n-grams are used to capture subword information and handle out-of-vocabulary words. By default, FastText considers 3-gram character sequences.

2. **Embedding Layer**: FastText assigns a dense vector representation (embedding) to each word and character n-gram. This allows the model to capture semantic and syntactic information.

3. **Hidden Layer**: FastText uses a single hidden layer neural network. The hidden layer is fully connected and computes the weighted sum of the word and n-gram embeddings.

4. **Output Layer**: FastText employs a softmax activation function at the output layer. The output layer calculates the probabilities for each predefined class or label in the classification task. It supports both single-label and multi-label classification.

5. **Loss Function**: FastText uses the negative log likelihood loss (also known as the cross-entropy loss) to train the model. It optimizes the parameters to minimize the difference between predicted and actual label probabilities.

The key features of FastText are:

1. **Efficiency**: FastText is designed to be computationally efficient, making it suitable for large-scale text datasets. It employs techniques such as hierarchical softmax and the N-gram hashing trick to speed up the training process.

2. **Subword Information**: FastText leverages subword information through character n-grams. This enables the model to handle rare words, misspellings, and word variations effectively.

3. **Language Agnostic**: FastText can work well with different languages, including those with large vocabularies or complex morphological structures. Its subword representation allows it to handle words that may not exist in the training data.

4. **Multi-label Classification**: FastText supports multi-label classification, where each instance can be assigned multiple labels. It treats each label independently and predicts the probability of each label separately.

5. **Pretrained Models**: FastText provides pre-trained models for various languages and tasks. These models can be fine-tuned on specific datasets or used as feature extractors for downstream tasks.

Overall, FastText's architecture and features make it a powerful tool for text classification, particularly in scenarios involving large-scale datasets, multilingual text, and multi-label classification tasks.

In [100]:
# tfidf 
# confusion matrix
# model eval 
# LLM architecture 
# LGBM working, decision tree 
# sent transformers
# fasttext 