In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!cp /content/drive/MyDrive/train.csv .

In [4]:
data = pd.read_csv('train.csv')
data[0:5]

Unnamed: 0,text,category
0,အရေပြားမှိုစွဲ တာ ရဲ့ အတွေ့ ရ အများဆုံး လက္ခဏာ...,med
1,ဘူး လေး အပြင် အိတ် လေး ပါ ပါ သေး တယ် ဆို တော့ ...,ecom
2,ကလေး တို့ မျက်လုံး ကျန်းမာရေး အတွက် ဘယ် အချိန်...,ecom
3,ပြင်းထန် အဆုတ်ရောင် ရောဂါ ဟာ ဖွံ့ဖြိုးဆဲ နိုင်...,med
4,တစ်ခါ တည်း ရှင်းပြ တာ နားလည် သွား တယ်,dc


In [5]:
# Encode the target labels
label_encoder = LabelEncoder()

data['label'] = label_encoder.fit_transform(data['category'])

In [6]:
data['label']

0         2
1         1
2         1
3         2
4         0
         ..
105605    1
105606    1
105607    1
105608    0
105609    2
Name: label, Length: 105610, dtype: int64

In [7]:
# Split the data into training, validation, and test sets
train_data, temp_data = train_test_split(data, test_size=0.1, random_state=42)
valid_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

In [8]:
train_data[0:5]

Unnamed: 0,text,category,label
74268,ဇရာ ကို ရှောင် ချင် ကြ တော့ မျက်နှာ ကို အမျိုး...,dc,0
63718,အမျိုးသမီး တွေ မှာ သံဓာတ် ချို့တဲ့ တာ ရဲ့ အဓိက...,dc,0
59703,အသက် ဆယ့် နှစ် နှစ် နဲ့ ခြောက်ဆယ့် ငါး ကြား ကျ...,med,2
29517,ရာသီဥတု က မိုးရွာ မဲ့ ပုံ ပဲ,dc,0
55886,အသစ် ထုတ် လိုက် တဲ့ စည်းကမ်း နဲ့ ပတ်သက် ပြီး မ...,dc,0


In [9]:
len(train_data)

95049

In [10]:
valid_data[0:5]

Unnamed: 0,text,category,label
26509,အညို လို ချင် ပါ တယ်,ecom,1
86931,အပူဒဏ် စုပ်ယူမှု နည်း တဲ့ အရောင် ဖျော့ဖျော့ အဝ...,med,2
65315,လက်ဆောင် တွေ စု ထား လို့ ရ ပြီ,dc,0
101132,သုံးစွဲ အပြီး စိတ်ကျေနပ်မှု ရရှိ စေ မှာ ပါ,ecom,1
97640,အခု ဝင် လို့ ရ တယ်,dc,0


In [11]:
len(valid_data)

5280

In [12]:
test_data[0:5]

Unnamed: 0,text,category,label
49792,မိုက်ရိုင်း တာ ဘယ်သူ လဲ,dc,0
32515,အချိန် ကြာ ညောင်း တာ ကို ပြော ချင် တာ,dc,0
89733,သန္ဓေသား ၏ လည်ကုပ် သည် မိခင် ၏ ဆီးခုံရိုး အတွင...,med,2
11643,ကိုယ်လက် တွေ ညောင်းညာ ကိုက်ခဲ နေ တယ်,dc,0
342,အသက် ခြောက် လ မှ သုံး နှစ် အတွင်း ဖြစ်ပွားမှု ...,med,2


In [13]:
len(test_data)

5281

In [14]:
# Tokenizer for Burmese
def tokenize(line):
    sentence = line.split(" ")
    return sentence

In [15]:
# Convert text data into numerical features using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), tokenizer=tokenize)
X_train = tfidf_vectorizer.fit_transform(train_data['text'])
X_valid = tfidf_vectorizer.transform(valid_data['text'])
X_test = tfidf_vectorizer.transform(test_data['text'])



In [16]:
# Save the TF-IDF vectorizer to a file
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [17]:
X_train[0:1]

<1x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [18]:
# Create LightGBM datasets
train_dataset = lgb.Dataset(X_train, label=train_data['label'])
valid_dataset = lgb.Dataset(X_valid, label=valid_data['label'], reference=train_dataset)

In [19]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_logloss',
    'num_leaves': 127,
    'learning_rate': 0.1,
    # 'bagging_freq': 5,
    # 'bagging_fraction': 0.8,
    # 'feature_fraction': 0.8,
    # 'lambda_l1': 0.1,
    # 'lambda_l2': 0.2,
    'force_col_wise': True,
    'verbose': 10 }

In [20]:
# Train the LightGBM model with early stopping
num_round = 500  # Adjust the number of boosting rounds
bst = lgb.train(
    params,
    train_dataset,
    num_round,
    valid_sets=[train_dataset, valid_dataset],
    valid_names=['train', 'valid'],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.998274
[LightGBM] [Info] Total Bins 345016
[LightGBM] [Info] Number of data points in the train set: 95049, number of used features: 9958
[LightGBM] [Info] Start training from score -1.070633
[LightGBM] [Info] Start training from score -1.158208
[LightGBM] [Info] Start training from score -1.069559
[LightGBM] [Debug] Trained a tree with leaves = 127 and depth = 63
[LightGBM] [Debug] Trained a tree with leaves = 127 and depth = 67
[LightGBM] [Debug] Trained a tree with leaves = 127 and depth = 36
Training until validation scores don't improve for 10 rounds
[LightGBM] [Debug] Trained a tree with leaves = 127 and depth = 55
[LightGBM] [Debug] Trained a tree with leaves = 127 and depth = 70
[LightGBM] [Debug] Trained a tree with leaves = 127 and depth = 33
[LightGBM] [Debug] Trained a tree with leaves = 127 and depth = 50
[LightGBM] [Debug] Trained a tree with leaves = 127 and depth = 72
[LightGBM] [Debug] Trained a t

In [21]:
# Track the best performance on the validation set
best_iteration = bst.best_iteration
best_score = bst.best_score['valid']['multi_logloss']

print(best_score)

# Make predictions on the test set using the best model
y_pred = bst.predict(X_test, num_iteration=best_iteration)
y_pred = np.argmax(y_pred, axis=1)

# Decode the predicted labels
y_pred_label = label_encoder.inverse_transform(y_pred)

0.3782832671209029


In [22]:
# Evaluate the model
accuracy = accuracy_score(test_data['category'], y_pred)
print(f'Best Iteration: {best_iteration}')
print(f'Best Validation Log Loss: {best_score}')
print(f'Accuracy: {accuracy}')
test_data['category'] = label_encoder.fit_transform(test_data['category'])
print(classification_report(test_data['category'], y_pred))

Best Iteration: 271
Best Validation Log Loss: 0.3782832671209029
Accuracy: 0.0
              precision    recall  f1-score   support

           0       0.86      0.89      0.87      1770
           1       0.86      0.82      0.84      1671
           2       0.87      0.87      0.87      1840

    accuracy                           0.86      5281
   macro avg       0.86      0.86      0.86      5281
weighted avg       0.86      0.86      0.86      5281



In [23]:
# Save the trained model
bst.save_model('burmese_text_classifier.model')

<lightgbm.basic.Booster at 0x7d941543a8f0>

In [24]:
# Load the trained LightGBM model
bst = lgb.Booster(model_file='burmese_text_classifier.model')

# Load the TF-IDF vectorizer used during training
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')  # Replace with your actual vectorizer file

# Input sentence to test
sentence = ["ကင်ဆာ ရောဂါ နဲ့ ဆုံး သွား တာ", "မင်္ဂလာ ပါ ခင်ဗျာ", "စား ဖူး သူ တိုင်း အရမ်း ကြိုက် ကြ တဲ့ ဂျယ်လီ လေး ပါ"]

# Transform the input sentence using the TF-IDF vectorizer
test_data = tfidf_vectorizer.transform(sentence)

# Make predictions using the loaded model
predictions = bst.predict(test_data)

print(predictions)

[[3.22784238e-01 6.91370312e-02 6.08078731e-01]
 [5.70383158e-01 4.21052527e-01 8.56431446e-03]
 [3.33773036e-03 9.96134689e-01 5.27580926e-04]]


In [25]:
predictions = np.argmax(predictions, axis=1)
print (predictions)

# Decode the predicted labels
pred_label = label_encoder.inverse_transform(predictions)
print (pred_label)

[2 0 1]
['med' 'dc' 'ecom']
