In [1]:
# %pip install openpyxl  # openpyxl is required by pandas to read Excel (.xlsx) files

In [4]:
# Run the project's data-cleaning entry point before any data is loaded.
# NOTE(review): presumably this produces ./data/MMNames_clean.csv, which the
# next step reads — confirm against data_cleaning.py.
import data_cleaning as dc
dc.main()

The first step in our pipeline is to load the cleaned data into a pandas DataFrame. This is done with the pandas library, which is imported in the cell below.

In [5]:
# Step 1: Load the data
# Read the cleaned CSV into the DataFrame `df` that every later step works on.
import pandas as pd

clean_csv_path = './data/MMNames_clean.csv'
df = pd.read_csv(clean_csv_path)

In [None]:
# Step 2: Prepare the data
# Encode the columns, separate target from features, and hold out 30% of the
# rows for testing.
import data_preprocessing as dp
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# NOTE(review): `df` is overwritten with its encoded version, so this cell is
# not idempotent — re-run Step 1 before running it a second time.
df = dp.preprocess_category(df, 'SR_Name')  # State/Region -> categorical code
df = dp.preprocess_onehot(df, 'name')       # Town/Village -> one-hot encoded columns

target_column = 'SR_Name'
y = df[target_column].values                      # labels
X = df.drop(columns=[target_column]).values       # every remaining column is a feature

# Fixed random_state keeps the 70/30 split identical between runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts
print(X_train.shape, X_test.shape)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ellen\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ellen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ellen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


(13659, 13003) (5854, 13003)


In [None]:
#print(df)

       SR_Name  name_(Du) Chee Yar Tan  name_(Du) Nyaung Pin Gyi  \
0            0                   False                     False   
1            0                   False                     False   
2            0                   False                     False   
3            0                   False                     False   
4            0                   False                     False   
...        ...                     ...                       ...   
19508       12                   False                     False   
19509        6                   False                     False   
19510       15                   False                     False   
19511       11                   False                     False   
19512       11                   False                     False   

       name_(Kyun Nyo Gyi) Kyun Hteik  name_(Pa) Nyaung Pin Gyi  \
0                               False                     False   
1                               False            

In [None]:
#print(X_train)

[[False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]
 ...
 [False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]]


In [None]:
# Step 3: Build a NN model with TensorFlow
import tensorflow as tf

def create_classification_model(input_shape, num_classes, params=None):
    """Build and compile a feed-forward softmax classifier.

    The network is a ``tf.keras.Sequential`` stack made of an input layer,
    four ReLU-activated dense layers of 64, 32, 16 and 8 units, and a softmax
    output layer with ``num_classes`` units. It is compiled with the Adam
    optimizer, the ``sparse_categorical_crossentropy`` loss (so targets are
    integer class ids rather than one-hot vectors) and the accuracy metric.

    Args:
        input_shape: Shape of a single sample, passed unchanged to
            ``InputLayer(shape=...)`` (e.g. ``[n_features]``).
        num_classes: Number of units in the softmax output layer.
        params: Optional hyper-parameter dict. It is accepted for interface
            compatibility but is not read by this function. The default is
            ``None`` rather than ``{}`` so that no mutable object is shared
            between calls.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    # Widths of the hidden layers, from first to last.
    hidden_units = (64, 32, 16, 8)

    layers = [tf.keras.layers.InputLayer(shape=input_shape)]
    layers.extend(
        tf.keras.layers.Dense(units, activation='relu') for units in hidden_units
    )
    # One output unit per class; softmax turns the outputs into probabilities.
    layers.append(tf.keras.layers.Dense(num_classes, activation='softmax'))

    model = tf.keras.Sequential(layers)
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
#print(X_train.shape[1], len(df['SR_Name'].unique())) # Input = 13003, output = 18

13003 18


In [16]:
# Step 4: Create and train the model
import time

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling are repeatable, in line with the fixed
# random_state=42 used for the train/test split. (Some GPU kernels may still
# be non-deterministic.)
tf.keras.utils.set_random_seed(42)

model = create_classification_model(
    input_shape=[X_train.shape[1]],               # one input per feature column
    num_classes=len(df['SR_Name'].unique()),      # one output per distinct State/Region code
)

# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments the way time.time() can.
start_time = time.perf_counter()

# model.fit() returns a History object; history.history is a dictionary of the
# loss and validation loss (and metrics, if specified) for each epoch.
# The test split is passed as validation data for monitoring only: no callback
# in this cell uses it to stop training or select a model.
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_test, y_test),
    verbose=0,
)

training_time = time.perf_counter() - start_time
print(f"Training time: {training_time:.2f} seconds")

Training time: 119.02 seconds


In [17]:
# Print the layer-by-layer architecture and parameter counts of the trained model.
model.summary()

In [18]:
# Step 5: Evaluate the model

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


def save_classification_report(model, features, labels, csv_path):
    """Predict on ``features``, build a classification report and save it as CSV.

    Args:
        model: Trained classifier whose ``predict`` returns one row of class
            probabilities per sample.
        features: Input samples to predict on.
        labels: True integer class labels for ``features``.
        csv_path: Destination path of the CSV report.

    Returns:
        A tuple ``(predicted, report, report_df)``: the predicted class labels,
        the report dictionary from ``classification_report`` and the rounded,
        transposed DataFrame that was written to ``csv_path``.
    """
    # batch_size=32 processes 32 samples at a time; verbose=0 disables the progress log.
    probabilities = model.predict(features, batch_size=32, verbose=0)
    # The index of the highest probability in each row is the predicted class.
    predicted = probabilities.argmax(axis=1)
    # output_dict=True returns the report as a dictionary for easy DataFrame conversion.
    report = classification_report(labels, predicted, output_dict=True)
    # After transposing there is one row per class plus the summary rows.
    report_df = pd.DataFrame(report).round(2).transpose()
    # The index holds the row names (class labels, 'accuracy', 'macro avg',
    # 'weighted avg'); it must be written, otherwise the rows of the CSV
    # cannot be told apart.
    report_df.to_csv(csv_path, index=True, index_label='label')
    return predicted, report, report_df


# Held-out test set: how well the model generalises.
y_pred_test, report_test, report_df_test = save_classification_report(
    model, X_test, y_test, './data/cls_report_test.csv'
)

# Training set: compare with the test report to spot over-fitting.
y_pred_train, report_train, report_df_train = save_classification_report(
    model, X_train, y_train, './data/cls_report_train.csv'
)

# Backward compatibility: this cell has always left these names bound to the
# training-set results. Prefer the explicit *_test / *_train names above.
y_pred, report, report_df = y_pred_train, report_train, report_df_train
