In [1]:
# openpyxl is required for reading Excel files; uncomment the next line to install it:
# %pip install openpyxl

In [1]:
# Run the project's data-cleaning entry point before any data is loaded.
# NOTE(review): presumably this writes the cleaned CSV that Step 1 reads —
# confirm against data_cleaning.py, which is not visible from this notebook.
import data_cleaning as dc
dc.main()

The first step in our pipeline is loading the cleaned data into a pandas DataFrame. This is done with `pd.read_csv` from the pandas library, which is imported in the cell below.

In [8]:
# Step 1: Load the data
import pandas as pd
from tensorflow.keras.callbacks import EarlyStopping

# Location of the CSV that feeds the rest of the notebook.
NAMES_CSV_PATH = 'C:/Intro-to-Deep-Learning/chapter2/Project_02/data/MMNames_clean.csv'

# Read the whole file into the DataFrame used by the following steps.
df = pd.read_csv(filepath_or_buffer=NAMES_CSV_PATH)

In [13]:
# Step 2: Prepare the data
# Builds the label vector y and the feature matrix X, splits them 70/30 into
# train/test sets, and prints a small preview of the feature columns.
import data_preprocessing as dp
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# NOTE(review): StandardScaler is imported but not used anywhere in this cell.

# Transform the target column 'SR_Name' with the project helper.
# NOTE(review): presumably this encodes the labels as integers 0..K-1, which
# the sparse_categorical_crossentropy loss used later requires — confirm in
# data_preprocessing.py.
# Caution: df is overwritten here, so re-running this cell applies the helper
# to an already-processed frame.
df = dp.preprocess_category(df,'SR_Name')
# Earlier alternative (disabled): one-hot encode the 'name' column instead.
#df = dp.preprocess_onehot(df,'name')

# Target vector: the processed 'SR_Name' column as a NumPy array.
y = df['SR_Name'].values 
# Earlier alternative (disabled): use every remaining column as features.
#X = df.drop(columns=['SR_Name']).values
# --- CHANGE 1: Using new TF-IDF function ---
# This creates features based on character patterns, not identities.
# The helper returns the feature matrix together with the fitted vectorizer,
# whose feature names are used below to label the columns.
X, vectorizer = dp.preprocess_text_features(df, 'name')

print(f"Shape of feature matrix: {X.shape}") # Expected: (num_samples, 1000)
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape)


# Wrap the TF-IDF feature matrix X in a DataFrame so columns carry n-gram names.
# NOTE(review): the original comment called X "sparse"; the DataFrame builds
# and prints normally below, which suggests X is already a dense array —
# confirm what preprocess_text_features returns.
feature_df = pd.DataFrame(
        X, 
        columns=vectorizer.get_feature_names_out()      # one column per n-gram feature
)

# Display the first 5 rows and only the first 20 columns
print(feature_df.iloc[:5, :20])

Shape of feature matrix: (19513, 1000)
(13659, 1000) (5854, 1000)
     (   (e  (ea   (n  (no   (s  (so    a   ai  ain   au    b   ba   be   bo  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   

     c   ch  cha  cho    d  
0  0.0  0.0  0.0  0.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  
2  0.0  0.0  0.0  0.0  0.0  
3  0.0  0.0  0.0  0.0  0.0  
4  0.0  0.0  0.0  0.0  0.0  


In [19]:
# Step 3: Build a NN model with TensorFlow
import tensorflow as tf

def create_classification_model(input_shape, num_classes, params=None):
    """Build and compile a fully connected softmax classifier.

    The network is an input layer, four ReLU hidden layers of 128, 64, 32 and
    16 units, and a softmax output layer with ``num_classes`` units. It is
    compiled with the Adam optimizer, sparse categorical cross-entropy loss
    (so labels are expected as integer class indices, not one-hot vectors)
    and an accuracy metric.

    Args:
        input_shape: Shape of a single sample, e.g. ``[n_features]``.
        num_classes: Number of output classes.
        params: Optional dict of hyper-parameters. Accepted for interface
            compatibility but currently ignored. Defaults to ``None`` rather
            than ``{}`` so no mutable default object is shared between calls.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.InputLayer(shape=input_shape),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

In [20]:
# Step 4: Create and train the model
model = create_classification_model(
    input_shape=[X_train.shape[1]],
    num_classes=len(df['SR_Name'].unique()),
)

# Create the EarlyStopping callback BEFORE training and hand it to fit().
# A callback that is only constructed but never passed via `callbacks=` has no
# effect, and training would always run for the full number of epochs.
early_stopping = EarlyStopping(
    monitor='val_loss',        # watch the validation loss
    patience=10,               # epochs with no improvement before stopping
    restore_best_weights=True  # roll back to the epoch with the best val_loss
)

# NOTE(review): the test split doubles as the validation set here, so early
# stopping / best-weight selection sees the test data and the test metrics
# reported later are optimistic. Consider carving a separate validation split
# out of the training data.
history = model.fit(
    X_train, y_train,
    epochs=50,                 # upper bound; early stopping may end sooner
    batch_size=32,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
    verbose=0,
)

In [21]:
# Step 5: Evaluate the model

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Directory the classification reports are written to.
REPORT_DIR = 'C:/Intro-to-Deep-Learning/chapter2/Project_02/data'


def evaluate_and_save_report(model, X, y_true, csv_path):
    """Predict on X, build a per-class classification report and save it as CSV.

    Args:
        model: Trained classifier whose ``predict`` returns one score per
            class for every sample.
        X: Feature matrix to predict on.
        y_true: True integer class labels for the rows of ``X``.
        csv_path: Destination path of the CSV report.

    Returns:
        Tuple ``(y_pred, report, report_df)``: the predicted class indices,
        the raw report dict from ``classification_report`` and the rounded,
        transposed DataFrame that was written to ``csv_path``.
    """
    # Highest-scoring class per sample.
    y_pred = model.predict(X, batch_size=32, verbose=0).argmax(axis=1)
    report = classification_report(y_true, y_pred, output_dict=True)
    # After the transpose every row is one class (or the accuracy / average
    # summary rows) and every column is one metric.
    report_df = pd.DataFrame(report).round(2).transpose()
    # Keep the index when saving: it holds the class labels and the
    # 'accuracy' / 'macro avg' / 'weighted avg' row names. Writing with
    # index=False would leave the CSV rows unidentifiable.
    report_df.to_csv(csv_path, index_label='label')
    return y_pred, report, report_df


# Held-out test set.
y_pred_test, report_test, report_df_test = evaluate_and_save_report(
    model, X_test, y_test,
    f'{REPORT_DIR}/cls_report_test_4hidden_ngram.csv',
)

# Training set (compare with the test report to gauge overfitting).
# y_pred / report / report_df end up holding the training-set results, as they
# did before, in case later cells rely on them.
y_pred, report, report_df = evaluate_and_save_report(
    model, X_train, y_train,
    f'{REPORT_DIR}/cls_report_train_4hidden_ngram.csv',
)
