In [None]:
import numpy as np
import pandas as pd
import cv2
import os
from skimage.feature import graycomatrix, graycoprops

# Function to compute GLCM features for six properties across four directions
def compute_glcm_features_4_directions(window, distances=(1,), levels=256):
    """Return six GLCM texture descriptors averaged over four directions.

    Parameters
    ----------
    window : numpy.ndarray
        Image patch. A 3-D array is converted to grayscale with
        ``cv2.COLOR_BGR2GRAY``; a 2-D array is used as is.
    distances : sequence of int, optional
        Pixel-pair distances forwarded to ``graycomatrix``.
    levels : int, optional
        Number of gray levels. Intensities are clipped to ``[0, levels - 1]``.

    Returns
    -------
    list
        ``[contrast, dissimilarity, homogeneity, ASM, energy, correlation]``,
        each one the mean of the property over every distance and the four
        angles 0, 45, 90 and 135 degrees.
    """
    # Convert to grayscale if the window is not already
    if len(window.shape) == 3:
        window = cv2.cvtColor(window, cv2.COLOR_BGR2GRAY)

    # Clip into [0, levels-1]. uint8 can only hold 256 levels, so a wider
    # integer type is used when more levels are requested; casting to uint8
    # unconditionally would wrap the clipped values around.
    storage_dtype = np.uint8 if levels <= 256 else np.uint16
    window = np.clip(window, 0, levels - 1).astype(storage_dtype)

    # Define angles for the four directions: 0°, 45°, 90°, 135°
    angles = [0, np.pi/4, np.pi/2, 3*np.pi/4]

    # Compute the symmetric, normalised co-occurrence matrices
    glcm = graycomatrix(window, distances=distances, angles=angles, levels=levels, symmetric=True, normed=True)

    # ASM is needed twice (as itself and for energy = sqrt(ASM)), so compute it once
    asm = graycoprops(glcm, 'ASM')

    return [
        np.mean(graycoprops(glcm, 'contrast')),
        np.mean(graycoprops(glcm, 'dissimilarity')),
        np.mean(graycoprops(glcm, 'homogeneity')),
        np.mean(asm),
        np.mean(np.sqrt(asm)),
        np.mean(graycoprops(glcm, 'correlation')),
    ]

# Load the ground-truth table from the Excel file (columns used below: 'image name' and 'gt')
labels_df = pd.read_excel(r'//home//jaykishor_c//final-model-training//line_gt 7.xlsx')  

# Parameters
window_width = 50  # Width of each sliding window in pixels
step_size = 20     # Step size of the sliding window in pixels
image_folder = r'//home//jaykishor_c//final-model-training//color_equlsize_jpg 2//color_equlsize_jpg'  

# Alternative dataset, kept for reference:
# labels_df = pd.read_excel(r'//home//jaykishor_c//final-model-training//gt_WIndow .xlsx')
# image_folder = r'//home//jaykishor_c//final-model-training//color_window_double1//color_window_double1' 

# Maps each character to a list of GLCM feature sequences, one sequence per occurrence
character_glcm_sequences = {}

# Process each image (word) in the dataset
for index, row in labels_df.iterrows():
    image_name = row['image name']          # image name of respective gt text
    character_sequence = row['gt']          # gt text

    # A blank Excel cell is read as NaN (a float) and an empty string would make
    # the per-character width below divide by zero, so such rows are skipped.
    if not isinstance(character_sequence, str) or len(character_sequence) == 0:
        print(f"Row {index} ({image_name}) has no ground-truth text; skipping.")
        continue

    # Load the corresponding image
    image_path = os.path.join(image_folder, image_name)
    image = cv2.imread(image_path)
    if image is None:
        print(f"Image {image_name} could not be loaded.")
        continue

    image_width = image.shape[1]  # Get image width

    # The image is split into equal-width regions, one per element of the gt string.
    # NOTE(review): len() counts code points; for scripts with combining signs this
    # may differ from the number of visible glyphs - confirm this is intended.
    num_characters = len(character_sequence)
    character_width = image_width // num_characters

    # When a region is narrower than one window this is <= 0 and the character
    # simply gets an empty sequence (range() of a non-positive number is empty).
    num_windows = (character_width - window_width) // step_size + 1

    # Loop through each character in the sequence and collect its GLCM features
    for i, char in enumerate(character_sequence):
        region_start = i * character_width
        char_glcm_sequence = []

        for j in range(num_windows):
            # Window position inside the current character region (full image height)
            window_start = region_start + j * step_size
            window_end = window_start + window_width
            window = image[:, window_start:window_end]

            # Compute GLCM features for this window across four directions
            char_glcm_sequence.append(compute_glcm_features_4_directions(window))

        # Append this character's GLCM features to the global dictionary
        character_glcm_sequences.setdefault(char, []).append(char_glcm_sequence)

# Print the number of GLCM feature sequences for each character
for char, sequences in character_glcm_sequences.items():
    print(f"Character '{char}'  has {len(sequences)}    sequences of GLCM features.")

# Save the GLCM features to a file for analysis (one row per character occurrence)
output_path = r'character_glcm_features_final.xlsx'
features_df = pd.DataFrame({
    'Character': [char for char, sequences in character_glcm_sequences.items() for _ in sequences],
    'GLCM Features': [seq for sequences in character_glcm_sequences.values() for seq in sequences]
})

features_df.to_excel(output_path, index=False)
print(f"GLCM features saved to {output_path}")


In [None]:
# Number of sliding windows collected for the last character handled by the extraction loop
# (depends on the loop variable `char_glcm_sequence` still being bound in the kernel).
len(char_glcm_sequence)

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from hmmlearn import hmm
from hmmlearn.hmm import GaussianHMM


# Dictionary to store one trained HMM per character
character_hmms = {}
num_states = 4

# 'character_glcm_sequences' contains the GLCM feature sequences for each character
for char, sequences in character_glcm_sequences.items():
    # Remove empty sequences
    sequences = [seq for seq in sequences if len(seq) > 0]

    if len(sequences) == 0:
        print(f"Warning: No valid sequences for character {char}. Skipping this character.")
        continue  # Skip to the next character if no valid sequences

    # Characters with fewer than 15 usable sequences are not modelled
    if len(sequences) < 15:
        continue

    # hmmlearn expects all sequences stacked row-wise plus the length of each one
    X = np.vstack(sequences)
    lengths = [len(seq) for seq in sequences]

    # A fixed random_state makes the fit repeatable from run to run.
    # NOTE(review): init_params='' is kept from the original; no start/transition/
    # emission parameters are assigned before fit(), so confirm which initialisation
    # hmmlearn actually applies here.
    model = hmm.GaussianHMM(n_components=num_states, covariance_type="diag", n_iter=1000,
                            init_params='', random_state=0, verbose=True)

    # Fit on every sequence of this character at once
    model.fit(X, lengths)

    # Store the trained model
    character_hmms[char] = model
    print(f"Model trained for character: {char}")


In [None]:
# Inspect the transition matrix of `model`, i.e. whichever HMM that name is currently bound to in the kernel
print(model.transmat_)  # View the transition matrix after training
# Row sums of the transition matrix; later cells treat rows that do not sum to 1 as a broken model
row_sums = model.transmat_.sum(axis=1)
print(row_sums)



In [None]:
import joblib
import re

# Assuming your models are in a dictionary called `character_hmms`
# e.g., `character_hmms = {'െ': trained_model_1, 'ത': trained_model_2, ... }`

def sanitize_filename(char):
    """Return `char` with every one of < > : " / \\ | ? * swapped for an underscore."""
    forbidden = re.compile(r'[<>:"/\\|?*]')
    return forbidden.sub('_', char)

# Persist every trained HMM as "<sanitized character>_hmm.pkl" in the working directory
for char, model in character_hmms.items():
    safe_name = sanitize_filename(char)  # filename-safe form of the character
    print(char, safe_name)
    joblib.dump(model, f"{safe_name}_hmm.pkl")

In [None]:
# Shape of `X` as left behind by the most recent cell that assigned it (stacked feature rows of one character)
print(X.shape)  

In [None]:
# Characters that have a trained HMM, in dictionary order
char_names = [name for name in character_hmms]

# Show them
print("Character names:", char_names)

In [None]:
# Number of characters that have a trained HMM
len(char_names)

In [None]:
# The BLEU reference used below is simply the list of modelled characters.
# Note this is an alias, not a copy: both names refer to the same list object.
ground_truth =char_names

In [None]:
# Show the reference list that the BLEU cells compare predictions against
print("Ground truth list:", ground_truth)

In [None]:
# Rebuild the list of modelled characters from the keys of character_hmms
char_names = [*character_hmms.keys()]

# Show them
print("Character names:", char_names)

In [None]:
# Sorted set of every individual character appearing in the ground-truth entries
unique_chars = sorted({symbol for entry in ground_truth for symbol in entry})
# Index -> character, following the sorted order
state_to_char = dict(enumerate(unique_chars))
# Character -> index, the inverse of the mapping above
char_to_state = {symbol: position for position, symbol in state_to_char.items()}

In [None]:
from nltk.translate.bleu_score import sentence_bleu
import numpy as np

# Characters whose models are to be discarded; extended below with every
# character whose transition matrix turns out to be invalid.
characters_to_remove = ['ണ',' ൾ','ു','ഥ','ൃ','ബ','ൂഃ','ഃ','ൻ']
 
# Calculate BLEU score for predictions
for char, model in character_hmms.items():
    try:
        # Remove empty sequences
        sequences = [seq for seq in character_glcm_sequences[char] if len(seq) > 0]
        # Only characters with at least 15 usable sequences are evaluated
        if len(sequences) < 15:
            continue

        X = np.vstack(sequences)
        lengths = [len(seq) for seq in sequences]

        # Check for valid transition matrix: every row must sum to 1
        row_sums = model.transmat_.sum(axis=1)
        if not np.allclose(row_sums, 1):
            print(f"Problem with transition matrix for character '{char}': row sums = {row_sums}")
            # Avoid listing the same character twice
            if char not in characters_to_remove:
                characters_to_remove.append(char)
            continue  # Skip this model and move to the next one

        # Decode each sequence separately; without `lengths` the stacked rows
        # would be treated as one long sequence with transitions across boundaries.
        predicted_states = model.predict(X, lengths)
        # NOTE(review): hidden-state indices are looked up in state_to_char, which is
        # built from the character list - confirm this mapping is meaningful.
        mapped_predictions = [state_to_char[state] for state in predicted_states]

        # Calculate BLEU score
        bleu_score = sentence_bleu([ground_truth], mapped_predictions)
        print(f"BLEU Score for character '{char}': {bleu_score}")
    except ValueError as e:
        print(f"Error with character '{char}': {e}")

In [None]:
# Report the characters scheduled for removal (the row-sum text in the message is fixed, not computed here)
print(f"These all character should be removed : {characters_to_remove}, becauses Problem with transition matrix for character, row sums = [0. 0. 0. 0.] ")

In [None]:
# Drop the HMM of every flagged character and report the outcome for each one
for char in characters_to_remove:
    print (char)
    if char not in character_hmms:
        print(f"Character {char} not found in character_hmms.")
        continue
    del character_hmms[char]
    print(f"Deleted HMM model for character: {char}")

In [None]:

from nltk.translate.bleu_score import sentence_bleu
import numpy as np

# Calculate a BLEU score for the decoded state sequence of every remaining character model
for char, model in character_hmms.items():
    # Remove empty sequences
    sequences = [seq for seq in character_glcm_sequences[char] if len(seq) > 0]

    # Skip characters without enough sequences; decoding anyway would silently
    # reuse the X left over from a previous character.
    if len(sequences) < 15:
        continue

    X = np.vstack(sequences)
    lengths = [len(seq) for seq in sequences]

    # Decode each sequence separately by passing the sequence lengths
    predicted_states = model.predict(X, lengths)
    mapped_predictions = [state_to_char[state] for state in predicted_states]

    # Score inside the loop so that every character is reported, not only the last one
    bleu_score = sentence_bleu([ground_truth], mapped_predictions)
    print(f"BLEU Score for character '{char}': {bleu_score}")


In [None]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
import numpy as np

# One record per evaluated character
results = []

# Loop through each character and its model
for char, model in character_hmms.items():
    # Remove empty sequences
    sequences = [seq for seq in character_glcm_sequences[char] if len(seq) > 0]

    # Only characters with at least 15 usable sequences are evaluated
    if len(sequences) < 15:
        continue

    X = np.vstack(sequences)
    lengths = [len(seq) for seq in sequences]

    # Decode each sequence separately; without `lengths` the stacked rows would
    # be treated as a single long sequence.
    predicted_states = model.predict(X, lengths)
    mapped_predictions = [state_to_char[state] for state in predicted_states]

    # Calculate BLEU score against the list of modelled characters
    bleu_score = sentence_bleu([ground_truth], mapped_predictions)

    # Store the results
    results.append({
        'Character': char,
        'Mapped Predictions': ''.join(mapped_predictions),
        'BLEU Score': bleu_score
    })

# Create a DataFrame to store results
results_df = pd.DataFrame(results)

# Save the results to an Excel file
# NOTE(review): the file name mentions "hog" although the features here are GLCM - confirm the name.
output_path = r'mapped_predictions_bleu_scores_hog.xlsx'
results_df.to_excel(output_path, index=False)
print(f"Results saved to {output_path}")


In [None]:
# Display the per-character BLEU results table
results_df

**Log Probability**

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it is not executed.
# If enabled it would print model.score(X, lengths) for every character with at least 15 sequences.
'''for char, model in character_hmms.items():
    sequences = character_glcm_sequences[char]
    sequences = [seq for seq in sequences if len(seq) > 0]

    if len(sequences) >= 15:
        X = np.vstack(sequences)
        lengths = [len(seq) for seq in sequences]
        log_probability = model.score(X, lengths)
        print(f"Log Probability for character '{char}': {log_probability}")
'''

**Model Probability**

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it is not executed.
# If enabled it would exponentiate the log-likelihood of each character's stacked sequences.
# NOTE(review): np.exp of a very negative log-likelihood presumably underflows to 0.0 - verify before relying on it.
'''for char, model in character_hmms.items():
    sequences = character_glcm_sequences[char]
    sequences = [seq for seq in sequences if len(seq) > 0]

    if len(sequences) >= 15:
        X = np.vstack(sequences)
        lengths = [len(seq) for seq in sequences]
        log_probability = model.score(X, lengths)
        model_probability = np.exp(log_probability)  # Convert to regular probability
        print(f"Model Probability for character '{char}': {model_probability}")
'''

**Transition Matrix Validation**

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it is not executed.
# If enabled it would report, per character, whether every transition-matrix row sums to 1.
'''for char, model in character_hmms.items():
    row_sums = model.transmat_.sum(axis=1)
    if not np.allclose(row_sums, 1):
        print(f"Transition matrix issue for '{char}': {row_sums}")
    else:
        print(f"Transition matrix for '{char}' is valid.")
'''

**Emission Probabilities**

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it is not executed.
# If enabled it would print each model's emission means (means_) and covariances (covars_).
'''for char, model in character_hmms.items():
    print(f"Emission means for character '{char}': {model.means_}")
    print(f"Emission covariances for character '{char}': {model.covars_}")
'''

**Per-State Log Probability**

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it is not executed.
# If enabled it would call model.score_samples(X) and print the returned log probability and posteriors.
# NOTE(review): score_samples is called without sequence lengths, unlike the score() cells - confirm that is intended.
'''for char, model in character_hmms.items():
    sequences = character_glcm_sequences[char]
    sequences = [seq for seq in sequences if len(seq) > 0]

    if len(sequences) >= 15:
        X = np.vstack(sequences)  # Combine all sequences into a single array

        # Use score_samples to compute posterior probabilities for states
        log_prob, posteriors = model.score_samples(X)
        print(f"Log probabilities for character '{char}': {log_prob}")
        print(f"Posterior probabilities for states:\n{posteriors}")
'''

**Actual vs Prediction**

In [None]:
# Dictionary to store actual vs predicted characters
results = {'Actual': [], 'Predicted': []}

# Classify every feature sequence by the character model that scores it highest.
# Note: the sequences scored here come from the same `character_glcm_sequences`
# dictionary the models were fitted on, and characters that have no model in
# `character_hmms` can never be predicted, so they always count as errors.
for actual_char, test_sequences in character_glcm_sequences.items():
    for test_sequence in test_sequences:
        test_sequence = np.array(test_sequence)

        # An empty sequence cannot be scored by any model; previously every score
        # fell back to -inf and the first model won by default, so skip it instead.
        if test_sequence.size == 0:
            continue

        # Score the test sequence against each character's HMM model
        scores = {}
        for char, model in character_hmms.items():
            try:
                score = model.score(test_sequence)
            except Exception:
                # Best effort: a model that cannot score this sequence simply loses.
                # (Narrower than a bare except, so Ctrl-C still interrupts the loop.)
                score = -np.inf
            # NaN compares unpredictably inside max(); treat it as the worst score
            scores[char] = -np.inf if np.isnan(score) else score

        # Nothing to choose from when no models are available
        if not scores:
            continue

        # Predict the character with the highest score
        predicted_char = max(scores, key=scores.get)

        # Store the result
        results['Actual'].append(actual_char)
        results['Predicted'].append(predicted_char)

# Save the results to an Excel file
output_path = r'actual_vs_predicted.xlsx'
results_df = pd.DataFrame(results)
results_df.to_excel(output_path, index=False)

# Calculate accuracy
correct_predictions = sum(
    1 for actual, predicted in zip(results['Actual'], results['Predicted']) if actual == predicted
)
total_predictions = len(results['Actual'])

# Print accuracy (guarding against an empty evaluation set)
if total_predictions > 0:
    accuracy = (correct_predictions / total_predictions) * 100
    print(f"Accuracy: {accuracy:.2f}%")
else:
    print("Accuracy: not available (no sequences were scored)")


print(f"Results saved to {output_path}")