In [1]:
# import json
# import csv
# import glob
# import os

# # Path to JSON files
# json_files = glob.glob('/home/kushagra/Documents/code/AI/Adobe-India-Hackathon25-main/adobe_india_hackathon/Challenge - 1(a)/Datasets/Output.json/*.json')

# # Output CSV file
# csv_filename = 'output_outline.csv'
# fieldnames = ['file', 'title', 'level', 'text', 'page']  # Added 'title'

# with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
#     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#     writer.writeheader()
    
#     for json_file in json_files:
#         with open(json_file, 'r', encoding='utf-8') as f:
#             try:
#                 data = json.load(f)
#                 title = data.get('title', '')  # Get title once per file
#                 for item in data.get('outline', []):
#                     writer.writerow({
#                         'file': os.path.basename(json_file),
#                         'title': title,
#                         'level': item.get('level', ''),
#                         'text': item.get('text', ''),
#                         'page': item.get('page', '')
#                     })
#             except json.JSONDecodeError as e:
#                 print(f"Error decoding JSON file {json_file}: {e}")


In [2]:
import pandas as pd
data = pd.read_csv('output_outline.csv')

In [3]:
data

Unnamed: 0,file,title,level,text,page
0,E0CCG5S312.json,Overview Foundation Level Extensions,H1,Revision History,2
1,E0CCG5S312.json,Overview Foundation Level Extensions,H1,Table of Contents,3
2,E0CCG5S312.json,Overview Foundation Level Extensions,H1,Acknowledgements,4
3,E0CCG5S312.json,Overview Foundation Level Extensions,H1,1. Introduction to the Foundation Level Extens...,5
4,E0CCG5S312.json,Overview Foundation Level Extensions,H1,2. Introduction to Foundation Level Agile Test...,6
...,...,...,...,...,...
56,STEMPathwaysFlyer.json,,H1,Parsippany -Troy Hills STEM Pathways,0
57,STEMPathwaysFlyer.json,,H2,PATHWAY OPTIONS,0
58,STEMPathwaysFlyer.json,,H2,Elective Course Offerings,1
59,STEMPathwaysFlyer.json,,H3,What Colleges Say!,1


In [4]:
# Check data structure and unique levels
print("Data shape:", data.shape)
print("\nColumn names:", data.columns.tolist())
print("\nUnique levels:", data['level'].unique())
print("\nLevel counts:")
print(data['level'].value_counts())
print("\nFirst few rows:")
print(data.head())

Data shape: (61, 5)

Column names: ['file', 'title', 'level', 'text', 'page']

Unique levels: ['H1' 'H2' 'H3' 'H4']

Level counts:
level
H3    26
H2    20
H1    11
H4     4
Name: count, dtype: int64

First few rows:
              file                                    title level  \
0  E0CCG5S312.json  Overview  Foundation Level Extensions      H1   
1  E0CCG5S312.json  Overview  Foundation Level Extensions      H1   
2  E0CCG5S312.json  Overview  Foundation Level Extensions      H1   
3  E0CCG5S312.json  Overview  Foundation Level Extensions      H1   
4  E0CCG5S312.json  Overview  Foundation Level Extensions      H1   

                                                text  page  
0                                  Revision History      2  
1                                 Table of Contents      3  
2                                  Acknowledgements      4  
3  1. Introduction to the Foundation Level Extens...     5  
4  2. Introduction to Foundation Level Agile Test...     6  


In [5]:
# Import required libraries for machine learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
# Data preprocessing and feature engineering
print("Preparing data for Random Forest classification...")

# Clean and prepare the data
data_clean = data.dropna(subset=['title', 'text', 'level'])

# Combine title and text for feature extraction
data_clean['combined_text'] = data_clean['title'].astype(str) + ' ' + data_clean['text'].astype(str)

# Remove empty or very short texts
data_clean = data_clean[data_clean['combined_text'].str.len() > 3]

print(f"Cleaned data shape: {data_clean.shape}")
print(f"Level distribution after cleaning:")
print(data_clean['level'].value_counts())

Preparing data for Random Forest classification...
Cleaned data shape: (56, 6)
Level distribution after cleaning:
level
H3    25
H2    18
H1     9
H4     4
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['combined_text'] = data_clean['title'].astype(str) + ' ' + data_clean['text'].astype(str)


In [7]:
# Feature extraction using TF-IDF with multilingual UTF-8 support
print("Extracting features using TF-IDF with multilingual support...")

# Create TF-IDF features from combined text with UTF-8 and multilingual support
vectorizer = TfidfVectorizer(
    max_features=2000,  # Increased for multilingual content
    ngram_range=(1, 3),  # Use unigrams, bigrams, and trigrams for better multilingual support
    min_df=1,  # Reduced to handle diverse languages
    max_df=0.9,  # Slightly increased threshold
    lowercase=True,  # Convert to lowercase for consistency
    analyzer='char_wb',  # Character-based analysis for multilingual support
    encoding='utf-8',  # Explicit UTF-8 encoding
    decode_error='ignore',  # Handle encoding errors gracefully
    strip_accents='unicode',  # Handle accented characters
    token_pattern=r'(?u)\b\w+\b'  # Unicode-aware word boundaries
)

# Alternative configuration for word-based analysis (uncomment if preferred)
# vectorizer = TfidfVectorizer(
#     max_features=2000,
#     stop_words=None,  # No stop words for multilingual support
#     ngram_range=(1, 2),
#     min_df=1,
#     max_df=0.9,
#     lowercase=True,
#     encoding='utf-8',
#     decode_error='ignore',
#     strip_accents='unicode'
# )

# Fit and transform the text data
X = vectorizer.fit_transform(data_clean['combined_text'])
y = data_clean['level']

print(f"Feature matrix shape: {X.shape}")
print(f"Target variable shape: {y.shape}")
print(f"Vectorizer encoding: {vectorizer.encoding}")
print(f"Vectorizer analyzer: {vectorizer.analyzer}")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

Extracting features using TF-IDF with multilingual support...
Feature matrix shape: (56, 954)
Target variable shape: (56,)
Vectorizer encoding: utf-8
Vectorizer analyzer: char_wb
Training set size: 50
Testing set size: 6




In [8]:
from sklearn.model_selection import GridSearchCV

# Train Random Forest Classifier with hyperparameter tuning and class weighting
print("Training Random Forest Classifier with GridSearch and Class Weighting...")

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True]
}

# Create a Random Forest classifier with balanced class weights
rf_classifier_balanced = RandomForestClassifier(
    random_state=42,
    class_weight='balanced'  # Address data imbalance
)

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=rf_classifier_balanced,
    param_grid=param_grid,
    cv=3,  # 3-fold cross-validation
    n_jobs=-1,  # Use all available cores
    verbose=1
)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best estimator
rf_classifier = grid_search.best_estimator_

print("\nBest Parameters found by GridSearchCV:")
print(grid_search.best_params_)

# Make predictions with the best model
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred, labels=np.unique(y))
print(cm)

Training Random Forest Classifier with GridSearch and Class Weighting...
Fitting 3 folds for each of 24 candidates, totalling 72 fits

Best Parameters found by GridSearchCV:
{'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}

Model Accuracy: 0.8333

Classification Report:
              precision    recall  f1-score   support

          H1       1.00      1.00      1.00         1
          H2       1.00      0.50      0.67         2
          H3       0.75      1.00      0.86         3

    accuracy                           0.83         6
   macro avg       0.92      0.83      0.84         6
weighted avg       0.88      0.83      0.82         6


Confusion Matrix:
[[1 0 0 0]
 [0 1 1 0]
 [0 0 3 0]
 [0 0 0 0]]

Best Parameters found by GridSearchCV:
{'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}

Model Accuracy: 0.8333

Classification Report:
              precision    recall  

In [9]:
# Function to predict heading level for new title and text
def predict_heading_level(title, text):
    """
    Predict the heading level (H1, H2, H3, H4) for given title and text
    
    Args:
        title (str): The title/heading text
        text (str): The content text
    
    Returns:
        tuple: (predicted_level, confidence_scores)
    """
    # Combine title and text
    combined_text = str(title) + ' ' + str(text)
    
    # Transform using the same vectorizer
    text_features = vectorizer.transform([combined_text])
    
    # Predict
    prediction = rf_classifier.predict(text_features)[0]
    
    # Get prediction probabilities
    probabilities = rf_classifier.predict_proba(text_features)[0]
    classes = rf_classifier.classes_
    
    # Create confidence scores dictionary
    confidence_scores = dict(zip(classes, probabilities))
    
    return prediction, confidence_scores

# Test the prediction function with some examples
print("Testing the prediction function:")
print("-" * 50)

# Example 1
title1 = "Introduction"
text1 = "This document provides an overview of the foundation level extensions"
pred1, conf1 = predict_heading_level(title1, text1)
print(f"Title: '{title1}'")
print(f"Text: '{text1}'")
print(f"Predicted Level: {pred1}")
print(f"Confidence Scores: {conf1}")
print()

# Example 2
title2 = "Testing Methods"
text2 = "Various testing approaches and methodologies used in software development"
pred2, conf2 = predict_heading_level(title2, text2)
print(f"Title: '{title2}'")
print(f"Text: '{text2}'")
print(f"Predicted Level: {pred2}")
print(f"Confidence Scores: {conf2}")
print()

# Example 3
title3 = "Specific Implementation Details"
text3 = "Detailed explanation of implementation steps and procedures"
pred3, conf3 = predict_heading_level(title3, text3)
print(f"Title: '{title3}'")
print(f"Text: '{text3}'")
print(f"Predicted Level: {pred3}")
print(f"Confidence Scores: {conf3}")

Testing the prediction function:
--------------------------------------------------
Title: 'Introduction'
Text: 'This document provides an overview of the foundation level extensions'
Predicted Level: H2
Confidence Scores: {'H1': 0.23984548784548784, 'H2': 0.6984795983743354, 'H3': 0.061674913780176935, 'H4': 0.0}

Title: 'Testing Methods'
Text: 'Various testing approaches and methodologies used in software development'
Predicted Level: H2
Confidence Scores: {'H1': 0.2567142857142858, 'H2': 0.5789341269288432, 'H3': 0.13435158735687142, 'H4': 0.03}

Title: 'Specific Implementation Details'
Text: 'Detailed explanation of implementation steps and procedures'
Predicted Level: H2
Confidence Scores: {'H1': 0.320418747918748, 'H2': 0.5875345434292805, 'H3': 0.08204670865197179, 'H4': 0.01}


In [11]:
# Test multilingual capabilities
print("Testing multilingual capabilities:")
print("=" * 60)

# English examples
print("ENGLISH EXAMPLES:")
print("-" * 30)
pred_en1, conf_en1 = predict_heading_level("Chapter Introduction", "This chapter covers the basic concepts")
print(f"EN Title: 'Chapter Introduction'")
print(f"EN Text: 'This chapter covers the basic concepts'")
print(f"Predicted: {pred_en1}, Confidence: {max(conf_en1.values()):.3f}")
print()

# Spanish examples
print("SPANISH EXAMPLES:")
print("-" * 30)
pred_es1, conf_es1 = predict_heading_level("Introducción al Capítulo", "Este capítulo cubre los conceptos básicos")
print(f"ES Title: 'Introducción al Capítulo'")
print(f"ES Text: 'Este capítulo cubre los conceptos básicos'")
print(f"Predicted: {pred_es1}, Confidence: {max(conf_es1.values()):.3f}")
print()

# French examples
print("FRENCH EXAMPLES:")
print("-" * 30)
pred_fr1, conf_fr1 = predict_heading_level("Introduction du Chapitre", "Ce chapitre couvre les concepts de base")
print(f"FR Title: 'Introduction du Chapitre'")
print(f"FR Text: 'Ce chapitre couvre les concepts de base'")
print(f"Predicted: {pred_fr1}, Confidence: {max(conf_fr1.values()):.3f}")
print()

# German examples
print("GERMAN EXAMPLES:")
print("-" * 30)
pred_de1, conf_de1 = predict_heading_level("Kapitel Einführung", "Dieses Kapitel behandelt die Grundkonzepte")
print(f"DE Title: 'Kapitel Einführung'")
print(f"DE Text: 'Dieses Kapitel behandelt die Grundkonzepte'")
print(f"Predicted: {pred_de1}, Confidence: {max(conf_de1.values()):.3f}")
print()

# Test with accented characters
print("ACCENTED CHARACTERS:")
print("-" * 30)
pred_acc, conf_acc = predict_heading_level("Configuración Avanzada", "Configuración detallada de parámetros específicos")
print(f"Accented Title: 'Configuración Avanzada'")
print(f"Accented Text: 'Configuración detallada de parámetros específicos'")
print(f"Predicted: {pred_acc}, Confidence: {max(conf_acc.values()):.3f}")
print()

print("TEST:")
print("-" * 30)
pred_acc2, conf_acc2 = predict_heading_level(" ", "What Colleges Say!")
print(f"Accented Title: ''")
print(f"Accented Text: 'What Colleges Say!'")
print(f"Predicted: {pred_acc2}, Confidence: {max(conf_acc2.values()):.3f}")
print()

# Test with mixed languages
print("MIXED LANGUAGE:")
print("-" * 30)
pred_mix, conf_mix = predict_heading_level("API Documentation", "Documentation complète pour l'API REST")
print(f"Mixed Title: 'API Documentation'")
print(f"Mixed Text: 'Documentation complète pour l'API REST'")
print(f"Predicted: {pred_mix}, Confidence: {max(conf_mix.values()):.3f}")
print()

# Japanese examples
print("JAPANESE EXAMPLES:")
print("-" * 30)
pred_jp, conf_jp = predict_heading_level("章の紹介", "この章では基本的な概念について説明します")
print(f"JP Title: '章の紹介'")
print(f"JP Text: 'この章では基本的な概念について説明します'")
print(f"Predicted: {pred_jp}, Confidence: {max(conf_jp.values()):.3f}")


Testing multilingual capabilities:
ENGLISH EXAMPLES:
------------------------------
EN Title: 'Chapter Introduction'
EN Text: 'This chapter covers the basic concepts'
Predicted: H2, Confidence: 0.670

SPANISH EXAMPLES:
------------------------------
ES Title: 'Introducción al Capítulo'
ES Text: 'Este capítulo cubre los conceptos básicos'
Predicted: H2, Confidence: 0.666

FRENCH EXAMPLES:
------------------------------
FR Title: 'Introduction du Chapitre'
FR Text: 'Ce chapitre couvre les concepts de base'
Predicted: H2, Confidence: 0.683

GERMAN EXAMPLES:
------------------------------
DE Title: 'Kapitel Einführung'
DE Text: 'Dieses Kapitel behandelt die Grundkonzepte'
Predicted: H2, Confidence: 0.652

ACCENTED CHARACTERS:
------------------------------
Accented Title: 'Configuración Avanzada'
Accented Text: 'Configuración detallada de parámetros específicos'
Predicted: H2, Confidence: 0.637

TEST:
------------------------------
Accented Title: ''
Accented Text: 'What Colleges Say!'
Pre