In [102]:
import pandas as pd

In [103]:
filepath = r"C:\Users\ajati\OneDrive\Desktop\Mission\Projects\AI-Lawyer\data\dataset\The_Constitution_Of_India.csv"
df = pd.read_csv(filepath)
df

Unnamed: 0,Part_Number,Part_Title,Article_Number,Article_Text
0,Part I,Union & Its Territory,1,"Name and territory of the Union\n(1) India, th..."
1,Part I,Union & Its Territory,2,Admission or establishment of new States: Parl...
2,Part I,Union & Its Territory,2A,Sikkim to be associated with the Union Rep by ...
3,Part I,Union & Its Territory,3,Formation of new States and alteration of area...
4,Part I,Union & Its Territory,4,Laws made under Articles 2 and 3 to provide fo...
...,...,...,...,...
450,Part XXI,"Temporary, Transitional and Special Provisions",378A,Special provision as to duration of Andhra Pra...
451,Part XXI,"Temporary, Transitional and Special Provisions",392,Power of the President to remove difficulties\...
452,Part XXII,"Short title, Commencement, Authoritative Text in",393,This Constitution may be called the Constituti...
453,Part XXII,"Short title, Commencement, Authoritative Text in",394,"Commencement This article and Articles 5, 6, 7..."


In [97]:
df = pd.read_csv(r"C:\Users\ajati\OneDrive\Desktop\Mission\Projects\AI-Lawyer\data\dataset\Constitution Of India.csv")
df

Unnamed: 0,Articles
0,"1. Name and territory of the Union\n(1) India,..."
1,1 The territories of the States; the Union ter...
2,2. Admission or establishment of new States: P...
3,2A. Sikkim to be associated with the Union Rep...
4,3. Formation of new States and alteration of a...
...,...
451,378A. Special provision as to duration of Andh...
452,392. Power of the President to remove difficul...
453,393 Short title This Constitution may be calle...
454,"394. Commencement This article and Articles 5,..."


In [98]:
df.iloc[10:20]

Unnamed: 0,Articles
10,9. Person voluntarily acquiring citizenship of...
11,10. Continuance of the rights of citizenship E...
12,11 Parliament to regulate the right of citizen...
13,"12. Definition In this part, unless the contex..."
14,13. Laws inconsistent with or in derogation of...
15,14. Equality before law The State shall not de...
16,15. Prohibition of discrimination on grounds o...
17,16. Equality of opportunity in matters of publ...
18,17 Abolition of Untouchability Untouchability ...
19,"18. Abolition of titles No title, not being a ..."


In [99]:
import re
import pandas as pd

def merge_rows_with_articles(df, start_n, col_name):
    """
    df:        DataFrame with the text column
    start_n:   starting article number (int)
    col_name:  name of the text column (str)
    """
    n = start_n
    merged_rows = []
    buffer = ""

    for text in df[col_name].astype(str):
        # Build patterns for current n and previous (n-1)
        # Allow one or two trailing capital letters before the dot, e.g. 2A., 243ZH.
        pattern_main = re.compile(rf"^{n}\. |^{n}[A-Z]{{1,2}}\. ")
        if n > 1:
            pattern_prev_letter = re.compile(rf"^{n-1}[A-Z]{{1,2}}\. ")
        else:
            pattern_prev_letter = None

        if pattern_main.match(text):
            # Rule 1: matches "{n}. " or "{n}{CapitalLetter(s)}. "
            if buffer:
                merged_rows.append(buffer.strip())
            buffer = text
            n += 1  # increment n

        elif pattern_prev_letter is not None and pattern_prev_letter.match(text):
            # Rule 2: matches "{n-1}{CapitalLetter(s)}. "
            if buffer:
                merged_rows.append(buffer.strip())
            buffer = text
            # do not increment n

        else:
            # Rule 3: merge with previous row
            if buffer:
                buffer += " " + text
            else:
                buffer = text  # first row fallback

    if buffer:
        merged_rows.append(buffer.strip())

    return pd.DataFrame({col_name: merged_rows})

In [100]:
df.columns

Index(['Articles                                                                                         '], dtype='object')

In [101]:
df_merged = merge_rows_with_articles(df, 1, 'Articles                                                                                         ')
df_merged

Unnamed: 0,Articles
0,"1. Name and territory of the Union\n(1) India,..."
1,2. Admission or establishment of new States: P...
2,2A. Sikkim to be associated with the Union Rep...
3,3. Formation of new States and alteration of a...
4,4. Laws made under Articles 2 and 3 to provide...
5,5. Citizenship at the commencement of the Cons...
6,6. Rights of citizenship of certain persons wh...
7,7. Rights of citizenship of certain migrants t...
8,8. Rights of citizenship of certain persons of...
9,9. Person voluntarily acquiring citizenship of...


In [80]:
def split_article_column(df, col_name):
    """
    Splits the given column on the first occurrence of '. '.
    Left part → Article_Index, right part → Article.
    """
    left_right = df[col_name].astype(str).str.split('. ', n=1, expand=True)

    df_out = df.copy()
    df_out['Article_Index'] = left_right[0]
    df_out['Article'] = left_right[1]  # will be NaN if '. ' not found
    df_out.drop(columns=[col_name], inplace=True)

    return df_out

In [86]:
df_merged

Unnamed: 0,Articles
0,"1. Name and territory of the Union\n(1) India,..."
1,2. Admission or establishment of new States: P...
2,2A. Sikkim to be associated with the Union Rep...
3,3. Formation of new States and alteration of a...
4,4. Laws made under Articles 2 and 3 to provide...
5,5. Citizenship at the commencement of the Cons...
6,6. Rights of citizenship of certain persons wh...
7,7. Rights of citizenship of certain migrants t...
8,8. Rights of citizenship of certain persons of...
9,9. Person voluntarily acquiring citizenship of...


In [83]:
df_split = split_article_column(df_merged, "Articles")
df_split

KeyError: 'Articles'

In [71]:
import pandas as pd

def analyze_article_index(df, col_name='Article_Index'):
    """
    Classifies Article_Index values into:
      - pure numbers            (e.g. '2')
      - composite like 2A, 243ZH (digits + one or two capital letters)
      - other                   (neither of the above)
    Prints counts and lists, and returns them in a dict.
    """
    s = df[col_name].astype(str).str.strip()

    # Pure numbers: only digits
    is_pure = s.str.fullmatch(r'\d+')

    # Composite: digits followed by ONE OR TWO capital letters (e.g. 2A, 243ZH)
    is_composite = s.str.fullmatch(r'\d+[A-Z]{1,2}')

    # Other: not pure and not composite
    is_other = ~(is_pure | is_composite)

    pure_list = sorted(s[is_pure].unique(), key=lambda x: int(x))
    composite_list = sorted(s[is_composite].unique())
    other_list = sorted(s[is_other].unique())

    print(f"Pure numeric Article_Index count: {len(pure_list)}")
    print("Pure numeric Article_Index values:", pure_list)
    print()
    print(f"Composite (like 2A, 243ZH) Article_Index count: {len(composite_list)}")
    print("Composite Article_Index values:", composite_list)
    print()
    print(f"Other Article_Index count: {len(other_list)}")
    print("Other Article_Index values:", other_list)

    return {
        "pure_count": len(pure_list),
        "pure_values": pure_list,
        "composite_count": len(composite_list),
        "composite_values": composite_list,
        "other_count": len(other_list),
        "other_values": other_list,
    }

In [72]:
_res = analyze_article_index(df_split)

Pure numeric Article_Index count: 377
Pure numeric Article_Index values: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149

In [55]:
idx_df = pd.read_csv(r"C:\Users\ajati\OneDrive\Desktop\Mission\Projects\AI-Lawyer\data\dataset\Index.csv", encoding="latin1")
idx_df

Unnamed: 0,Parts of the Indian Constitution,Subject Mentioned in the Part,Articles in Indian Constitution
0,Part I,Union & Its Territory,Article 1-4
1,Part II,Citizenship,Article 5-11
2,Part III,Fundamental Rights,Article 12-35
3,Part IV,Directive Principles,Article 36-51
4,Part IV A,Fundamental Duties,Article 51A
5,Part V,The Union,Article 52-151
6,Part VI,The States,Article 152-237
7,Part VII,"Note: 7th Amendment Act, 1956 repealed Part 7",
8,Part VIII,The Union Territories,Article 239-242
9,Part IX,The Panchayats,Article 243-243O


In [62]:
def preprocess_idx(idx_df, col="Articles in Indian Constitution"):
    """
    Parses 'Article a-b' or 'Article x' into Starting_Article / Ending_Article.
    Uses nullable Int64 so NaNs are allowed.
    """
    s = idx_df[col].astype(str).str.strip()

    m = s.str.extract(
        r'Article\s+(\d+)[A-Z]*\s*(?:[-–]\s*(\d+)[A-Z]*)?',
        expand=True
    )

    out = idx_df.copy()

    # Convert to numeric, allowing NaNs
    start = pd.to_numeric(m[0], errors="coerce")
    end = pd.to_numeric(m[1], errors="coerce")

    out["Starting_Article"] = start.astype("Int64")
    out["Ending_Article"] = end.fillna(start).astype("Int64")

    return out

def attach_idx_to_articles(articles_df, idx_df, idx_col="Articles in Indian Constitution"):
    """
    For each row in articles_df (with column 'Article_Index'),
    attaches rows from idx_df whose [Starting_Article, Ending_Article]
    range contains the base numeric part of Article_Index.

    Example: range 242–245 will match 242, 243, 243A, 243B, 244, 245, etc.
    """
    # Prepare idx_df with numeric ranges
    idx_proc = preprocess_idx(idx_df, col=idx_col)

    # Extract base numeric part from Article_Index, e.g. '243A' -> 243
    art = articles_df.copy()
    art["__base_num"] = (
        art["Article_Index"]
        .astype(str)
        .str.extract(r'(\d+)', expand=False)
        .astype(int)
    )

    # Cross-join and then filter by range
    art["_key"] = 1
    idx_proc["_key"] = 1

    merged = art.merge(idx_proc, on="_key", how="left")

    in_range = (
        (merged["__base_num"] >= merged["Starting_Article"]) &
        (merged["__base_num"] <= merged["Ending_Article"])
    )
    merged = merged[in_range].drop(columns=["_key", "__base_num"])

    return merged

In [63]:
idx_proc = preprocess_idx(idx_df, "Articles in Indian Constitution")
idx_proc

Unnamed: 0,Parts of the Indian Constitution,Subject Mentioned in the Part,Articles in Indian Constitution,Starting_Article,Ending_Article
0,Part I,Union & Its Territory,Article 1-4,1.0,4.0
1,Part II,Citizenship,Article 5-11,5.0,11.0
2,Part III,Fundamental Rights,Article 12-35,12.0,35.0
3,Part IV,Directive Principles,Article 36-51,36.0,51.0
4,Part IV A,Fundamental Duties,Article 51A,51.0,51.0
5,Part V,The Union,Article 52-151,52.0,151.0
6,Part VI,The States,Article 152-237,152.0,237.0
7,Part VII,"Note: 7th Amendment Act, 1956 repealed Part 7",,,
8,Part VIII,The Union Territories,Article 239-242,239.0,242.0
9,Part IX,The Panchayats,Article 243-243O,243.0,243.0


In [None]:
df_with_idx = attach_idx_to_articles(df_split, idx_df, "Articles in Indian Constitution")
df_with_idx