### Process Out

In [1]:
import pandas as pd

In [2]:
# Generic extracted column labels -> descriptive names used by the rest of the notebook.
new_column_names = {
    'col1': 'sr_no',
    'col2': 'current_gp_name',
    'col3': 'old_gp_villages',
    'col4': 'new_or_reconstituted_gp_name',
    'col5': 'new_gp_villages'
}

# Read only the two provenance columns plus the five table columns being renamed,
# then apply the descriptive names in the same expression (no in-place mutation).
df = pd.read_csv(
    "gp_delim_orig.csv",
    usecols=["original_filename", "page", *new_column_names],
).rename(columns=new_column_names)

In [3]:
def split_column(df, col_name):
    """Expand newline-separated cells of ``col_name`` into one row per entry.

    Each string cell in ``col_name`` is split on newline characters, and every
    resulting piece (including the first) becomes its own output row. The other
    columns are copied unchanged from the source row. For each piece, only the
    text after the first ``'. '`` is kept (dropping a leading list number such as
    ``"1. "``), and surrounding whitespace is stripped; pieces without ``'. '``
    are kept whole.

    Rows whose ``col_name`` cell is not a string (e.g. NaN) produce no output
    rows, so they are absent from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col_name : str
        Name of the column whose cells are split.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns as ``df``. Each output row keeps the
        index label of the row it came from, so labels repeat when a cell
        held several entries.
    """
    expanded = []

    for _, record in df.iterrows():
        cell = record[col_name]

        # Non-string cells (such as NaN) contribute nothing to the output.
        if not isinstance(cell, str):
            continue

        for piece in cell.split('\n'):
            clone = record.copy()
            # Keep what follows the first '. ' (or the whole piece if absent), trimmed.
            clone[col_name] = piece.split('. ', 1)[-1].strip()
            expanded.append(clone)

    return pd.DataFrame(expanded, columns=df.columns)

In [4]:
# Expand the 'old_gp_villages' column (originally 'col3') so that each
# newline-separated entry gets a row of its own.
new_df_col3 = split_column(df, col_name='old_gp_villages')

## Clean up

In [5]:
# Remove rows that are exact duplicates across all columns, keeping the first
# occurrence. Done in place so `new_df_col3` remains the same frame object.
new_df_col3.drop_duplicates(subset=None, keep='first', inplace=True)

In [6]:
# Text to blank out of each column (presumably the table's header labels that were
# read in as data — TODO confirm against the raw CSV).
# NOTE(review): some literals use the ASCII digit '0' and others the Devanagari
# digit '०'; presumably this mirrors the extracted text — confirm before normalising.
HEADER_TEXT_BY_COLUMN = {
    'sr_no': 'क्र.सं.',
    'old_gp_villages': 'वर्तमान ग्रा ० प ० में',
    'new_gp_villages': 'नवसृजित ग्रा 0 प 0 में',
    'current_gp_name': 'वर्तमान\nग्रा 0 प 0 का\nनाम',
    'new_or_reconstituted_gp_name': 'पुनर्गठित /\nपुनर्सीमांकित / नवसृजित\nग्रा ० प ० का नाम',
}

for column, header_text in HEADER_TEXT_BY_COLUMN.items():
    # regex=False: treat the text literally. The default for `regex` differs between
    # pandas 1.x and 2.x, and 'क्र.सं.' contains '.', which would otherwise match any
    # character when interpreted as a regular expression.
    new_df_col3[column] = new_df_col3[column].str.replace(header_text, '', regex=False)

# Write the cleaned table without the (repeated) index labels.
new_df_col3.to_csv("2019_delim_processed.csv", index = False)