# JSON file preprocessing  
- GitLab Location : /ponsatcnam/malin/src/Elise/data/ex_classification/layoutlm_ex/json
- Content: the text extracted for each exercise
- Input format: JSON

## JSON Structure

In [None]:
# Import json
import json
import pprint

# Load one sample file. The context manager guarantees the handle is closed
# as soon as parsing is done, even if json.load raises.
with open('hachetteCE2_100_1.json', 'r', encoding='utf-8') as sample_file:
    data = json.load(sample_file)

print("\n")
print("---------------JSON file Top and Sub Keys---------------")
print("\n")
# Top level keys
pprint.pprint(list(data.keys()))
# Second level keys (fields of the 'exercise' object)
pprint.pprint(list(data['exercise'].keys()))
# Third level keys (fields of the first entry of the 'sentences' list)
pprint.pprint(list(data['exercise']['sentences'][0].keys()))
print("\n")
print("---------------Example of Extracting Exercise Data---------------")
print("\n")
# We want the text of the sentences whose category is 'consigne' and 'enonce'.
# NOTE: the positions 2 and 3 are hard-coded for this sample file only; they
# are not derived from the 'categorie' field, so other files may differ.
print(f"Consigne:\n{data['exercise']['sentences'][2]['text']}\n")
print(f"Enonce:\n{data['exercise']['sentences'][3]['text']}")



---------------JSON file Top and Sub Keys---------------


['exercise']
['id', 'manual', 'box', 'label', 'sentences']
['id', 'text', 'categorie', 'box', 'words']


---------------Example of Extracting Exercise Data---------------


Consigne:
Recopie et classe ces verbes selon leur groupe.

Enonce:
voir – aimer – obéir – prendre – apercevoir – applaudir – comprendre – danser – pouvoir – tourner – bâtir – habiter – apprendre – vouloir – guetter – vendre – choisir – sortir – descendre – dormir


Next, we iterate over every JSON file in the folder and collect the 'consigne' and 'enonce' text of each exercise into a single DataFrame.

In [None]:
import os
import json
import pandas as pd

In [None]:
# Directory, relative to the working directory, that is scanned for JSON files
folder = "./json"

In [None]:
# Accumulator for the extracted rows (one dict per exercise file)
df_rows = list()

In [None]:
def _extract_consigne_enonce(sentences):
    """Split the sentences of one exercise into its consigne and enonce texts.

    Args:
        sentences: iterable of dicts; each may carry a 'categorie' and a
            'text' key.

    Returns:
        A ``(consigne_text, enonce_texts)`` tuple. ``consigne_text`` is the
        text of the first sentence whose 'categorie' is 'consigne' ('' when
        there is none); ``enonce_texts`` lists, in order, the text of every
        sentence whose 'categorie' is 'enonce'. A missing or null 'text' is
        returned as ''.
    """
    consigne_text = None
    enonce_texts = []
    for sentence in sentences:
        categorie = sentence.get('categorie')
        if categorie == 'consigne':
            # Only the first 'consigne' is kept: each exercise file is
            # assumed to contain a single one.
            if consigne_text is None:
                consigne_text = sentence.get('text') or ''
        elif categorie == 'enonce':
            enonce_texts.append(sentence.get('text') or '')
    return consigne_text or '', enonce_texts


# Start from an empty accumulator so that re-running this cell does not
# append the same exercises a second time.
df_rows = []

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# (and therefore the saved CSV) reproducible from one run to the next.
for filename in sorted(os.listdir(folder)):
    # Only JSON files are exercise files
    if not filename.endswith('.json'):
        continue

    # Keep the file open only for the time needed to parse it
    with open(os.path.join(folder, filename), 'r', encoding='utf-8') as f:
        data = json.load(f)

    consigne_text, enonce_list = _extract_consigne_enonce(data['exercise']['sentences'])

    # A row is only useful when both parts of the exercise are present
    if consigne_text and enonce_list:
        df_rows.append({
            'filename': filename,
            'consigne': consigne_text,
            'enonce': enonce_list
        })
    else:
        print(f"Skipping {filename} because it doesn't contain 'consigne' or 'enonce'")

# Create the dataframe; naming the columns explicitly keeps the steps below
# valid even when no file produced a row.
df = pd.DataFrame(df_rows, columns=['filename', 'consigne', 'enonce'])

# Drop only the final extension, so dots inside the base name are preserved
df['filename'] = df['filename'].map(lambda name: os.path.splitext(name)[0])

# For multiline enonce in a list, separate them by '\n'
df['enonce'] = df['enonce'].map('\n'.join)

Skipping hachetteCE2_107_11.json because it doesn't contain 'consigne' or 'enonce'
Skipping hachetteCE2_133_13.json because it doesn't contain 'consigne' or 'enonce'
Skipping hachetteCE2_135_12.json because it doesn't contain 'consigne' or 'enonce'
Skipping hachetteCE2_13_14.json because it doesn't contain 'consigne' or 'enonce'
Skipping hachetteCE2_15_13.json because it doesn't contain 'consigne' or 'enonce'
Skipping hachetteCE2_15_14.json because it doesn't contain 'consigne' or 'enonce'
Skipping hachetteCE2_23_18.json because it doesn't contain 'consigne' or 'enonce'
Skipping hachetteCE2_81_14.json because it doesn't contain 'consigne' or 'enonce'
Skipping hachetteCE2_83_10.json because it doesn't contain 'consigne' or 'enonce'
Skipping hachetteCE2_85_19.json because it doesn't contain 'consigne' or 'enonce'
Skipping magnardCE1_117_3.json because it doesn't contain 'consigne' or 'enonce'
Skipping magnardCE1_123_9.json because it doesn't contain 'consigne' or 'enonce'
Skipping magnar

In [None]:
# Preview the first five extracted exercises
df.head(n=5)

Unnamed: 0,filename,consigne,enonce
0,hachetteCE2_100_1,Recopie et classe ces verbes selon leur groupe.,voir – aimer – obéir – prendre – apercevoir – ...
1,hachetteCE2_100_2,Recopie ce texte.,L’affolement règne dans les couloirs du musée....
2,hachetteCE2_100_4,Recopie ces verbes conjugués.,elles attrapent – tu remercies – vous transpor...
3,hachetteCE2_100_5,Relève tous les verbes conjugués de ce texte.,Violette marche sur la plage et ramasse des co...
4,hachetteCE2_100_6,Recopie ces verbes conjugués avec un pronom pe...,… glissais – … distribue – … lavez – … se repo...


In [None]:
# Write the exercise table to disk as CSV, without the positional index column
output_csv_path = './exos_to_csv.csv'
df.to_csv(output_csv_path, index=False)