In [1]:
import requests, os
from dotenv import load_dotenv
import polars as pl

In [2]:
load_dotenv()

True

In [9]:
input_folder_path = 'data/study_plan'
output_folder_path = 'data/output'

# Get the SMART_PARSER_ENDPOINT from environment variables
smart_parser_endpoint = os.getenv('SMART_PARSER_ENDPOINT')

# Check if the endpoint is set
if not smart_parser_endpoint:
    raise ValueError("SMART_PARSER_ENDPOINT is not set in the environment variables.")

In [20]:
# List all files in the specified folder
files = os.listdir(input_folder_path)
# Filter out directories, only keep files
files = [os.path.join(input_folder_path, f) for f in files if os.path.isfile(os.path.join(input_folder_path, f))]

files

['data/study_plan/StudyPlan_Edoardo_Sezzi.pdf']

In [21]:
for file in files:
    # Prepare the payload
    with open(file, 'rb') as pdf_file:
        study_plan_file = {'file': pdf_file}  # Key is 'file'

        # Make the POST request
        response = requests.post(smart_parser_endpoint + 'parse', files=study_plan_file)

        # Check the response status
        if response.status_code == 200:
            print("Request was successful.")
            print("Response JSON:", response.json())  # Print the response JSON if applicable
        else:
            print(f"Request failed with status code: {response.status_code}")
            print("Response text:", response.text)  # Print the response text for debugging

Request was successful.
Response JSON: {'stp-llm-version': '576ac470709e376ab8511158b50944dd28889ddb', 'timestamp': '2024-12-09_17:05:19', 'filename': 'StudyPlan_Edoardo_Sezzi.pdf', 'document_tables': [{'degree_course': [], 'academic_year': [], 'semester': [], 'data': ['01/02/2013', '04/02/2013', '27/06/2013', '01/07/2013', '04/07/2013', '21/10/2013', '24/10/2013'], 'code': ['20136', '20199', '20142', '20203', '20269', '20297', '20282'], 'ssd': ['SECS-S/06 (8)', 'SECS-P/07 (12)', 'IUS/04 (6)', 'SECS-P/05 (8)', 'SECS-P/02 (6)', 'SECS-P/01 (6)', 'SECS-P/12 (6)'], 'cfu': ['8', '12', '6', '8', '6', '6', '6'], 'taf': ['B', 'B', 'B', 'B', 'B', 'B', 'C'], 'exam': ["MATEMATICA AVANZATA PER L'ECONOMIA E LE SCIENZE SOCIALI (DIDATTICA IN ITALIANO)", "ISTITUZIONI, GOVERNO E SOCIETA' (DIDATTICA IN ITALIANO)", "DIRITTO DELL'IMPRESA E DEL MERCATO (DIDATTICA IN ITALIANO)", 'ANALISI ECONOMETRICA (DIDATTICA IN ITALIANO)', 'ECONOMICS OF EUROPEAN INTEGRATION (DIDATTICA IN INGLESE)', 'MONETARY POLICY (DIDA

In [29]:
tables = response.json()['document_tables']
tables

[{'degree_course': [],
  'academic_year': [],
  'semester': [],
  'data': ['01/02/2013',
   '04/02/2013',
   '27/06/2013',
   '01/07/2013',
   '04/07/2013',
   '21/10/2013',
   '24/10/2013'],
  'code': ['20136', '20199', '20142', '20203', '20269', '20297', '20282'],
  'ssd': ['SECS-S/06 (8)',
   'SECS-P/07 (12)',
   'IUS/04 (6)',
   'SECS-P/05 (8)',
   'SECS-P/02 (6)',
   'SECS-P/01 (6)',
   'SECS-P/12 (6)'],
  'cfu': ['8', '12', '6', '8', '6', '6', '6'],
  'taf': ['B', 'B', 'B', 'B', 'B', 'B', 'C'],
  'exam': ["MATEMATICA AVANZATA PER L'ECONOMIA E LE SCIENZE SOCIALI (DIDATTICA IN ITALIANO)",
   "ISTITUZIONI, GOVERNO E SOCIETA' (DIDATTICA IN ITALIANO)",
   "DIRITTO DELL'IMPRESA E DEL MERCATO (DIDATTICA IN ITALIANO)",
   'ANALISI ECONOMETRICA (DIDATTICA IN ITALIANO)',
   'ECONOMICS OF EUROPEAN INTEGRATION (DIDATTICA IN INGLESE)',
   'MONETARY POLICY (DIDATTICA IN INGLESE)',
   'STORIA, ISTITUZIONI E CRISI DEL SISTEMA'],
  'grade': ['VENTISEI',
   'VENTISETTE',
   'VENTICINQUE',
   'VENT

In [33]:
dates = []
codes = []
ssds = []
cfus = []
tags = []
exams = []
grades = []

for table in tables:

    dates.append([table.get('data')])
    codes.append(table.get('code'))
    ssds.append(table.get('ssd'))
    cfus.append(table.get('cfu'))
    tags.append(table.get('tag'))
    exams.append(table.get('exam'))
    grades.append(table.get('grade'))


In [34]:
dates

[['0',
  '1',
  '/',
  '0',
  '2',
  '/',
  '2',
  '0',
  '1',
  '3',
  '0',
  '4',
  '/',
  '0',
  '2',
  '/',
  '2',
  '0',
  '1',
  '3',
  '2',
  '7',
  '/',
  '0',
  '6',
  '/',
  '2',
  '0',
  '1',
  '3',
  '0',
  '1',
  '/',
  '0',
  '7',
  '/',
  '2',
  '0',
  '1',
  '3',
  '0',
  '4',
  '/',
  '0',
  '7',
  '/',
  '2',
  '0',
  '1',
  '3',
  '2',
  '1',
  '/',
  '1',
  '0',
  '/',
  '2',
  '0',
  '1',
  '3',
  '2',
  '4',
  '/',
  '1',
  '0',
  '/',
  '2',
  '0',
  '1',
  '3'],
 ['1',
  '7',
  '/',
  '0',
  '9',
  '/',
  '2',
  '0',
  '1',
  '4',
  '1',
  '7',
  '/',
  '0',
  '9',
  '/',
  '2',
  '0',
  '1',
  '4',
  '1',
  '7',
  '/',
  '0',
  '9',
  '/',
  '2',
  '0',
  '1',
  '4',
  '2',
  '3',
  '/',
  '1',
  '0',
  '/',
  '2',
  '0',
  '1',
  '4',
  '0',
  '3',
  '/',
  '1',
  '1',
  '/',
  '2',
  '0',
  '1',
  '4',
  '2',
  '0',
  '/',
  '0',
  '1',
  '/',
  '2',
  '0',
  '1',
  '5',
  '0',
  '5',
  '/',
  '0',
  '2',
  '/',
  '2',
  '0',
  '1',
  '5'],
 []]