# How to use this notebook

**Before running this notebook:**
<br>
Set up the following file structure (namely, the `raw/`, `processed/`, and `archive/` folders within `data/`). Populate `raw/` with the data that we have been given. 

```
data/
├── raw/
│   ├── 2023 District 18 Scores.json
│   ├── 2023 District 18 Student Attributes.json
│   ├── 2023 District 18 Vendor Student Usage.json
│   ├── 2024-09-11 District A Vendor Student Usage.json
│   ├── 2024-09-11_District_A_Benchmarks.json
│   ├── 2024-09-11_District_A_Scores.json
│   ├── benchmarksByMeasurement_202410141332.json
│   ├── district.json
│   ├── District 18 Anonymized Schools.json
│   ├── District 18 Vendor Usage Types.json
│   ├── vendorProducts_202409111049.csv
│   └── vendors_202410081500.json
├── processed/
└── archive/
README.md
```

**When you run this notebook, it will...**
- Rename the raw files to follow a consistent format
- Pull out each individual table 
- Save each table as a `.csv` in `data/processed/`

All names will be formatted as follows: `[district]_[table]_[date].[extension]`
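- e.g. `a_all_2024-08-29.json` (a renamed raw file in `data/raw/`)
- e.g. `45_scores_2024-09-11.csv` (a processed table in `data/processed/`; note that District A tables are saved with the `45` prefix)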

**After running this notebook:**
<br>
You can import tables individually by using `pd.read_csv()`.


In [11]:
import os
import json
import shutil
import pandas as pd

In [12]:
# Resolve the data directory: go up one level from the current working
# directory, then down into the 'predicting-proficiency' folder's data/ dir.
repo_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
data_path = os.path.join(repo_root, 'predicting-proficiency', 'data')

## Renaming files

In [13]:
# (name as delivered, standardized name) for every raw file.
# Standardized names follow `[district]_[table]_[date].[extension]`.
file_names = [
    # District 45 / A
    ('2024-09-11 District A Benchmarks.json', 'a_benchmarks_2024-09-11.json'),
    ('2024-09-11 District A Scores.json', 'a_scores_2024-09-11.json'),
    # The setup instructions at the top of this notebook list these two files
    # with underscores instead of spaces, so accept that spelling as well.
    ('2024-09-11_District_A_Benchmarks.json', 'a_benchmarks_2024-09-11.json'),
    ('2024-09-11_District_A_Scores.json', 'a_scores_2024-09-11.json'),
    ('2024-09-11 District A Vendor Student Usage.json', 'a_vendorUsage_2024-09-11.json'),
    ('district.json', 'a_all_2024-08-29.json'),
    ('vendorProducts_202409111049.csv', 'a_vendorKey_2024-09-11.csv'),
    # NOTE(review): the source file is .json but the target name ends in .csv.
    # The name is kept as-is so existing paths keep working -- confirm whether
    # the target should be 'a_vendorKey_2024-10-08.json'.
    ('vendors_202410081500.json', 'a_vendorKey_2024-10-08.csv'),

    # District 18
    ('2023 District 18 Scores.json', '18_scores_2024-10-08.json'),
    ('2023 District 18 Student Attributes.json', '18_studentAttributes_2024-10-08.json'),
    ('2023 District 18 Vendor Student Usage.json', '18_vendorUsage_2024-10-08.json'),
    ('District 18 Anonymized Schools.json', '18_schools_2024-10-08.json'),
    ('District 18 Vendor Usage Types.json', '18_vendorUsageTypes_2024-10-08.json'),
    ('benchmarksByMeasurement_202410141332.json', '18_benchmarks_2024-10-08.json'),
]

raw_dir = os.path.join(data_path, 'raw')

for old_name, new_name in file_names:
    old_path = os.path.join(raw_dir, old_name)
    new_path = os.path.join(raw_dir, new_name)
    # Rename only when the source exists and the target does not, so the cell
    # can be re-run safely and never overwrites an already-renamed file.
    if os.path.exists(old_path) and not os.path.exists(new_path):
        os.rename(old_path, new_path)

# Report standardized files that are still absent instead of skipping them
# silently; otherwise the problem only shows up later when the file is opened.
missing = sorted({
    new_name for _, new_name in file_names
    if not os.path.exists(os.path.join(raw_dir, new_name))
})
if missing:
    print('Warning: expected raw files not found in {}: {}'.format(raw_dir, ', '.join(missing)))

# To move the 2024-09-11 vendor key into archive/, uncomment:
# shutil.move(os.path.join(raw_dir, 'a_vendorKey_2024-09-11.csv'),
#             os.path.join(data_path, 'archive', 'a_vendorKey_2024-09-11.csv'))

In [14]:
# District 45 / A
# Load the renamed raw JSON exports. Paths are built from `data_path` (the same
# root the rename cell uses) rather than being relative to the working
# directory, so both cells always look in the same raw/ folder.
raw_dir = os.path.join(data_path, 'raw')

# Combined district export; the 'schools', 'courseSections' and
# 'courseSectionRosters' tables are pulled out of it further down.
with open(os.path.join(raw_dir, 'a_all_2024-08-29.json'), 'r', encoding='utf-8') as file:
    data_45 = json.load(file)

with open(os.path.join(raw_dir, 'a_scores_2024-09-11.json'), 'r', encoding='utf-8') as file:
    scores_45_json = json.load(file)

with open(os.path.join(raw_dir, 'a_benchmarks_2024-09-11.json'), 'r', encoding='utf-8') as file:
    benchmarks_45_json = json.load(file)

with open(os.path.join(raw_dir, 'a_vendorUsage_2024-09-11.json'), 'r', encoding='utf-8') as file:
    vendorUsage_45_json = json.load(file)

In [16]:
# District 18
# Load the renamed raw JSON exports. Paths are built from `data_path` (the same
# root the rename cell uses) rather than being relative to the working
# directory, so both cells always look in the same raw/ folder.
raw_dir = os.path.join(data_path, 'raw')

with open(os.path.join(raw_dir, '18_scores_2024-10-08.json'), 'r', encoding='utf-8') as file:
    scores_18_json = json.load(file)

with open(os.path.join(raw_dir, '18_schools_2024-10-08.json'), 'r', encoding='utf-8') as file:
    schools_18_json = json.load(file)

with open(os.path.join(raw_dir, '18_studentAttributes_2024-10-08.json'), 'r', encoding='utf-8') as file:
    studentAttributes_18_json = json.load(file)

with open(os.path.join(raw_dir, '18_vendorUsage_2024-10-08.json'), 'r', encoding='utf-8') as file:
    vendorUsage_18_json = json.load(file)

with open(os.path.join(raw_dir, '18_vendorUsageTypes_2024-10-08.json'), 'r', encoding='utf-8') as file:
    vendorUsageTypes_18_json = json.load(file)

with open(os.path.join(raw_dir, '18_benchmarks_2024-10-08.json'), 'r', encoding='utf-8') as file:
    benchmarks_18_json = json.load(file)

## Saving .csvs

In [5]:
# District 45 / A
# Turn each loaded JSON payload into a DataFrame.

# The scores export is a dict; its table is the value stored under the first key.
scores_45_data = list(scores_45_json.values())[0]
scores_45_df = pd.DataFrame(scores_45_data)

# Benchmarks sit under an explicit 'benchmarks' key; the remaining three tables
# are read from the combined district export.
benchmarks_45_df = pd.DataFrame(benchmarks_45_json['benchmarks'])
schools_45_df = pd.DataFrame(data_45['schools'])
courseSections_45_df = pd.DataFrame(data_45['courseSections'])
courseSectionRosters_45_df = pd.DataFrame(data_45['courseSectionRosters'])

# Vendor usage has the same layout as scores: take the first key's value.
vendorUsage_45_data = list(vendorUsage_45_json.values())[0]
vendorUsage_45_df = pd.DataFrame(vendorUsage_45_data)

# TODO: the vendor key table (raw file a_vendorKey_2024-10-08) is not loaded
# or exported yet.

FileNotFoundError: [Errno 2] No such file or directory: 'data/raw/a_vendorKey_2024-10-08.csv'

In [10]:
# District 18
# Every District 18 export is a dict; each table is the value stored under
# the dict's first key. Wrap each of those tables in a DataFrame.

scores_18_data = list(scores_18_json.values())[0]
scores_18_df = pd.DataFrame(scores_18_data)

schools_18_data = list(schools_18_json.values())[0]
schools_18_df = pd.DataFrame(schools_18_data)

studentAttributes_18_data = list(studentAttributes_18_json.values())[0]
studentAttributes_18_df = pd.DataFrame(studentAttributes_18_data)

vendorUsage_18_data = list(vendorUsage_18_json.values())[0]
vendorUsage_18_df = pd.DataFrame(vendorUsage_18_data)

vendorUsageTypes_18_data = list(vendorUsageTypes_18_json.values())[0]
vendorUsageTypes_18_df = pd.DataFrame(vendorUsageTypes_18_data)

benchmarks_18_data = list(benchmarks_18_json.values())[0]
benchmarks_18_df = pd.DataFrame(benchmarks_18_data)


NameError: name 'scores_18_json' is not defined

In [None]:
# Write every table to data/processed/ as a CSV (no index column).
# `data_path` comes from the configuration cell near the top of the notebook.
processed_dir = os.path.join(data_path, 'processed')

# Resolve every DataFrame *before* touching the disk: if an earlier cell was
# not run, the NameError is raised here and processed/ is left untouched.
# NOTE(review): District A tables are written with a '45' prefix although the
# raw files use 'a', and the District 18 benchmarks file is dated 2024-10-15
# while its raw counterpart is dated 2024-10-08. File names are kept unchanged
# so downstream readers keep working -- confirm the intended convention.
exports = {
    # District 45 / A
    '45_benchmarks_2024-09-11.csv': benchmarks_45_df,
    '45_schools_2024-08-29.csv': schools_45_df,
    '45_courseSections_2024-08-29.csv': courseSections_45_df,
    '45_courseSectionRosters_2024-08-29.csv': courseSectionRosters_45_df,
    '45_scores_2024-09-11.csv': scores_45_df,
    '45_vendorUsage_2024-09-11.csv': vendorUsage_45_df,
    # District 18
    '18_scores_2024-10-08.csv': scores_18_df,
    '18_schools_2024-10-08.csv': schools_18_df,
    '18_studentAttributes_2024-10-08.csv': studentAttributes_18_df,
    '18_vendorUsage_2024-10-08.csv': vendorUsage_18_df,
    '18_vendorUsageTypes_2024-10-08.csv': vendorUsageTypes_18_df,
    '18_benchmarks_2024-10-15.csv': benchmarks_18_df,
}

# Make sure the output folder itself exists (not just the parent of data/).
os.makedirs(processed_dir, exist_ok=True)

# Delete all files in processed/ so stale exports from earlier runs do not
# linger; sub-directories are left alone.
for file in os.listdir(processed_dir):
    file_path = os.path.join(processed_dir, file)
    if os.path.isfile(file_path):
        os.unlink(file_path)

for csv_name, table in exports.items():
    table.to_csv(os.path.join(processed_dir, csv_name), index=False)