# Update GRIT entries on the website

In [1]:
import pandas as pd
import re
import json
import os
from datetime import datetime
from calendar import month_name

## Read the exported JSON

In [7]:
# Load the exported entries into `current_data`.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding (JSON exchange files are UTF-8).
with open('content_grit.json', mode='r', encoding='utf-8') as file:
    current_data = json.load(file)

In [8]:
# Drop the first exported entry, then keep the new first entry in `item`
# so it can be inspected.
# NOTE(review): `current_data` is rebound to a slice of itself, so re-running
# this cell without re-loading the JSON drops one more entry each time.
current_data = current_data[1:] # remove sept (presumably the September entry - TODO confirm)
item = current_data[0]

In [16]:
# Display the entry currently held in `item`.
item

{'id': 350,
 'title': 'May 1979 GRIT',
 'slug': 'may-1979-grit',
 'is_enabled': 1,
 'published_at': '1979-05-01 00:00:00',
 'expired_at': None,
 'content_group': None,
 'grit_date': '1979-05-01 00:00:00',
 'file': 'grit/1979/SHHA-GRIT-1979_05.pdf',
 'image': 'grit/1979/SHHA-GRIT-1979_05.png',
 'content': 'In 1979, the Sandia Heights Security Patrol, backed by substantial resident support, extended its operational hours to roughly 65 hours per week, focusing primarily on weekend nights. Continued contributions from residents were encouraged to augment and ameliorate service. For enhanced child safety, motorized traffic was prohibited from the ballfield/playground area, and residents were asked to advocate for maintenance priorities for Trasway Blvd. with county officials. The Sandia Heights Homeowners Association scheduled an annual meeting on May 12, 1979, with discussions on topics like annual solicitations for contributions and playground issues, and voting for filling two board seat

In [13]:
# Build a minimal update record ({'id', 'image'}) for every existing entry.
#
# NOTE(review): `text_folder` is an absolute path on one machine and is only
# referenced by the disabled summary step below - make it configurable before
# re-enabling that step.
text_folder = '/Users/heidi/Documents/SHHA/GRIT/GRIT_archive_AI_summaries/'

updated_data = []
for item in current_data:
    pdf = item['file']  # e.g. 'grit/1979/SHHA-GRIT-1979_05.pdf'

    # The part after the last '-' looks like '<year>_<month>.pdf'.
    # `year` and `month` are kept as strings; they are only needed by the
    # disabled summary step and by later inspection cells.
    name_parts = pdf.split('-')[-1].split('_')
    year = name_parts[0]
    month = name_parts[1][0:2]

    # create new item with minimum info
    # The image sits next to the PDF with a '.png' extension. splitext swaps
    # the real extension instead of blindly cutting the last four characters.
    new_item = {
        'id': item['id'],
        'image': os.path.splitext(pdf)[0] + '.png',
    }

    # Disabled step: replace the entry's content with its summary text file,
    # if that file exists.
    # text_file = os.path.basename(item['file'])[:-4]+'_summary.txt'
    # text_file_path = os.path.join(text_folder,year,text_file)
    # if os.path.exists(text_file_path):
    #     text = open(text_file_path).read()
    #     item['content'] = text

    updated_data.append(new_item)

In [14]:
# Name the output after the current time (yymmddHHMM) and dump the records.
timestamp = datetime.now().strftime('%y%m%d%H%M')
filename = "content_grit_" + timestamp + ".json"
with open(filename, 'w') as out_file:
    json.dump(updated_data, out_file)

In [41]:
# Inspect `month` - presumably the value left over from the last iteration
# of the file-name parsing loop (debug aid).
month

'12'

In [12]:
# Inspect the summary file name (debug aid).
# NOTE(review): in the cells visible here `text_file` is only assigned by the
# disabled summary step of the loop cell, so this likely raises NameError on
# a fresh kernel - confirm whether this cell is still needed.
text_file

'SHHA-GRIT-1991_11_summary.txt'

In [15]:
# Inspect the summary text that was read (debug aid).
# NOTE(review): in the cells visible here `text` is only assigned by the
# disabled summary step of the loop cell, so this likely raises NameError on
# a fresh kernel - confirm whether this cell is still needed.
text

'In 1979, the Sandia Heights Security Patrol, backed by substantial resident support, extended its operational hours to roughly 65 hours per week, focusing primarily on weekend nights. Continued contributions from residents were encouraged to augment and ameliorate service. For enhanced child safety, motorized traffic was prohibited from the ballfield/playground area, and residents were asked to advocate for maintenance priorities for Trasway Blvd. with county officials. The Sandia Heights Homeowners Association scheduled an annual meeting on May 12, 1979, with discussions on topics like annual solicitations for contributions and playground issues, and voting for filling two board seats. The residents were informed about an upcoming annual cleanup day and requested for voluntary dues to help install a grass and watering system in the playground. The Sandia Heights Swim team announced the commencement of its season the following month under new coach Livy Parsons, a highly qualified pro

## Parse each entry and create an updated JSON

In [4]:
# update the existing entries

# Lower-cased month name -> zero-padded month number, e.g. 'may' -> '05'.
_MONTH_NUMBERS = {
    name.lower(): f"{number:02d}"
    for number, name in enumerate(month_name)
    if name  # month_name[0] is an empty placeholder
}


def extract_year_month(slug):
    """Return ``(year, month)`` as strings parsed from an entry slug.

    NOTE(review): no definition of this helper is visible elsewhere in the
    notebook, so it is defined here to let the cell run on a fresh kernel.
    It assumes slugs look like 'may-1979-grit' (a month name and a four-digit
    year separated by '-') - confirm against the original helper.

    Parameters
    ----------
    slug : str
        Entry slug, e.g. ``'may-1979-grit'``.

    Returns
    -------
    tuple of (str, str)
        Four-digit year and two-digit month, e.g. ``('1979', '05')``.

    Raises
    ------
    ValueError
        If no four-digit year or no month name is found in ``slug``.
    """
    tokens = slug.lower().split('-')
    year = next((t for t in tokens if re.fullmatch(r'\d{4}', t)), None)
    month = next((_MONTH_NUMBERS[t] for t in tokens if t in _MONTH_NUMBERS), None)
    if year is None or month is None:
        raise ValueError(f"cannot parse year/month from slug {slug!r}")
    return year, month


updated_data = []
existing_YM_entries = []  # 'YYYYMM' keys of entries that already exist
for item in current_data:
    year, month = extract_year_month(item['slug'])
    # Work on a copy so `current_data` keeps the exported values and earlier
    # cells give the same result when re-run.
    updated_item = dict(item)
    updated_item['file'] = '/grit/'+year+'/SHHA-GRIT-'+year+'_'+month+'.pdf'
    updated_data.append(updated_item)
    existing_YM_entries.append(year+month)

In [13]:
#with open('updated_data.json', 'w') as json_file:
#    json.dump(updated_data, json_file)

In [37]:
# Tabulate the updated entries, keeping only the five columns of interest
# under their display names (source key -> column label, in output order).
column_labels = {
    'id': 'ID',
    'title': 'Title',
    'slug': 'Slug',
    'grit_date': 'Enabled Date',
    'file': 'File',
}
df = pd.DataFrame(updated_data)[list(column_labels)].rename(columns=column_labels)

#df.to_csv('updated_data.csv')

In [39]:
# Write the table without pandas' automatic row counter, which would
# otherwise appear in the file as an extra unnamed first column.
df.to_csv('updated_data.csv', index=False)

## Add in new data

In [9]:
# Now, add data that doesn't yet exist

# Find the entries that are in PDF but not in JSON
known_YM = set(existing_YM_entries)  # 'YYYYMM' strings; set for fast lookups
found_YM = set()                     # (year, month) ints; a set drops duplicate PDFs
for _root, _dirs, files in os.walk('./GRIT_archive'):
    for f in files:
        if f.endswith('.pdf'):
            # 'SHHA-GRIT-1991_11.pdf' -> '1991_11' -> ('1991', '11')
            stem = f.split('-')[-1].split('.')[0]
            year, month = stem.split('_')
            if year + month not in known_YM:
                found_YM.add((int(year), int(month)))

# Newest first. Sorting happens after the int conversion so that ordering is
# numeric even if a file name is not zero-padded.
new_YM_entries = sorted(found_YM, reverse=True)

# Highest id already in use (0 when there are no existing entries).
starting_id = max((int(item['id']) for item in current_data), default=0)
data_new = []
# Count from 1: `starting_id` itself already belongs to an existing entry,
# so the first new entry must get `starting_id + 1`.
for i, (year, month) in enumerate(new_YM_entries, start=1):
    month_name_str = month_name[month]
    grit_date = f"{year}-{month:02d}-01 00:00:00"
    file_path = f"/grit/{year}/SHHA-GRIT-{year}_{month:02d}.pdf"

    entry = {
        'id': starting_id + i,
        'title': f"{month_name_str} {year} GRIT",
        'slug': f"{month_name_str.lower()}-{year}-grit",
        'is_enabled': 1,
        'published_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'expired_at': None,
        'content_group': None,
        'grit_date': grit_date,
        'file': file_path
    }

    data_new.append(entry)

In [40]:
# Tabulate the newly generated entries under their display column names
# (source key -> column label, in output order).
# Naming the source columns in the constructor means an empty `data_new`
# (no new PDFs found) gives an empty table with the right headers instead of
# a KeyError on the first column lookup.
new_column_labels = {
    'id': 'ID',
    'title': 'Title',
    'slug': 'Slug',
    'grit_date': 'Enabled Date',
    'file': 'File',
}
df = (
    pd.DataFrame(data_new, columns=list(new_column_labels))
    .rename(columns=new_column_labels)
)

# Write the table without pandas' automatic row counter, which would
# otherwise appear in the file as an extra unnamed first column.
df.to_csv('data_new.csv', index=False)

## Finally, fix the URL issue (strip the leading `/` from the file paths)

In [42]:
# Load the combined table whose `File` paths still need fixing
# (presumably existing + new entries merged outside this notebook - TODO confirm).
df = pd.read_csv('content_grit_combined_badURL.csv')

In [44]:
# Remove the leading '/' from each file path ('/grit/2024/...' -> 'grit/2024/...').
# Only a real leading slash is removed, so re-running the cell cannot eat the
# first letter of an already-fixed path. Non-string cells (e.g. NaN) pass
# through unchanged.
df['File'] = df['File'].apply(
    lambda x: x[1:] if isinstance(x, str) and x.startswith('/') else x
)
df

Unnamed: 0,ID,Title,Slug,Enabled,Publish Date,Date,File
0,94,August 2024 GRIT,august-2024-grit,1.0,2024-08-01 20:58:59,2024-08-01 20:58:59,grit/2024/SHHA-GRIT-2024_08.pdf
1,93,"July, 2024 GRIT",july-2024-grit,1.0,2024-07-01 17:57:00,2024-07-01 17:57:00,grit/2024/SHHA-GRIT-2024_07.pdf
2,92,June 2024 GRIT,june-2024-grit,1.0,2024-06-01 19:53:00,2024-06-01 19:53:00,grit/2024/SHHA-GRIT-2024_06.pdf
3,91,May 2024 GRIT,may-2024-grit,1.0,2024-05-01 02:51:56,2024-05-01 02:51:56,grit/2024/SHHA-GRIT-2024_05.pdf
4,90,April 2024 GRIT,april-2024-grit,1.0,2024-04-01 17:27:12,2024-04-01 17:27:12,grit/2024/SHHA-GRIT-2024_04.pdf
...,...,...,...,...,...,...,...
344,346,August 1980 GRIT,august-1980-grit,,1980-08-01 00:00:00,,grit/1980/SHHA-GRIT-1980_08.pdf
345,347,May 1980 GRIT,may-1980-grit,,1980-05-01 00:00:00,,grit/1980/SHHA-GRIT-1980_05.pdf
346,348,February 1980 GRIT,february-1980-grit,,1980-02-01 00:00:00,,grit/1980/SHHA-GRIT-1980_02.pdf
347,349,September 1979 GRIT,september-1979-grit,,1979-09-01 00:00:00,,grit/1979/SHHA-GRIT-1979_09.pdf


In [45]:
# Save the corrected table. `index=False` leaves out pandas' automatic row
# counter so the output has exactly the columns of the frame that was read.
df.to_csv('content_fixed_filepaths.csv', index=False)