In [1]:
import pandas as pd
# from openai import OpenAI
from textwrap import dedent
from dotenv import load_dotenv
from google.cloud import translate
# load environment variables
load_dotenv()

True

### Complexities

In [None]:
df = pd.read_csv('DW-MasterSheetCopies.csv')
df.head()

In [None]:
df.columns = [x.replace(" ", "_") for x in df.columns]
df.columns

In [None]:
# turn Exhibit_Popularity into an int
df['Exhibit_Popularity'] = df['Exhibit_Popularity'].replace('-', 6)
df['Exhibit_Popularity'] = df['Exhibit_Popularity'].astype(int)
df.info()

In [None]:
# give stats on the data
df.describe()

In [None]:
# show spread of exhibit_popularity
df['Exhibit_Popularity'].value_counts()

In [None]:
# drop rows with popularity of 0
df = df[df['Exhibit_Popularity'] != 0]
# drop rows with popularity higher than 3
df = df[df['Exhibit_Popularity'] <= 3]
# drop rows with Exhibit_Copy = "IGNORE"
df = df[df['Exhibit_Copy'] != "IGNORE"]

In [None]:
df['Exhibit_Popularity'].value_counts()

In [None]:
# swap "\r\n" for "\n" in Exhibit_Copy
df['Exhibit_Copy'] = df['Exhibit_Copy'].str.replace("\r\n", "\n")
df.head()

In [None]:
# graph value counts of Exhibit_Popularity by Exhibit_Group
df.groupby('Exhibit_Group')['Exhibit_Popularity'].value_counts().unstack().plot(kind='bar', stacked=True)

1. Append "Language" col (pad with "English")
2. Preprocess all of the "Exhibit_Copy" info into ["Simplified", "Standard", "Technical"]
4. For each entry, translate "Exhibit_Copy" info into ["Spanish", "Hmong"] and change the "Language"
5. Save this as base_translation.csv
6. For each technical entry, create a Standard entry
7. For each standard entry, create a Simplified Entry
8. For each English entry without a Spanish or Hmong entry, create a 

In [None]:
df["Language"] = "English"

In [None]:
client = OpenAI(
    base_url = "http://dh-dgxh100-2.hpc.msoe.edu:8000/v1",
    api_key = "not_used"
)

In [None]:
def classify_exhibit_copy(group, title, body):
    completion = client.chat.completions.create(
        model="meta/llama-3.1-70b-instruct",
        messages=[
            { "role": "system", "content": "You are a helpful and informative AI assistant, tasked with classifying passages into categories. You are capable of understanding complex questions. You respond in one word answers, those being the categories given." },
            {
                "role": "user",
                "content": dedent(f"""Classify the following exhibit information into one of the categories: Simplified, Standard, Technical:
                Definitions:
                - Simplified: Easily accessible to children up though middle school
                - Standard: Easily accesssible to teenagers and adults with at least a middle school education
                - Technical: Technically written, accessible to those with any college degree

                Exhibit Info:
                - Exhibit Group: {group}
                - Exhibit Title: {title}
                - Exhibit Body: {body}
                """),
            },
        ],
        max_tokens=6,
        stream=False
    )
    response = completion.choices[0].message.content
    print(response)
    standard = "standard" in response.lower()
    simplified = "simplified" in response.lower()
    technical = "technical" in response.lower()
    
    if sum([standard, simplified, technical]) >= 2:
        return "Many"
    if standard:
        return "Standard"
    elif simplified:
        return "Simplified"
    elif technical:
        return "Technical"
    else:
        return "Unknown"

In [None]:
# classify the exhibit copy
df['Complexity'] = df.apply(lambda x: classify_exhibit_copy(
    x['Exhibit_Group'], x['Exhibit_Name'], x['Exhibit_Copy']), axis=1)

In [None]:
# print the distributions of the complexities
# graph value counts of complexities by Exhibit_Group
df.groupby('Exhibit_Group')['Complexity'].value_counts().unstack().plot(kind='bar', stacked=True)

In [None]:
# print all the {complexity} exhibit copies
complexity = "Standard"
for index, row in df[df['Complexity'] == complexity].iterrows():
    print(row['Exhibit_Name'])
    print()
    # print the exhibit copy
    print(row['Exhibit_Copy'])
    print("\n\n")

In [None]:
def understand_exhibit_classification(groups, titles, bodies):
    template = """Exhibit Info:
    - Exhibit Group: {group}
    - Exhibit Title: {title}
    - Exhibit Body: {body}
    """

    passages = ",\n".join([template.format(group=x, title=y, body=z) for x, y, z in zip(groups, titles, bodies)])

    completion = client.chat.completions.create(
        model="meta/llama-3.1-70b-instruct",
        messages=[
            { "role": "system", "content": "You are a helpful and informative AI assistant, tasked with explaining the classification of exhibit information into categories. You are capable of understanding complex questions. You respond thoroughly and with examples." },
            {
                "role": "user",
                "content": f"""Explain the common themes behind the exhibit passages associated with one of the categories: Simplified, Standard, Technical:
                
                {passages}
                """,
            },
        ],
        max_tokens=2000,
        stream=False
    )
    response = completion.choices[0].message.content 
    return response

In [None]:
# response = understand_exhibit_classification(df["Exhibit_Group"], df["Exhibit_Name"], df["Exhibit_Copy"])
exhibit_classification_explanation = """The common themes behind the exhibit passages associated with each of the categories: Simplified, Standard, Technical.

**Simplified (8 exhibits)**

The exhibits in this category appear to be designed for a general audience, with simple language and concepts that are easy to understand. They aim to introduce basic ideas and spark curiosity about various topics, such as trains, robots, dinosaurs, and sound. The themes behind these exhibits are:

* Explaining everyday phenomena in simple terms (e.g., how trains stay on tracks, how theremins work)
* Introducing basic concepts related to science and technology (e.g., Newton's laws, compressed air)
* Showcasing fascinating facts and trivia (e.g., about Les Paul, Morse Code)
* Creating an engaging and interactive experience (e.g., with hands-on activities like the Pneumatic Dinosaur)

**Standard (12 exhibits)**

Exhibits in this category present more detailed information about various topics, often including explanations of scientific principles and technologies. They cater to a slightly more informed audience, although they still maintain a relatively accessible tone. The themes behind these exhibits are:
* Delving into the history and evolution of technology (e.g., the development of railways, the invention of the telegraph)
* Explaining complex scientific concepts in a relatively straightforward way (e.g., how electricity works, the principles of aerodynamics)
* Highlighting the impact of technology on society (e.g., the role of automation in industry, the importance of energy efficiency)
* Showcasing innovative solutions and products (e.g., robots, excavators)

**Technical (10 exhibits)**

Exhibits in this category are designed for a more specialized audience, with a deeper understanding of scientific and technical concepts. They often include technical jargon, complex explanations, and specific examples from various fields. The themes behind these exhibits are:

* Providing detailed explanations of technical concepts and systems (e.g., hydraulic power, electromagnetism)
* Exploring the applications and implications of advanced technologies (e.g., automation, robotics)
* Highlighting the scientific principles underlying various phenomena (e.g., the behavior of fluids, the mechanics of helicopters)
* Showcasing cutting-edge research and innovations (e.g., in the fields of energy, transportation, and computing)"""

In [None]:
def complexity_prompt(prior_response, group, title, text, current_complexity, new_complexity):
    completion = client.chat.completions.create(
        model="meta/llama-3.1-70b-instruct",
        messages=[
            { "role": "system", "content": "You are a helpful and informative AI assistant, tasked with changing the complexity of exhibit information. You are capable of understanding complex questions. You respond with the exhibit copy with the new complexity." },
            {
                "role": "user",
                "content": """Explain the common themes behind the exhibit passages associated with one of the categories: Simplified, Standard, Technical"""
            },
            {
                "role": "assistant",
                "content": prior_response,
            },
            {
                "role": "user",
                "content": f"""Exhibit Info:
                - Group: {group}
                - Title: {title}
                - Current Complexity: {current_complexity}
                Change the complexity of the exhibit to {new_complexity}. Respond ONLY with the translated text:
                Text to Simplify:
                ```
                {text}
                ```
                """,
            },
        ],
        max_tokens=1000,
        stream=False
    )
    return completion

def change_complexity(prior_response, group, title, text, current_complexity, new_complexity):
    for x in range(3):
        response = complexity_prompt(prior_response, group, title, text, current_complexity, new_complexity).choices[0].message.content
        wrong_words = [f"{new_complexity}", "translated text", "complexity level"]
        # check if the response contains the key words, if so, try again
        if any([word in response.lower() for word in wrong_words]):
            if x == 2:
                response = "IGNORE: " + response
            
    # remote any "```" from the response
    response = response.replace("```", "")
    return response

In [None]:
def insert_markdown(text):
    completion = client.chat.completions.create(
        model="meta/llama-3.1-70b-instruct",
        messages=[
            { "role": "system", "content": "You are a helpful and informative AI assistant, tasked with recovering the formatting that text has lost. You are capable of understanding complex questions. You respond with an exact copy of the text in markdown." },
            {
                "role": "user",
                "content": f"""Recover the formatting of the following text:
                {text}
                """,
            },
        ],
        max_tokens=1000,
        stream=False
    )
    response = completion.choices[0].message.content
    return response

In [None]:
# test 5 Standard complexity exhibit copies translated to Simplified
for index, row in df[df['Complexity'] == "Standard"].head().iterrows(): 
    print(row['Exhibit_Name'])
    print()
    # print the exhibit copy
    print(row['Exhibit_Copy'])
    print("\n")
    response = change_complexity(
        prior_response = exhibit_classification_explanation,
        group = row['Exhibit_Group'],
        title = row['Exhibit_Name'],
        text = insert_markdown(row['Exhibit_Copy']),
        current_complexity = row['Complexity'],
        new_complexity = "Simplified")
    print("```")
    print(response)
    print("```")
    print("\n\n")

In [None]:
# create Standard copy for all Technical copies
new_rows = []
for index, row in df[df["Complexity"] == "Technical"].iterrows():
    new_row = row.copy()
    new_row["Complexity"] = "Standard"
    
    new_row["Exhibit_Copy"] = change_complexity(
        prior_response = exhibit_classification_explanation,
        group = row['Exhibit_Group'],
        title = row['Exhibit_Name'],
        text = insert_markdown(row['Exhibit_Copy']),
        current_complexity = row['Complexity'],
        new_complexity = "Simplified"
    )
    new_rows.append(new_row)

# create Simplified copy for all Standard copies
for index, row in df[df["Complexity"] == "Standard"].iterrows():
    new_row = row.copy()
    new_row["Complexity"] = "Simplified"
    
    new_row["Exhibit_Copy"] = change_complexity(
        prior_response = exhibit_classification_explanation,
        group = row['Exhibit_Group'],
        title = row['Exhibit_Name'],
        text = insert_markdown(row['Exhibit_Copy']),
        current_complexity = row['Complexity'],
        new_complexity = "Simplified"
    )
    new_rows.append(new_row)

In [None]:
formatted_df = df["Exhibit_Copy"].apply(lambda x: insert_markdown(x))

In [None]:
new_copies = pd.DataFrame(new_rows, columns=df.columns)
df = pd.concat([df, new_copies], axis=0)

In [None]:
# order by index
df = df.sort_index()

In [None]:
# show all the copies with tag "Simplified"
complexity = "Simplified"
for index, row in df[df['Complexity'] == complexity].iterrows():
    print(row['Exhibit_Name'])
    print()
    # print the exhibit copy
    print(row['Exhibit_Copy'])
    print("\n\n")

In [None]:
df.to_csv("DW-SheetCopies-Complexity.csv", index=True)

### Translation

In [2]:
def translate_text(lst, target_language):
    """
    Translates text into the target language using Google Cloud Translation API.

    Args:
        text (str): The text to translate.
        target_language (str): The BCP-47 language code of the target language (e.g., 'es' for Spanish).

    Returns:
        str: The translated text.
    """
    client = translate.TranslationServiceClient()
    project_id = 'discoveryworldhackathon'  # Replace with your actual project ID
    parent = f"projects/{project_id}/locations/global"

    try:
        response = client.translate_text(
            request={
                "parent": parent,
                "contents": lst,
                "mime_type": "text/plain",  # Use "text/html" for HTML content
                "target_language_code": target_language,
            }
        )

        # The API returns a list of translations; we're translating one text, so we get the first item.
        return response.translations

    except Exception as e:
        print(f"An error occurred during translation: {e}")
        return None


In [3]:
df = pd.read_csv("DW-SheetCopies-Complexity.csv")
columns = df.columns
df = df.rename(columns={df.columns[0]: "Id"})
df.head()

Unnamed: 0,Id,Exhibit_Group,Exhibit_Name,Exhibit_Popularity,Exhibit_Copy,Language,Complexity
0,0,All-Aboard,All-Aboard,1,# All Aboard!\nTrains and railways have been a...,English,Simplified
1,5,Automation Everywhere,Connect Four,3,**Meet Gary the Robot**\nGary is a special rob...,English,Simplified
2,5,Automation Everywhere,Connect Four,3,**Fanuc Robot-M-11a**\n\nGary is a sorting rob...,English,Standard
3,6,Automation Everywhere,Dream Machine,1,### Sensor\n\n#### 1. Tiny Light Sensors\nThes...,English,Standard
4,6,Automation Everywhere,Dream Machine,1,### Sensor\n\n#### 1. 42KD Miniature Photoelec...,English,Technical


In [4]:
# translate one of the exhibit copies to Spanish
# translate one of the exhibit copies to Spanish
spanish_lst = df['Exhibit_Copy'].tolist()
hmong_lst = df['Exhibit_Copy'].tolist()

# Ensure the list splits are correct
mid_index = len(df['Exhibit_Copy']) // 2

spanish_lst_1 = spanish_lst[:mid_index]
spanish_lst_2 = spanish_lst[mid_index:]
hmong_lst_1 = hmong_lst[:mid_index]
hmong_lst_2 = hmong_lst[mid_index:]

In [9]:
def translate_2pt_series(lst_1, lst_2, target_language):
    lst_1 = translate_text(lst_1, target_language)
    lst_2 = translate_text(lst_2, target_language)
    lst_1 = [x.translated_text for x in lst_1]
    lst_2 = [x.translated_text for x in lst_2]
    translated_text = lst_1 + lst_2
    return translated_text

In [10]:
# test translation
translated_text = translate_2pt_series(["Hello World"], ["World Hello"], "es")

In [11]:
spanish_lst = translate_2pt_series(spanish_lst_1, spanish_lst_2, "es")
spanish_df = df.drop(columns=['Exhibit_Copy']).copy()
spanish_df['Exhibit_Copy'] = spanish_lst
spanish_df['Language'] = "Spanish"

hmong_lst = translate_2pt_series(hmong_lst_1, hmong_lst_2, "hmn")
hmong_df = df.drop(columns=['Exhibit_Copy']).copy()
hmong_df['Exhibit_Copy'] = hmong_lst
hmong_df['Language'] = "Hmong"

In [12]:
df = pd.concat([df, spanish_df, hmong_df], axis=0)
df = df.sort_index()
# move the copy column to the end
exhibit_copy = df.pop('Exhibit_Copy')
df['Exhibit_Copy'] = exhibit_copy
df.to_csv("DW-SheetCopies-Complexity-Translated.csv", index=False)

## Save Data

In [24]:
import pandas as pd

In [25]:
df = pd.read_csv("DW-SheetCopies-Complexity-Translated.csv")
df.head(9)

Unnamed: 0,Id,Exhibit_Group,Exhibit_Name,Exhibit_Popularity,Language,Complexity,Exhibit_Copy
0,0,All-Aboard,All-Aboard,1,English,Simplified,# All Aboard!\r\nTrains and railways have been...
1,0,All-Aboard,All-Aboard,1,Spanish,Simplified,# ¡Todos a bordo!\r\nLos trenes y los ferrocar...
2,0,All-Aboard,All-Aboard,1,Hmong,Simplified,#Txhua yam hauv qab!\r\nCov tsheb ciav hlau th...
3,5,Automation Everywhere,Connect Four,3,Spanish,Simplified,"**Conoce a Gary, el robot**\r\nGary es un robo..."
4,5,Automation Everywhere,Connect Four,3,Hmong,Simplified,** Ntsib Gary tus neeg hlau **\r\nGary yog tus...
5,5,Automation Everywhere,Connect Four,3,English,Simplified,**Meet Gary the Robot**\r\nGary is a special r...
6,5,Automation Everywhere,Connect Four,3,English,Standard,**Fanuc Robot-M-11a**\r\n\r\nGary is a sorting...
7,5,Automation Everywhere,Connect Four,3,Spanish,Standard,**Robot Fanuc M-11a**\r\n\r\nGary es un robot ...
8,5,Automation Everywhere,Connect Four,3,Hmong,Standard,**Fanuc Neeg Hlau-M-11a**\r\n\r\nGary yog ib t...


In [26]:
# import pandas as pd
# import json

# # Map complexities to difficulty levels
# complexity_to_level = {
#     'Simplified': '1',
#     'Standard': '2',
#     'Technical': '3'
# }

# # Map languages to language codes
# language_to_code = {
#     'English': 'en',
#     'Spanish': 'es',
#     'Hmong': 'Hmong'  # Using 'Hmong' as the code based on your JSON schema
# }

# # Initialize the list to hold exhibits
# exhibits_list = []

# # Group the data by 'Id' and 'Exhibit_Name'
# grouped = df.groupby(['Id', 'Exhibit_Name'])

# for (exhibit_id, exhibit_name), group in grouped:
#     # Prepare the exhibit dictionary
#     exhibit_dict = {
#         "id": f"exhibit{exhibit_id}",
#         "image": "assets/images/flutter_logo.png",  # Update with actual image paths if available
#         "article": {
#             "id": f"exhibit{exhibit_id}",
#             "titles": {},
#             "descriptions": {}
#         },
#         "languageCode": "",       # We'll set this later
#         "difficultyLevel": ""     # We'll set this later
#     }
    
#     # Iterate over each row in the group to populate 'titles' and 'descriptions'
#     for _, row in group.iterrows():
#         language = row['Language']
#         complexity = row['Complexity']
#         exhibit_copy = row['Exhibit_Copy']
        
#         # Get language code and difficulty level
#         language_code = language_to_code.get(language, language)
#         difficulty_level = complexity_to_level.get(complexity, complexity)
        
#         # Create key for titles and descriptions
#         key = f"{language_code}{difficulty_level}"
        
#         # Use 'Exhibit_Name' as the title (assuming it's the display name)
#         exhibit_dict['article']['titles'][key] = exhibit_name
        
#         # 'Exhibit_Copy' is used as the description
#         exhibit_dict['article']['descriptions'][key] = exhibit_copy
        
#         # Optionally, set 'languageCode' and 'difficultyLevel' for the exhibit
#         # Here we set them to the last processed values (you can adjust this as needed)
#         exhibit_dict['languageCode'] = language_code
#         exhibit_dict['difficultyLevel'] = difficulty_level
        
#     # Append the exhibit to the list
#     exhibits_list.append(exhibit_dict)

# # Prepare the final JSON structure
# output_json = {
#     "exhibits": exhibits_list
# }

# # Convert the output dictionary to a JSON string with indentation
# json_output = json.dumps(output_json, indent=4, ensure_ascii=False)

# # Optionally, write the JSON output to a file
# with open('output.json', 'w', encoding='utf-8') as f:
#     f.write(json_output)

# # Print the JSON output
# print(json_output)


In [27]:
spanish_df = df[df['Language'] == "Spanish"]
english_df = df[df['Language'] == "English"]
hmong_df = df[df['Language'] == "Hmong"]

In [28]:
len(spanish_df), len(english_df), len(hmong_df)

(50, 50, 50)

In [32]:
# join english_df and spanish_df on the Id column
df = pd.merge(english_df, spanish_df, on='Id', suffixes=('_en', '_es'))
print(df.head(1))

   Id Exhibit_Group_en Exhibit_Name_en  Exhibit_Popularity_en Language_en  \
0   0       All-Aboard      All-Aboard                      1     English   

  Complexity_en                                    Exhibit_Copy_en  \
0    Simplified  # All Aboard!\r\nTrains and railways have been...   

  Exhibit_Group_es Exhibit_Name_es  Exhibit_Popularity_es Language_es  \
0       All-Aboard      All-Aboard                      1     Spanish   

  Complexity_es                                    Exhibit_Copy_es  
0    Simplified  # ¡Todos a bordo!\r\nLos trenes y los ferrocar...  


In [33]:
# Initialize an empty string to hold the HTML content
html_content = ''

for index, row in df.iterrows():
    # Extract the necessary information
    title_en = row['Exhibit_Name_en']
    complexity = row['Complexity_en']
    copy_en = row['Exhibit_Copy_en']
    copy_es = row['Exhibit_Copy_es']
    
    # Convert Markdown to HTML (optional)
    import markdown
    copy_en_html = markdown.markdown(copy_en)
    copy_es_html = markdown.markdown(copy_es)
    
    # Generate the HTML for this exhibit
    exhibit_html = f'''
    <h1>{title_en}</h1>
    <h2>Complexity: {complexity}</h2>
    <hr>
    <table border="1" style="width:100%; border-collapse: collapse;">
        <tr>
            <th style="width:50%;">English Column</th>
            <th style="width:50%;">Spanish Column</th>
        </tr>
        <tr>
            <td style="vertical-align: top;">{copy_en_html}</td>
            <td style="vertical-align: top;">{copy_es_html}</td>
        </tr>
    </table>
    <hr>
    '''
    # Append to the main HTML content
    html_content += exhibit_html

In [34]:
# Wrap the content in HTML tags
full_html = f'''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Exhibits</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            margin: 20px;
        }}
        h1 {{
            color: #2e6c80;
        }}
        table {{
            width: 100%;
            border-collapse: collapse;
        }}
        th, td {{
            border: 1px solid #dddddd;
            padding: 10px;
        }}
        th {{
            background-color: #f2f2f2;
        }}
    </style>
</head>
<body>
    {html_content}
</body>
</html>
'''

# Write the HTML content to a file
with open('exhibits.html', 'w', encoding='utf-8') as file:
    file.write(full_html)

print("HTML file has been created successfully!")

HTML file has been created successfully!
