## Set Environment Variables

In [1]:
import os
import sys
import re
from dotenv import load_dotenv

# Show which interpreter backs this kernel.
print(sys.executable)

# Load variables through python-dotenv, then read the settings this notebook needs.
load_dotenv()
GCS_BUCKET_NAME = os.getenv("GCS_BUCKET_NAME")
GOOGLE_APPLICATION_CREDENTIALS_PATH = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_PATH")
OPENAI_KEY = os.getenv("OPENAI_KEY")

# Stop early when any required setting is absent.
if any(
    value is None
    for value in (OPENAI_KEY, GOOGLE_APPLICATION_CREDENTIALS_PATH, GCS_BUCKET_NAME)
):
    print("Error: One or more environment variables are not set.")
    sys.exit(1)

/Users/shotomorisaki/Engineering/cse144-collegeassist/JupyterNotebook/.venv/bin/python


## Import Libraries

In [5]:
import json
import csv
from ragas.testset import TestsetGenerator
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.testset.graph import KnowledgeGraph, Node, NodeType
from ragas.testset.transforms import default_transforms, apply_transforms
from ragas.testset.synthesizers import default_query_distribution
from langchain.schema import Document

# Import the GoogleCloudStorage module
sys.path.append(os.path.abspath('../backend'))
from src.Web.GoogleCloudStorage import list_files, get_gcs_client

## Load Documents from GCS using GoogleCloudStorage.py

In [6]:
def read_file_from_gcs(bucket_name, filename):
    """Download one object from a GCS bucket and return its contents as text.

    Args:
        bucket_name: Name of the bucket to read from.
        filename: Object name (path) inside the bucket.

    Returns:
        The result of ``download_as_text()`` on the object.

    Raises:
        FileNotFoundError: If the object does not exist in the bucket.
    """
    gcs_object = get_gcs_client().bucket(bucket_name).blob(filename)

    if gcs_object.exists():
        return gcs_object.download_as_text()

    raise FileNotFoundError(f"The file '{filename}' does not exist in the bucket.")

def clean_text(text):
    """Normalise text by dropping punctuation/symbols and lower-casing it.

    Every character that is neither a word character (``\\w``, which includes
    digits and underscore) nor whitespace is removed; whitespace is kept as is.

    Args:
        text: The string to normalise.

    Returns:
        The stripped, lower-cased string.
    """
    return re.sub(r'[^\w\s]', '', text).lower()

In [7]:
files = list_files()
print(f"Total files in bucket: {len(files)}")

# Prepare CSV file
csv_filename = 'output.csv'
csv_fields = ['name', 'content', 'metadata']


def _extract_records(selected_file, data_content):
    """Return the (page_content, metadata) pairs contained in one downloaded file.

    Args:
        selected_file: Object name in the bucket; its extension selects the parser.
        data_content: Non-empty text content of the file.

    Returns:
        A list of ``(page_content, metadata)`` tuples, one per CSV row to write:
        - non-``.json`` files yield a single record holding the raw text and
          ``{'filename': selected_file}`` as metadata;
        - ``.json`` files yield one record per JSON object, whether the file
          holds a single object or a list of objects;
        - unparsable JSON or an unexpected top-level type yields an empty list
          after printing a diagnostic.
    """
    _, file_extension = os.path.splitext(selected_file)

    if file_extension != '.json':
        # Assume it's a text or HTML file
        return [(data_content, {'filename': selected_file})]

    try:
        data = json.loads(data_content)
    except json.JSONDecodeError as e:
        print(f"Failed to parse JSON from file '{selected_file}': {e}")
        return []

    # Treat a single JSON object as a one-element list so both shapes share the
    # same write path. Previously the single-object case was parsed but never
    # written to the CSV.
    if isinstance(data, dict):
        items = [data]
    elif isinstance(data, list):
        items = data
    else:
        print(f"Unexpected JSON structure in file '{selected_file}'")
        return []

    records = []
    for item in items:
        if not isinstance(item, dict):
            print(f"Unexpected item type in list in file '{selected_file}'")
            continue
        records.append((item.get('page_content', ''), item.get('metadata', {})))
    return records


with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_fields)
    writer.writeheader()

    # Loop over all files
    for idx, selected_file in enumerate(files):
        try:
            # Read the file from GCS
            data_content = read_file_from_gcs(GCS_BUCKET_NAME, selected_file)

            if not data_content.strip():
                print(f"Skipping empty file: {selected_file}")
            else:
                for page_content, metadata in _extract_records(selected_file, data_content):
                    writer.writerow({
                        'name': selected_file,
                        'content': page_content,
                        'metadata': json.dumps(metadata)
                    })
        except Exception as e:
            # Best effort: one bad file must not abort the whole export.
            print(f"Error processing file '{selected_file}': {e}")

        # Report progress for every file, including skipped or failed ones, so
        # the final "N/N" line is always printed.
        if (idx + 1) % 100 == 0 or (idx + 1) == len(files):
            print(f"Processed {idx + 1}/{len(files)} files")

print(f"Data written to {csv_filename}")

11:51:29 | [32mINFO    [0m | list_files      | Listed 39969 files from GCS bucket 'scraped_web_data'.[0m


Total files in bucket: 39969
Processed 100/39969 files
Skipping empty file: 11-03/my.ucsc.edu.txt
Skipping empty file: 11-03/www.instagram.com-ucscadmissions-.txt
Skipping empty file: 11-03/www.tiktok.com-@ucsantacruz?lang=en.txt
Processed 200/39969 files
Processed 300/39969 files
Processed 400/39969 files
Skipping empty file: 11-04/cdelsi.ucsc.edu.txt
Skipping empty file: 11-04/crs.ucsc.edu.txt
Skipping empty file: 11-04/cside.ucsc.edu.txt
Processed 500/39969 files
Processed 600/39969 files
Processed 700/39969 files
Processed 800/39969 files
Skipping empty file: 11-04/housing.ucsc.edu-colleges-ten-index.html.txt
Skipping empty file: 11-04/housing.ucsc.edu-prioritygroups-index.html.txt
Processed 900/39969 files
Skipping empty file: 11-04/ias.ucsc.edu.txt
Processed 1000/39969 files
Processed 1100/39969 files
Skipping empty file: 11-04/my.ucsc.edu-.txt
Skipping empty file: 11-04/my.ucsc.edu.txt
Processed 1200/39969 files
Processed 1300/39969 files


KeyboardInterrupt: 

## Initialize LLM and Embedding Models

In [None]:
# Pass the key explicitly: this notebook loads and validates OPENAI_KEY, whereas
# langchain_openai falls back to the OPENAI_API_KEY environment variable when no
# key is given, so the validated value would otherwise never be used.
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4", api_key=OPENAI_KEY))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(api_key=OPENAI_KEY))

In [None]:
# NOTE(review): `docs` was never defined anywhere in this notebook, so this cell
# raised NameError on a fresh kernel. Rebuilding it from the CSV exported above
# is the assumed intent — confirm.
# Scraped pages can exceed the csv module's default per-field size limit.
csv.field_size_limit(2**31 - 1)
with open(csv_filename, newline='', encoding='utf-8') as csvfile:
    docs = [
        Document(page_content=row['content'], metadata=json.loads(row['metadata']))
        for row in csv.DictReader(csvfile)
        if row['content'].strip()  # rows without text add nothing to a test set
    ]
print(f"Loaded {len(docs)} documents from {csv_filename}")

generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(docs, testset_size=50)
testset_df = dataset.to_pandas()


In [None]:
# Write the generated test set to a local CSV file, leaving out the row index.
testset_csv = 'collegeassist_evaluation_dataset.csv'
testset_df.to_csv(path_or_buf=testset_csv, index=False)

In [None]:
# NOTE(review): the import cell only brings in list_files and get_gcs_client, so
# upload_file was undefined here (NameError on a fresh kernel). It is assumed to
# live in the same GoogleCloudStorage module — confirm.
from src.Web.GoogleCloudStorage import upload_file

# Hand the local CSV to upload_file as a binary stream, reusing its filename.
with open(testset_csv, 'rb') as file_stream:
    upload_file(file_stream, testset_csv)

In [None]:
# Seed an empty knowledge graph with one DOCUMENT node per loaded document.
kg = KnowledgeGraph()
for doc in docs:
    node_properties = {
        "page_content": doc.page_content,
        "document_metadata": doc.metadata,
    }
    document_node = Node(type=NodeType.DOCUMENT, properties=node_properties)
    kg.nodes.append(document_node)

# Run ragas' default transforms over the graph using the same LLM and embeddings.
trans = default_transforms(llm=generator_llm, embedding_model=generator_embeddings)
apply_transforms(kg, trans)

In [None]:
# NOTE(review): the import cell only brings in list_files and get_gcs_client, so
# upload_file was undefined here (NameError on a fresh kernel). It is assumed to
# live in the same GoogleCloudStorage module — confirm.
from src.Web.GoogleCloudStorage import upload_file

# Save the knowledge graph to a local JSON file first.
kg_file = 'knowledge_graph.json'
kg.save(kg_file)

# Upload the Knowledge Graph to GCS
with open(kg_file, 'rb') as file_stream:
    upload_file(file_stream, kg_file)

## Generate the Final Testset with Query Distribution

In [None]:
# Ask ragas for its default query distribution for this LLM, then generate a
# 50-sample test set with it and convert the result to a DataFrame.
query_distribution = default_query_distribution(generator_llm)
testset = generator.generate(query_distribution=query_distribution, testset_size=50)
final_testset_df = testset.to_pandas()

In [None]:
# NOTE(review): the import cell only brings in list_files and get_gcs_client, so
# upload_file was undefined here (NameError on a fresh kernel). It is assumed to
# live in the same GoogleCloudStorage module — confirm.
from src.Web.GoogleCloudStorage import upload_file

# Save the final testset locally (row index omitted)
final_testset_csv = 'collegeassist_final_evaluation_dataset.csv'
final_testset_df.to_csv(final_testset_csv, index=False)

# Upload the final testset to GCS
with open(final_testset_csv, 'rb') as file_stream:
    upload_file(file_stream, final_testset_csv)

print("Evaluation dataset generated and saved successfully.")
