# Imports

Libraries

In [260]:
# Basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# PyTorch libraries
import torch

#Neural network libraries
import torch.nn as nn
import torch.nn.functional as F

# #Creating optimizer
import torch.optim as optim

# For terminating program (useful for stopping before training is complete and seeing entire result)
import signal
import sys, os

CSV File

In [261]:
# Locate the input CSV next to the notebook: resolving the current-directory
# marker yields the absolute working directory.
script_dir = os.path.abspath(os.curdir)
file_path = os.path.join(script_dir, "AI EarthHack Dataset.csv")

# header=None keeps the CSV's own header line as data row 0, so columns are
# addressed by integer position and real entries start at row 1.
dataset = pd.read_csv(filepath_or_buffer=file_path, header=None)

# Preview column 1 (the problem statements).
print(dataset[1])

0                                                 problem
1       The construction industry is indubitably one o...
2       I'm sure you, like me, are feeling the heat - ...
3       The massive shift in student learning towards ...
4       The fashion industry is one of the top contrib...
                              ...                        
1296    The linear 'take, make, dispose' model of prod...
1297    The conundrum we face is the improper disposal...
1298               This solution will help the vegetation
1299    Accumulation and improper disposal of single-u...
1300    The excessive and wasteful resource consumptio...
Name: 1, Length: 1301, dtype: object


# Zero-Shot Classification

In [262]:
from transformers import pipeline

# TODO: look into changing the model; bart-large-mnli seems to be the most popular for ZSC
# Other options are T5, GPT, and RoBERTa

# Models tried:
# - facebook/bart-large-mnli
# - sjrhuschlee/flan-t5-base-mnli
# - google/flan-t5-base

# Build the zero-shot classifier once; the scoring loop below reuses it for
# every row and every metric.
# NOTE(review): the pipeline's default hypothesis template is believed to wrap
# each candidate label as "This example is {}." -- confirm that the
# full-sentence labels used in this notebook read as intended with it.
classifier = pipeline(
    task="zero-shot-classification",
    model="sjrhuschlee/flan-t5-base-mnli",
)

# Example class descriptions and input text
# input_text = [["Problem: "+str(dataset[1][i]), "Solution: "+str(dataset[2][i])] for i in range(1,3)]
# Bad solutions: 39, 52, 22, 457, 279
# Good solutions: 3, 117

#region Descriptions

# Candidate labels for each metric, passed verbatim to the zero-shot classifier.
# Every list has the layout [positive label, negative label]; the scoring loop
# records the score the model assigns to the entry at index 0.
# The triple-quoted strings deliberately keep their embedded newline and
# indentation, since that exact text is what the model receives.

# Relevance: detailed and on-topic vs. sloppy, vague or off-topic submissions.
relevance_description=[
    """The problem and solution are detailed, specific, and in-depth.
    Bonus points if they are related to circular economy.""",

    """The problem and/or solution are sloppy, off-topic (i.e., not sustainability related), unsuitable, or vague (such as the over-generic content that prioritizes form over substance, offering generalities instead of specific details).
    In addition, inputs such as, 'Problem: I will research to make solution Solution: Hi' belong in this category."""
]

# Feasibility: can the solution realistically be carried out?
feasibility_description=[
    "The solution is feasible.",

    """The solution is not feasible.
    This can mean that the solution is too wide in scope to be easily implemented, or puts unreasonable expectations on people, businesses, or the world as a whole.
    Any solution that would take a large amount of resources or manpower to execute successfully would also be considered not feasible."""
]

# Innovation: does the solution present something new?
innovation_description=[
    """The solution is innovative.
    A solution is innovative if it imagines something completely new and novel.""",

    """The solution is not innovative.
    A solution is considered not innovative if it does not present any originality."""
]

# Scalability: can the solution start small and grow?
scalability_description = [
    """The solution is scalable.
    A scalable solution will often provide a clear path on how the solution can be scaled in the future, if required.
    A scalable solution is one that can start small, and grows to have a larger area of effect over time.""",

    """The solution is not scalable.
    These solutions are ones that only focus on the small scale, and could not easily be done on a large scale."""
]

# Candidate labels for the "Circular Economic" metric, laid out as
# [positive label, negative label] like the other metric lists.
# Fix: the positive label previously also began with "The solution is not
# pertinent ...", so both candidates stated the negative hypothesis and the
# recorded score did not measure pertinence. Index 0 now states the positive
# hypothesis; the background paragraphs that follow it are unchanged.
# NOTE(review): those background paragraphs are part of the label text sent to
# the model -- confirm that such a long label is intended.
circularEconomic_description = [
    """The solution is pertinent to a circular economy.
    In today’s rapidly evolving world, climate change stands as a formidable problem, deeply influencing our daily lives and the health of our planet. The circular economy, with its focus on reusing and recycling resources to minimize waste, emerges as a crucial strategy in this battle. Innovations like car-sharing platforms significantly reduce the carbon footprint of transportation, while modular designs in various products promote waste reduction by allowing individual components to be upgraded rather than discarding the entire item.
    In the face of climate change's criticality, the urgency to identify and implement high-impact circular economy solutions has never been greater. The challenge we confront today, however, extends beyond coming up with solutions to confront this problem. It lies in the daunting task of effectively evaluating a vast and diverse array of solutions, discerning the most impactful ones amidst a sea of possibilities. This process can be overwhelming, given the complexity and the sheer volume of potential solutions, leading to cognitive overload for human evaluators.""",

    "The solution is not pertinent to a circular economy."
]

# Metric name -> [positive label, negative label].
# Insertion order is the order in which the scoring loop evaluates the metrics;
# "Relevance" comes first because a row that fails it is dropped before any
# other metric is scored.
descriptions = dict(
    (
        ("Relevance", relevance_description),
        ("Feasibility", feasibility_description),
        ("Innovation", innovation_description),
        ("Scalability", scalability_description),
        ("Circular Economic", circularEconomic_description),
    )
)

#endregion

# Accumulators for the rows that survive the relevance filter. All lists stay
# index-aligned: entry k of `problems`, `solutions` and every `scores[...]`
# list belongs to the k-th kept row.
problems = []
solutions = []
# One score list per metric, derived from `descriptions` so the two can never
# drift apart when a metric is added or renamed.
scores = {category: [] for category in descriptions}

# Determined through testing
RELEVANCE_CUTOFF = 0.75

# Inclusive range of dataset rows to analyse (row 0 is the CSV header line).
start_row = 1
end_row = 10

# Number of rows dropped for scoring below RELEVANCE_CUTOFF.
rows_removed = 0


def _clean_cell(value):
    """Return a CSV cell as stripped text, collapsing runs of four double quotes into one."""
    return str(value).replace('""""', '"').strip()


def _label_score(result, label):
    """Return the score the classifier result assigns to `label`.

    The result's "labels" and "scores" lists are parallel, so the score is
    looked up by the label's position instead of assuming only two labels.
    """
    return result["scores"][result["labels"].index(label)]


# Never index past the last row of the dataset: an oversized end_row would
# otherwise raise KeyError only after all the preceding inference work.
last_row = min(end_row, len(dataset) - 1)

# Go from line start_row to last_row
for i in range(start_row, last_row + 1):

    # Get the next problem and solution, slightly cleaned up
    problem_text = _clean_cell(dataset[1][i])
    solution_text = _clean_cell(dataset[2][i])
    sequence = "Problem: " + problem_text + " Solution: " + solution_text

    # Scores are buffered per row and committed only once the row is accepted,
    # so a rejected row (or an interrupt mid-row) never leaves the shared
    # lists misaligned.
    row_scores = {}

    # Loop for each metric the AI should measure
    for category, labels in descriptions.items():

        # Let the model weigh the positive label against the negative one and
        # keep the score of the positive label (index 0).
        result = classifier(sequence, labels)
        score = _label_score(result, labels[0])

        # If the relevance score is less than the cutoff, don't include the row in the new CSV
        if category == "Relevance" and score < RELEVANCE_CUTOFF:
            rows_removed += 1
            break

        row_scores[category] = score
    else:
        # Reached only when no metric rejected the row.
        problems.append(problem_text)
        solutions.append(solution_text)
        for category, score in row_scores.items():
            scores[category].append(score)

print(len(problems))


9


# Write the kept rows and their metric scores to a new CSV in the working directory.
new_csv = 'Analyzed_dataset.csv'
script_dir = os.path.abspath('')
file_path = os.path.join(script_dir, new_csv)

# Column order: Id, Problem, Solution, then one column per metric in `scores`.
obj = {
    'Id': range(1, len(problems) + 1),
    'Problem': problems,
    'Solution': solutions,
    **scores,
}
df = pd.DataFrame(obj)
df.to_csv(file_path, index=False)