In [1]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from transformers import BertTokenizer, BertModel
import polars as pl
import re
from sklearn.metrics import roc_curve, roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt

In [2]:
df = pl.read_csv("./Data/MITSolve2023.csv")

In [3]:
df = df[list(rename_map.keys())]
df = df.rename(rename_map)
df = df.fill_nan(" ")

In [4]:
df = df.with_columns(
    pl.concat_str(
        [
            pl.col("DevStage1"),
            pl.col("DevStage2"),
        ],
        separator=" ",
    ).alias("DevStage"))

df = df.with_columns(
    pl.concat_str(
        [
            pl.col("FinancialSustainability1"),
            pl.col("FinancialSustainability2"),
        ],
        separator=" ",
    ).alias("FinancialSustainability"))

df = df.with_columns(
    pl.concat_str(
        [
            pl.col("CoreTech"),
            pl.col("TechValidation"),
        ],
        separator=" ",
    ).alias("Tech"))

df = df.drop(["DevStage1", "DevStage2", "FinancialSustainability1", "FinancialSustainability2", "CoreTech", "TechValidation"])

In [6]:
df.head()

ID,Challenge,Status,Problem,Solution,Target,TeamPosition,Dimension,TheoryOfChange,HQ,CountryHQ,NumServed,WhySolve,SupportArea,Innovative,ImpactGoals,MeasureImpact,DEI,BusinessModel,DevStage,FinancialSustainability,Tech
i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
70315,"""Climate Adapta…","""Semi-Finalist""","""<p>Safe and se…","""<p>Echale is a…","""<p>Echale hous…","""<p>Through fra…","""Support inform…","""<p>Echale Inte…","""Washington D.C…","""United States""","""<p>In Mexico, …","""<p>We are very…","""Financial (e.g…","""<p>The most si…","""<p><strong>Dir…","""<p>We measure …","""<p>Echale Inte…","""<p>Through its…","""Growth: An org…","""<p>Echale Inte…","""<p>Echale’s su…"
70331,"""Climate Adapta…","""Published""","""<p><strong>Cli…","""<figure><img s…","""<p><strong>Who…","""<p><strong>Int…","""Support inform…","""<p>Solarshelte…","""Caloocan City …","""Philippines""","""<p>None so far…","""<p>We are appl…","""Business Model…","""<p>Solarshelte…","""<ol><li><stron…","""<p>To measure …","""<p>At Hypershe…","""<p><strong>Tar…","""Prototype: A v…","""<p><strong>1. …","""<p><strong>Cor…"
70469,"""Climate Adapta…","""Published""","""<p>The specifi…","""<p>Our project…","""<p>Our project…","""<p>Tabit Smart…","""Adapt land and…","""<p><strong>The…","""Aydın/ Turkey""","""Turkiye""","""<p>We have rea…","""<p>We are appl…","""Business Model…","""<p>Our solutio…","""<p>Our impact …","""<p>To measure …","""<p> As a TAbit…","""<p>With an app…","""Prototype: A v…","""<p>Our plan fo…","""<p>The core te…"
70482,"""Climate Adapta…","""Published""","""<p>The specifi…","""<p>To create e…","""<p>Our solutio…","""<p>Our team at…","""Reduce emissio…","""<p>Our solutio…","""Mpigi Town/ Na…","""Uganda""","""<p>Currently, …","""<p>We are appl…","""Business Model…","""<p>Upcycle Afr…","""<p>Our impact …","""<p>As an organ…","""<p>At Upcycle …","""<p>Upcycle Afr…","""Growth: An org…","""<p>Upcycle Afr…","""<p>Our solutio…"
70510,"""Climate Adapta…","""Published""","""<p>Heating and…","""<p>At least an…","""<p>The solutio…","""<p>I have long…","""Reduce emissio…","""<p>The target …","""Hertfordshire""","""United Kingdom…","""<p>Starlite ha…","""<p>My slogan i…","""Financial (e.g…","""<p>Once the pl…","""<p>We need to …","""<p>Progress ha…","""<p>I am an int…","""<p>We would sl…","""Prototype: A v…","""<p>We would ha…","""<p>A barrier t…"


In [7]:
df = df.drop(["HQ", "CountryHQ", "SupportArea"])
for col in df.columns[3:]:
    df[col] = df[col].apply(encoder)

  df[col] = df[col].apply(encoder)


In [None]:
df.write_parquet("./Data/encoded.parquet")

In [None]:
df["PS"] = df.apply(lambda x: x["Problem"] + x["Solution"], axis = 1)
df["PST"] = df.apply(lambda x: x["Problem"] + x["Solution"] + x["Target"], axis = 1)

In [None]:
X = np.array([i for i in df["Problem"] + df["Team"] + df["DevStage"] + df["Target"] ])
y = np.array([0 if i == "Published" else 1 for i in df["Status"]])

rus = RandomUnderSampler(random_state=42)
X, y = rus.fit_resample(X, y)

In [None]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict_proba(X_test)
y_pred = [i[1] for i in y_pred]

In [None]:
# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

# Plot the ROC curve
plt.plot(fpr, tpr, label=f'AUC: {auc:.4f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='black')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Published or Selected (Problem + Target + DevStage + TeamPosition)')
plt.legend()
plt.show()

In [None]:
# Predicting finalist from semifinalist
df_2 = df[df["Status"] != "Published"]
X = np.array([i for i in  df_2["Problem"] + df_2["Target"] ])
y = np.array([0 if i == "Semi-Finalist" else 1 for i in df_2["Status"]])

rus = RandomUnderSampler(random_state=42)
X, y = rus.fit_resample(X, y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict_proba(X_test)
y_pred = [i[1] for i in y_pred]

# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

# Plot the ROC curve
plt.plot(fpr, tpr, label=f'AUC: {auc:.4f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='black')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Semifinalist and Finalist (Problem + Target)')
plt.legend()
plt.show()