In [None]:
import pandas as pd

# Read the raw argument dataset from its CSV file into a DataFrame
file_path = 'arg_quality_rank_30k.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

## Calculate each element's argument and non-argument probabilities & add to dataset

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Fetch the pretrained argument classifier together with its matching tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="addy88/argument-classifier"
)
arg_model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="addy88/argument-classifier"
)

import torch

# Function to classify an argument and return probabilities
def classify_argument(text):
    """Return the classifier's softmax probabilities for one piece of text.

    Args:
        text: The string to score.

    Returns:
        A list of floats, one per output class of ``arg_model``.
        NOTE(review): the caller labels index 0 "non-argument" and index 1
        "argument" -- confirm against the model's ``id2label`` mapping.
    """
    # truncation=True clips inputs that exceed the tokenizer's maximum length,
    # so an unusually long text is not passed to the model unclipped.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = arg_model(**inputs)

    # Normalise the logits over the class dimension
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # The batch holds a single text, so return just its row
    return probs.tolist()[0]

# Score every argument; wrapping each probability list in a Series makes
# apply() return one column per class
class_probabilities = df['argument'].apply(
    lambda argument_text: pd.Series(classify_argument(argument_text))
)
df[['non_argument_prob', 'argument_prob']] = class_probabilities


## Calculate similarity between topic and argument & add to dataset

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load the pretrained sentence-embedding model used for similarity scoring
sim_model = SentenceTransformer(model_name_or_path='annakotarba/sentence-similarity')


# Function to calculate similarity between argument and topic
def calculate_similarity(row):
    """Return the cosine similarity between a row's topic and its argument.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) providing the
            'topic' and 'argument' entries.

    Returns:
        The cosine similarity of the two sentence embeddings as a float.
    """
    # Each local holds the column it is named after
    topic = row['topic']
    argument = row['argument']

    # Encode the sentences
    topic_embedding = sim_model.encode(topic, convert_to_tensor=True)
    argument_embedding = sim_model.encode(argument, convert_to_tensor=True)

    # Cosine similarity is symmetric, so the order of the two embeddings
    # does not affect the score
    cosine_score = util.pytorch_cos_sim(topic_embedding, argument_embedding)

    # Unwrap the 1x1 result into a plain Python number
    return cosine_score.item()


# Score every row; axis='columns' hands the function one row at a time
df['similarity_score'] = df.apply(calculate_similarity, axis='columns')


## Save new dataset and models

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer

# Fetch the argument classifier and its tokenizer, then write both into the
# same local folder
arg_model_name = "addy88/argument-classifier"
arg_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=arg_model_name
)
arg_model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=arg_model_name
)
arg_tokenizer.save_pretrained(save_directory="./local_models/argument-classifier")
arg_model.save_pretrained(save_directory="./local_models/argument-classifier")

# Fetch the sentence-similarity model and write it to its own local folder
sim_model_name = "annakotarba/sentence-similarity"
sim_model = SentenceTransformer(model_name_or_path=sim_model_name)
sim_model.save(path="./local_models/sentence-similarity")

In [None]:
# Persist the augmented DataFrame as CSV, leaving the index out of the file
output_file_path = 'augmented-arg-data.csv'
df.to_csv(path_or_buf=output_file_path, index=False)

## Proceeding with new dataset

In [None]:
import pandas as pd

# Read the augmented dataset back in from disk
file_name = 'augmented-arg-data.csv'
df = pd.read_csv(filepath_or_buffer=file_name)

## Split dataset into train, val, and test

In [None]:
import numpy as np

# Split the dataset into training, validation, and testing sets using the
# values of its 'set' column
train_df = df[df['set'] == 'train']
val_df = df[df['set'] == 'dev']
test_df = df[df['set'] == 'test']

# Define the feature columns and target column
feature_columns = ['argument_prob', 'similarity_score']
target_column = 'WA'

# Split each set into X and Y
X_train = train_df[feature_columns]
Y_train = train_df[target_column]

X_val = val_df[feature_columns]
Y_val = val_df[target_column]

X_test = test_df[feature_columns]
Y_test = test_df[target_column]

# Print only the first few rows of X and Y for each set to verify the split;
# .head() keeps the output short instead of dumping the whole frames
print("X_train:\n", X_train.head())
print("Y_train:\n", Y_train.head())
print("X_val:\n", X_val.head())
print("Y_val:\n", Y_val.head())
print("X_test:\n", X_test.head())
print("Y_test:\n", Y_test.head())


## Predict the Quality Metric using Gradient Boosting Regressor

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Hyperparameter candidates explored by the grid search
param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.05, 0.1, 0.2],
    max_depth=[3, 5, 7],
)

# Base regressor; the fixed seed keeps runs repeatable
gb_model = GradientBoostingRegressor(random_state=42)

# 3-fold cross-validated search over every combination in the grid
grid_search = GridSearchCV(
    estimator=gb_model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, Y_train)

# Keep the winning configuration and the estimator built with it
best_gb_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Report how the selected model does on the validation split
Y_val_pred = best_gb_model.predict(X_val)
r2 = r2_score(Y_val, Y_val_pred)
mse = mean_squared_error(Y_val, Y_val_pred)
print(f"Best Validation Mean Squared Error: {mse:.4f}")
print(f"Best Validation R^2 Score: {r2:.4f}")


## Final Testing

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Extract test features and target. The target must be the same column the
# model was trained on (target_column), otherwise the metrics below would
# compare predictions against a different quantity.
X_test = test_df[feature_columns]
Y_test = test_df[target_column]

# Make predictions on the test set
Y_test_pred = best_gb_model.predict(X_test)

# Evaluate on test set
test_mse = mean_squared_error(Y_test, Y_test_pred)
test_r2 = r2_score(Y_test, Y_test_pred)

print(f"Test Mean Squared Error: {test_mse:.4f}")
print(f"Test R^2 Score: {test_r2:.4f}")


## Test model on individual inputs

In [None]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from sentence_transformers import SentenceTransformer, util

# Load the argument classifier, its tokenizer, and the sentence-similarity model
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="addy88/argument-classifier"
)
arg_model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="addy88/argument-classifier"
)
sim_model = SentenceTransformer(model_name_or_path='annakotarba/sentence-similarity')


# Function to calculate similarity between argument and topic
def calculate_similarity(argument, topic):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        argument: The argument text.
        topic: The topic text.

    Returns:
        The cosine similarity of the two sentence embeddings as a float.
    """
    # Embed the topic first, then the argument, each as a tensor
    embeddings = [
        sim_model.encode(sentence, convert_to_tensor=True)
        for sentence in (topic, argument)
    ]

    # Compare the two embeddings and unwrap the single score
    return util.pytorch_cos_sim(*embeddings).item()


# Function to classify an argument and return probabilities
def classify_argument(text):
    """Return the classifier's softmax probability at class index 1.

    Args:
        text: The string to score.

    Returns:
        A float: the probability ``arg_model`` assigns to class index 1.
        NOTE(review): index 1 is used as the "argument" class by
        predict_argument_score -- confirm against the model's ``id2label``.
    """
    # truncation=True clips inputs that exceed the tokenizer's maximum length,
    # so an unusually long text is not passed to the model unclipped.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = arg_model(**inputs)

    # Normalise the logits over the class dimension
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Single text in the batch: take its row, then the entry at index 1
    return probs.tolist()[0][1]


# Define a function to predict the score of an argument
def predict_argument_score(argument, topic):
    """Predict the score of an argument made about a topic.

    Args:
        argument: The argument text.
        topic: The topic text the argument addresses.

    Returns:
        The single prediction of ``best_gb_model`` for this pair.
    """
    argument_prob = classify_argument(argument)
    similarity_score = calculate_similarity(argument, topic)

    # Build a one-row frame with the column names (and order) the regressor
    # was fitted on; a bare nested list makes scikit-learn warn that the
    # input lacks the feature names seen during fit.
    features = pd.DataFrame(
        [[argument_prob, similarity_score]],
        columns=['argument_prob', 'similarity_score'],
    )

    # The regressor outputs the target it was trained on, not an accuracy
    predicted_score = best_gb_model.predict(features)

    # predict() returns one value per input row; there is exactly one row
    return predicted_score[0]

In [None]:
# Example usage: score one argument against the topic it addresses
topic = "We should legalize cannabis"
argument = "cannabis increases drug trafficking and violence in the streets"

accuracy_score = predict_argument_score(argument=argument, topic=topic)
print(f"The predicted accuracy score for the argument is: {accuracy_score:.4f}")

## Download Model

In [None]:
import joblib

# Serialise the selected regressor to disk with joblib
joblib.dump(value=best_gb_model, filename='kelly-model.pkl')