In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Standardiser instance; later cells reference it for (optional) target scaling.
scaler = StandardScaler()

# Entailment-score table (per the file name), ordered by company and then year.
model_df = (
    pd.read_csv("../csv/vdi_entailment_scores_with_cosine_similarity_chunk_size_256_all-MiniLM-L6-v2.csv")
    .sort_values(by=['company', 'year'])
)

# Manual VDI-score table (per the file name), put into the same ordering.
manual_df = (
    pd.read_csv("../csv/vdi_scores_manual.csv")
    .sort_values(by=['company', 'year'])
)

In [None]:
# For each of the 69 questions, collapse its ten numbered columns
# (entailment_q<i>_1 .. entailment_q<i>_10) into one column holding the
# row-wise maximum.
for question in range(1, 70):
    target_col = f'entailment_q{question}'
    part_cols = [f'{target_col}_{part}' for part in range(1, 11)]
    model_df[target_col] = model_df[part_cols].max(axis=1)

In [None]:
# Columns kept for modelling: identifiers, the manual target, and the 69
# per-question entailment columns.
cols = ['company', 'year', 'vdi_score_unscaled', *(f'entailment_q{i}' for i in range(1, 70))]

# Inner join keeps only (company, year) pairs present in both tables; the
# result is re-sorted so row order is deterministic before column selection.
new_df = (
    manual_df
    .merge(model_df, on=['company', 'year'], how='inner')
    .sort_values(by=['company', 'year'])
    .loc[:, cols]
)

In [None]:
# Feature matrix: the 69 per-question entailment columns; target: the manual score.
cols = [f'entailment_q{i}' for i in range(1, 70)]
X = new_df[cols].to_numpy()
y = new_df["vdi_score_unscaled"].to_numpy()

# Rows at positions 15 and 16 form the test set; every other row is training data.
held_out = np.s_[15:17]
X_test, y_test = X[held_out, :], y[held_out]
X_train = np.delete(X, held_out, axis=0)
y_train = np.delete(y, held_out)

In [None]:
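# Optional target standardisation (currently disabled). While these two lines
# stay commented out, `scaler` is not fitted by this cell and y_train / y_test
# keep their original, unscaled values.
# y_train = scaler.fit_transform(y_train.reshape(-1, 1))
# y_test = scaler.transform(y_test.reshape(-1, 1))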

In [None]:
# Ordinary least squares with an explicit intercept: prepend a column of ones
# so the first fitted coefficient is the intercept term.
X_train_with_intercept = np.hstack((np.ones((X_train.shape[0], 1)), X_train))

# Solve the least-squares problem directly instead of forming and inverting
# X^T X. The explicit inverse is numerically fragile and breaks down when the
# design matrix is rank-deficient (e.g. fewer training rows than its 70
# columns, or collinear features). lstsq returns the same solution for a
# well-conditioned problem and the minimum-norm solution otherwise.
coefficients = np.linalg.lstsq(X_train_with_intercept, y_train, rcond=None)[0]

intercept = coefficients[0]
slope = coefficients[1:]

print("Intercept:", intercept)
print("Slope:", slope)

In [None]:
# Lay the fitted coefficients out as a single row (shape (1, n_coefficients)).
coefficients = np.reshape(coefficients, (1, -1))

In [None]:
# Give the test rows the same leading column of ones used for the intercept.
X_test_with_intercept = np.column_stack((np.ones(X_test.shape[0]), X_test))

# Linear prediction: coefficients times the transposed test design matrix.
y_pred = np.matmul(coefficients, X_test_with_intercept.T)
y_pred

In [None]:
# Per-sample residuals (actual minus predicted). Both sides are flattened
# first: subtracting a column vector from a 1-D y_test would broadcast to an
# (n, n) matrix of all pairwise differences rather than n residuals.
np.ravel(y_test) - np.ravel(y_pred)

In [None]:
# scaler.inverse_transform(y_pred)

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Fit an RBF-kernel support vector regressor on the training rows
# (fit returns the estimator itself, so `svr` is the fitted model).
svr = SVR(kernel='rbf', C=1.0, epsilon=0.1).fit(X_train, y_train)

# Predict the held-out rows.
y_pred = svr.predict(X_test)

# Drop any singleton dimension from the targets before scoring.
y_test = np.squeeze(y_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R² Score: {r2:.4f}")

In [None]:
# Show the predictions as a column in original target units.
# `scaler` only has fitted statistics if target scaling was actually run;
# inverse_transform on an unfitted StandardScaler raises NotFittedError, so
# when it is unfitted the predictions are shown unchanged.
(
    scaler.inverse_transform(np.expand_dims(y_pred, axis=1))
    if hasattr(scaler, "scale_")
    else np.expand_dims(y_pred, axis=1)
)

In [None]:
# Show the held-out targets as a column in original target units.
# `scaler` only has fitted statistics if target scaling was actually run;
# inverse_transform on an unfitted StandardScaler raises NotFittedError, so
# when it is unfitted the targets are shown unchanged.
(
    scaler.inverse_transform(np.expand_dims(y_test, axis=1))
    if hasattr(scaler, "scale_")
    else np.expand_dims(y_test, axis=1)
)

In [None]:
# Per row, count how many of the 69 entailment_q<i> columns exceed THRESHOLD
# and store that count as `VDI_score`.
THRESHOLD = 0.9

# Build the column list here rather than reading the notebook-global `cols`:
# that name is reassigned with different contents in several earlier cells,
# so this cell would silently depend on which of them ran last.
question_cols = [f'entailment_q{i}' for i in range(1, 70)]

vdi_scores = (model_df.loc[:, question_cols] > THRESHOLD).sum(axis=1)
model_df['VDI_score'] = vdi_scores
print(model_df[['company', 'year', 'VDI_score']])