In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

model_df = pd.read_csv("../csv/vdi_entailment_scores_with_cosine_similarity_chunk_size_256_all-MiniLM-L6-v2.csv")
model_df = model_df.sort_values(by=['company', 'year'])

manual_df = pd.read_csv("../csv/vdi_scores_manual.csv")
manual_df = manual_df.sort_values(by=['company', 'year'])

In [2]:
for i in range(1, 70):
    cols = [f'entailment_q{i}_{j}' for j in range(1, 11)]
    model_df[f'entailment_q{i}'] = model_df[cols].max(axis=1)

In [3]:
new_df = pd.merge(
    manual_df,
    model_df,
    on=['company', 'year'],
    how='inner'
)

cols = ['company', 'year', 'vdi_score_unscaled']
for i in range(1, 70):
    cols.append(f'entailment_q{i}')
new_df = new_df.loc[:, cols]
# new_df.to_csv("../csv/vdi_entailment_dataset.csv", index=False)

In [4]:
cols = [f'entailment_q{i}' for i in range(1, 70)]
X = new_df.loc[:, cols].to_numpy()
y = new_df.loc[:, "vdi_score_unscaled"].to_numpy()

X_train = np.delete(X, np.s_[16:18], axis=0)
X_test = X[16:18, :]
y_train = np.delete(y, np.s_[16:18])
y_test = y[16:18]

In [5]:
y_train = scaler.fit_transform(y_train.reshape(-1, 1))
y_test = scaler.transform(y_test.reshape(-1, 1))

In [6]:
X_train_with_intercept = np.hstack((np.ones((X_train.shape[0], 1)), X_train))

coefficients = np.linalg.inv(X_train_with_intercept.T @ X_train_with_intercept) @ X_train_with_intercept.T @ y_train

intercept = coefficients[0]
slope = coefficients[1:]

print("Intercept:", intercept)
print("Slope:", slope)

Intercept: [-22.39011149]
Slope: [[ 11.29778298]
 [ 10.79574443]
 [  3.41834433]
 [  5.58864425]
 [ -4.32193415]
 [ -5.25428983]
 [  2.06618439]
 [-36.92678564]
 [  7.78523744]
 [-23.64556798]
 [ 24.77772559]
 [ 43.09183816]
 [  1.61337315]
 [ 48.08648804]
 [-16.5703355 ]
 [ -4.17835274]
 [ -6.75141628]
 [ -2.72856776]
 [ 25.38296066]
 [ 10.66715322]
 [  4.19878394]
 [ -0.52750714]
 [-22.04948909]
 [-10.49935283]
 [-16.29216952]
 [  9.81223109]
 [ 11.94383533]
 [  4.03953047]
 [-15.98929379]
 [-16.15342618]
 [-12.40320197]
 [  5.39786527]
 [ -0.19913607]
 [-15.50785262]
 [  2.13092002]
 [  8.04998763]
 [  7.80881989]
 [ -8.22423158]
 [  0.52079842]
 [  3.25512744]
 [ -6.75074681]
 [  5.47219581]
 [ 12.45359986]
 [ -3.94303   ]
 [ -4.37817188]
 [ -7.06599351]
 [  5.8605584 ]
 [  4.30842253]
 [  9.18943641]
 [  0.53374792]
 [ -2.57050316]
 [ 17.64208775]
 [-21.60571873]
 [-13.24782285]
 [  5.14181334]
 [  2.5482462 ]
 [ -7.8856416 ]
 [ -3.24230754]
 [  5.52496278]
 [-11.85255872]
 [ -0.3

In [7]:
coefficients = coefficients.reshape(1, -1)

In [8]:
X_test_with_intercept = np.hstack((np.ones((X_test.shape[0], 1)), X_test))

y_pred = coefficients @ X_test_with_intercept.T
y_pred

array([[-15.5907475 ,  -1.25204021]])

In [9]:
coefficients @ X_train_with_intercept.T

array([[2.26290017, 2.47993452, 1.79200774, 2.32277943, 1.71873781,
        2.02586916, 2.6509348 , 2.28900182, 2.01958218, 2.29470147,
        2.44465893, 2.98099651, 3.38142518, 2.84951221, 1.88991216,
        7.01559193, 2.27119261, 1.82700155, 2.63758488, 3.31518755,
        3.10333084, 2.71028992, 1.09900237, 1.63668432, 1.65316935,
        2.20769576, 3.02651892, 3.50017425, 3.60657353, 3.59721634,
        3.61473675, 1.56387952, 1.44337516, 1.26159609, 1.46793588,
        2.39858232]])

In [10]:
y_test - y_pred.reshape(-1, 1)

array([[14.95684553],
       [ 1.09374936]])

In [11]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Train SVR model
svr = SVR(kernel='rbf', C=1.0, epsilon=0.1)
svr.fit(X_train, y_train)

# Make predictions
y_pred = svr.predict(X_test)

# Calculate metrics
y_test = y_test.squeeze()
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R² Score: {r2:.4f}")

Mean Squared Error: 0.2765
R² Score: -3.8894


  y = column_or_1d(y, warn=True)


In [12]:
scaler.inverse_transform(np.expand_dims(y_pred, axis=1))

array([[43.97036792],
       [43.82129391]])

In [13]:
scaler.inverse_transform(np.expand_dims(y_test, axis=1))

array([[38.    ],
       [41.9934]])

In [14]:
THRESHOLD = 0.8
vdi_scores = (model_df.loc[:, cols] > THRESHOLD).sum(axis=1)
model_df['VDI_score'] = vdi_scores
print(model_df[['company', 'year', 'VDI_score']])

          company  year  VDI_score
0   Bharti Airtel  2015         55
1   Bharti Airtel  2016         62
2   Bharti Airtel  2017         46
3   Bharti Airtel  2018         52
4      Coal India  2014         55
5      Coal India  2015         53
6      Coal India  2016         53
7      Coal India  2017         49
8      Coal India  2018         49
9             HUL  2014         46
10            HUL  2015         52
11            HUL  2016         51
12            HUL  2017         53
13            HUL  2018         51
14            ITC  2014         54
15            ITC  2015         56
16            ITC  2016         50
17            ITC  2017         52
18            ITC  2018         52
19        Infosys  2014         50
20        Infosys  2015         55
21        Infosys  2016         50
22        Infosys  2017         50
23        Infosys  2018         46
24           ONGC  2014         51
25           ONGC  2015         51
26           ONGC  2017         49
27           ONGC  2