In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

model_df = pd.read_csv("../csv/vdi_entailment_scores_with_cosine_similarity_chunk_size_256_all-MiniLM-L6-v2.csv")
model_df = model_df.sort_values(by=['company', 'year'])

manual_df = pd.read_csv("../csv/vdi_scores_manual.csv")
manual_df = manual_df.sort_values(by=['company', 'year'])

In [2]:
for i in range(1, 70):
    cols = [f'entailment_q{i}_{j}' for j in range(1, 11)]
    model_df[f'entailment_q{i}'] = model_df[cols].max(axis=1)

In [5]:
cols = [f'entailment_q{i}' for i in range(1, 70)]
X = model_df.loc[:, cols].to_numpy()
# X = model_df.iloc[:, 2:2+69*10].to_numpy()
y = manual_df.iloc[:, -1].to_numpy()

X_train = np.delete(X, np.s_[28:33], axis=0)
X_test = X[28:33, :]
y_train = np.delete(y, np.s_[28:33])
y_test = y[28:33]

In [6]:
y_train = scaler.fit_transform(y_train.reshape(-1, 1))
y_test = scaler.transform(y_test.reshape(-1, 1))

In [7]:
X_train_with_intercept = np.hstack((np.ones((X_train.shape[0], 1)), X_train))

coefficients = np.linalg.inv(X_train_with_intercept.T @ X_train_with_intercept) @ X_train_with_intercept.T @ y_train

intercept = coefficients[0]
slope = coefficients[1:]

print("Intercept:", intercept)
print("Slope:", slope)

Intercept: [-51.51799985]
Slope: [[-10.3482547 ]
 [ 29.00402649]
 [  5.10419911]
 [ -0.35573886]
 [ -0.93938472]
 [  4.84194835]
 [ -8.30909886]
 [  5.33147344]
 [ -1.06591202]
 [ -8.21747503]
 [  0.22506856]
 [ -2.09482872]
 [  6.96244267]
 [ -1.18677951]
 [ -3.33185306]
 [  7.64735662]
 [-17.57854169]
 [  7.81446538]
 [  1.41557298]
 [  6.74325298]
 [ -0.51114898]
 [ -0.89519905]
 [ 12.84601898]
 [ -4.71321025]
 [ -1.69092449]
 [  2.43708852]
 [ -4.02533274]
 [ -0.67219678]
 [ 21.6017137 ]
 [ 13.86217807]
 [ -3.59746373]
 [ -4.46091769]
 [-11.19081498]
 [ -4.93370779]
 [  8.96708346]
 [  6.25900447]
 [ -0.83195138]
 [ -9.07631319]
 [  1.40181941]
 [ 17.69962877]
 [-14.49377384]
 [  5.82422283]
 [ -1.81974562]
 [  0.68817642]
 [ -8.70011274]
 [ -4.56981698]
 [  7.05157745]
 [  1.99248222]
 [  6.82781547]
 [ 18.92513191]
 [ -5.65836276]
 [-13.11630781]
 [-31.29697494]
 [ 13.75235988]
 [  6.3365767 ]
 [  1.10354155]
 [  5.37240046]
 [  4.60295725]
 [  9.65084833]
 [  1.06136889]
 [  8.8

In [8]:
coefficients = coefficients.reshape(1, -1)

In [9]:
X_test_with_intercept = np.hstack((np.ones((X_test.shape[0], 1)), X_test))

y_pred = coefficients @ X_test_with_intercept.T
y_pred

array([[ 6.35131834,  2.44535735, -2.24605114, -7.49383811, -0.61038944]])

In [10]:
coefficients @ X_train_with_intercept.T

array([[-2.52850228, -2.80620437, -2.16196667, -1.99874984, -2.27585219,
        -2.55727498, -2.42064006, -2.36040959, -2.60925683, -2.12471576,
        -1.73649715, -1.12340646, -1.17239557, -1.03280533, -1.91417444,
         3.09484262, -2.8089964 , -2.3836536 , -2.19571684, -2.70021154,
        -2.1097591 , -1.69256215, -1.95322477, -2.09546938, -3.31707987,
        -2.62547966, -2.7016445 , -2.32842336, -3.37424049, -3.58740306,
        -3.06511115, -3.5444082 , -2.83888363, -1.77303222, -1.3555702 ,
        -1.14689591, -0.24047502, -0.79269288, -3.46389168, -3.00787081,
        -2.37773558, -2.20429106, -2.64262191]])

In [11]:
y_test - y_pred.reshape(-1, 1)

array([[-5.38691816],
       [-1.35351948],
       [ 3.4653267 ],
       [ 8.71311367],
       [ 1.95710268]])

In [12]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Train SVR model
svr = SVR(kernel='rbf', C=1.0, epsilon=0.1)
svr.fit(X_train, y_train)

# Make predictions
y_pred = svr.predict(X_test)

# Calculate metrics
y_test = y_test.squeeze()
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R² Score: {r2:.4f}")

Mean Squared Error: 2.6763
R² Score: -157.4555


  y = column_or_1d(y, warn=True)


In [13]:
scaler.inverse_transform(np.expand_dims(y_pred, axis=1))

array([[41.22687939],
       [38.47059369],
       [38.3066222 ],
       [38.94218063],
       [37.7684718 ]])

In [14]:
scaler.inverse_transform(np.expand_dims(y_test, axis=1))

array([[50.],
       [51.],
       [52.],
       [52.],
       [53.]])

In [24]:
THRESHOLD = 0.8
vdi_scores = (model_df.loc[:, cols] > THRESHOLD).sum(axis=1)
model_df['VDI_score'] = vdi_scores
print(model_df[['company', 'year', 'VDI_score']])

          company  year  VDI_score
7   Bharti Airtel  2015         55
26  Bharti Airtel  2016         62
38  Bharti Airtel  2017         46
40  Bharti Airtel  2018         52
46     Coal India  2014         55
1      Coal India  2015         53
34     Coal India  2016         53
22     Coal India  2017         49
39     Coal India  2018         49
36            HUL  2014         46
33            HUL  2015         52
41            HUL  2016         51
28            HUL  2017         53
8             HUL  2018         51
3             ITC  2014         54
12            ITC  2015         56
43            ITC  2016         50
37            ITC  2017         52
47            ITC  2018         52
5         Infosys  2014         50
4         Infosys  2015         55
13        Infosys  2016         50
24        Infosys  2017         50
15        Infosys  2018         46
11           ONGC  2014         51
14           ONGC  2015         51
9            ONGC  2017         49
2            ONGC  2