Testing SVM vs all 4 metrics

In [19]:
import numpy as np
import pandas as pd
import seaborn as sns
from comet_ml import Experiment
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, ElasticNet

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR

In [8]:
df = pd.read_csv('../50_tweets_companies.zip')
print(df.dtypes)

Unnamed: 0                  int64
username                   object
text                       object
Ticker                     object
Name                       object
Industry                   object
Total ESG Risk Score        int64
Environment Risk Score    float64
Social Risk Score         float64
Governance Risk Score     float64
user_id                     int64
text_tokenized             object
dtype: object


In [9]:
X = df['text_tokenized']

pipe = Pipeline([
    ('vect', TfidfVectorizer(max_features=30_000, stop_words='english', strip_accents='ascii')),
    # ('scale', StandardScaler(with_mean=False)),
    ('model', LinearSVR())
])

search_grid = {
    'model': [LinearSVR(), LinearRegression(), ElasticNet()]
}

grid = GridSearchCV(pipe, search_grid, n_jobs=-1, verbose=2,
                    scoring=['explained_variance',
                             'max_error',
                             'neg_mean_absolute_error',
                             'neg_mean_squared_error',
                             'neg_root_mean_squared_error',
                             'neg_mean_squared_log_error',
                             'neg_median_absolute_error',
                             'r2',
                             'neg_mean_absolute_percentage_error'
                             ], refit='neg_root_mean_squared_error')

y_to_test = ['Total ESG Risk Score',
                  'Environment Risk Score',
                  'Social Risk Score',
                  'Governance Risk Score']

results = {}
for y_key in y_to_test:
    y = df[y_key]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    ss = StandardScaler()
    # y_train = ss.fit_transform(y_train.to_numpy().reshape(-1, 1))
    # y_test = ss.transform(y_test.to_numpy().reshape(-1, 1))

    grid.fit(X_train, y_train)
    results_cv = pd.DataFrame(grid.cv_results_)
    results[y_key] = results_cv
    mean_test_cols = results_cv.columns[results_cv.columns.map(lambda s: s.startswith('mean_test_'))]
    print(y_key)
    print(results_cv[['param_model', *mean_test_cols]].T.to_markdown())


Fitting 5 folds for each of 3 candidates, totalling 15 fits
Total ESG Risk Score
|                                              | 0                    | 1                    | 2                       |
|:---------------------------------------------|:---------------------|:---------------------|:------------------------|
| param_model                                  | LinearSVR()          | LinearRegression()   | ElasticNet()            |
| mean_test_explained_variance                 | 0.7512395445435482   | 0.9232680755301697   | -4.4408920985006264e-17 |
| mean_test_max_error                          | -26.34464229727024   | -13.396528120978655  | -39.02175759520602      |
| mean_test_neg_mean_absolute_error            | -2.1383810928993423  | -1.476952818114634   | -6.025473408101753      |
| mean_test_neg_mean_squared_error             | -14.522703329521537  | -4.40387227401729    | -57.46577452613603      |
| mean_test_neg_root_mean_squared_error        | -3.8079725308846846  | 



Environment Risk Score
|                                              | 0           | 1                  | 2                      |
|:---------------------------------------------|:------------|:-------------------|:-----------------------|
| param_model                                  | LinearSVR() | LinearRegression() | ElasticNet()           |
| mean_test_explained_variance                 | nan         | nan                | 0.0                    |
| mean_test_max_error                          | nan         | nan                | -70.3779980082547      |
| mean_test_neg_mean_absolute_error            | nan         | nan                | -5.009917314592742     |
| mean_test_neg_mean_squared_error             | nan         | nan                | -62.6484452087445      |
| mean_test_neg_root_mean_squared_error        | nan         | nan                | -7.89264042825648      |
| mean_test_neg_mean_squared_log_error         | nan         | nan                | -1.1870823455892896  



Social Risk Score
|                                              | 0                    | 1                  | 2                       |
|:---------------------------------------------|:---------------------|:-------------------|:------------------------|
| param_model                                  | LinearSVR()          | LinearRegression() | ElasticNet()            |
| mean_test_explained_variance                 | 0.7399227455836923   | nan                | -2.2204460492503132e-17 |
| mean_test_max_error                          | -29.002479719690207  | nan                | -41.75474108018693      |
| mean_test_neg_mean_absolute_error            | -1.058961916517766   | nan                | -3.1613412155475626     |
| mean_test_neg_mean_squared_error             | -6.955946896090015   | nan                | -26.140799648201085     |
| mean_test_neg_root_mean_squared_error        | -2.6162947071344527  | nan                | -5.099686190571331      |
| mean_test_neg_mean_squared_l

In [20]:
cv_res = pd.DataFrame(grid.cv_results_)
# print(r.head(1))
for i, row in cv_res.iterrows():
    exp = Experiment(workspace="henrystoll",
                     project_name="nlp-token-sklearn-regressors",
                     api_key="HeH9EtfDC2KUlCOjeQaU1CuOM",)
    for k, v in row.items():
        if k.startswith('mean_') or k.startswith('std_'):
            exp.log_metric(k, v)
        elif k.startswith('param_'):
            exp.log_parameter(k, v)
    exp.end()


ImportError: You must import Comet before these modules: sklearn

In [18]:
# print(grid.best_params_)
# print(grid.best_score_)
#
# reg = grid.best_estimator_.named_steps['model']
# vect = grid.best_estimator_.named_steps['vect']
# zipped = zip(vect.get_feature_names(), reg.feature_importances_)
# feature_importance = pd.DataFrame(zipped, columns=["feature", "value"])
#
# feature_importance = feature_importance.sort_values('value', ascending=False)
#
# sns.barplot(data=feature_importance[:30], y='feature', x='value')
# # results_cv.sort_values('rank_test_score')
# # print(results_cv.T.to_markdown())
# mean_test_cols = results_cv.columns[results_cv.columns.map(lambda s: s.startswith('mean_test_'))]
# print(results_cv[['param_vect', *mean_test_cols]].T.to_markdown())


|Total ESG Risk Score                          | 0                     | 1                     | 2                       |
|:---------------------------------------------|:----------------------|:----------------------|:------------------------|
| param_model                                  | LinearSVR()           | SGDRegressor()        | RandomForestRegressor() |
| mean_test_explained_variance                 | 0.751242580686977     | 0.8154669165991736    | 0.7613766782894824      |
| mean_test_max_error                          | -26.346254265353814   | -19.47149588045149    | -21.598000000000003     |
| mean_test_neg_mean_absolute_error            | -2.138348371848208    | -2.2630201555505347   | -2.5417966824286937     |
| mean_test_neg_mean_squared_error             | -14.523104357803003   | -10.60663541306786    | -13.712961378783309     |
| mean_test_neg_root_mean_squared_error        | -3.8080251247747725   | -3.2555025722622046   | -3.7023434605600416     |
| mean_test_neg_mean_squared_log_error         | -0.018982889292448225 | -0.018974737715976252 | -0.02415040750669572    |
| mean_test_neg_median_absolute_error          | -1.0174032694478012   | -1.564787548340088    | -1.7280000000000002     |
| mean_test_r2                                 | 0.7474499133539391    | 0.8149026086158415    | 0.761093041358932       |
| mean_test_neg_mean_absolute_percentage_error | -0.09171305967740542  | -0.10307275119347678  | -0.11766092295975623    |



|Total ESG Risk Score                             | 0                    | 1                    | 2                       |
|:---------------------------------------------|:---------------------|:---------------------|:------------------------|
| param_model                                  | LinearSVR()          | LinearRegression()   | ElasticNet()            |
| mean_test_explained_variance                 | 0.7512395445435482   | 0.9232680755301697   | -4.4408920985006264e-17 |
| mean_test_max_error                          | -26.34464229727024   | -13.396528120978655  | -39.02175759520602      |
| mean_test_neg_mean_absolute_error            | -2.1383810928993423  | -1.476952818114634   | -6.025473408101753      |
| mean_test_neg_mean_squared_error             | -14.522703329521537  | -4.40387227401729    | -57.46577452613603      |
| mean_test_neg_root_mean_squared_error        | -3.8079725308846846  | -2.097962545933362   | -7.578787308191527      |
| mean_test_neg_mean_squared_log_error         | -0.01898381375549841 | -0.00852186685556139 | -0.0906338825809017     |
| mean_test_neg_median_absolute_error          | -1.0172074335594896  | -1.0631812874606066  | -5.031923231750864      |
| mean_test_r2                                 | 0.7474568990832857   | 0.9232262712182123   | -0.0007438103962948528  |
| mean_test_neg_mean_absolute_percentage_error | -0.0917192033023878  | -0.06817086305555907 | -0.281651381138203      |



| Environment Risk Score  | 0           | 1                  | 2                      |
|:---------------------------------------------|:------------|:-------------------|:-----------------------|
| param_model                                  | LinearSVR() | LinearRegression() | ElasticNet()           |
| mean_test_explained_variance                 | nan         | nan                | 0.0                    |
| mean_test_max_error                          | nan         | nan                | -70.3779980082547      |
| mean_test_neg_mean_absolute_error            | nan         | nan                | -5.009917314592742     |
| mean_test_neg_mean_squared_error             | nan         | nan                | -62.6484452087445      |
| mean_test_neg_root_mean_squared_error        | nan         | nan                | -7.89264042825648      |
| mean_test_neg_mean_squared_log_error         | nan         | nan                | -1.1870823455892896    |
| mean_test_neg_median_absolute_error          | nan         | nan                | -4.622001991745295     |
| mean_test_r2                                 | nan         | nan                | -0.0009080128439347579 |
| mean_test_neg_mean_absolute_percentage_error | nan         | nan                | -4545293924510922.0    |


|Social Risk Score| 0                    | 1                  | 2                       |
|:---------------------------------------------|:---------------------|:-------------------|:------------------------|
| param_model                                  | LinearSVR()          | LinearRegression() | ElasticNet()            |
| mean_test_explained_variance                 | 0.7399227455836923   | nan                | -2.2204460492503132e-17 |
| mean_test_max_error                          | -29.002479719690207  | nan                | -41.75474108018693      |
| mean_test_neg_mean_absolute_error            | -1.058961916517766   | nan                | -3.1613412155475626     |
| mean_test_neg_mean_squared_error             | -6.955946896090015   | nan                | -26.140799648201085     |
| mean_test_neg_root_mean_squared_error        | -2.6162947071344527  | nan                | -5.099686190571331      |
| mean_test_neg_mean_squared_log_error         | -0.02857175283863956 | nan                | -0.1353589776532861     |
| mean_test_neg_median_absolute_error          | -0.46224502218678953 | nan                | -2.245258919813071      |
| mean_test_r2                                 | 0.7378833583414723   | nan                | -0.0007769639667758899  |
| mean_test_neg_mean_absolute_percentage_error | -0.11723673420857865 | nan                | -0.3632620036474954     |


|Governance Risk Score| 0                     | 1                     | 2                       |
|:---------------------------------------------|:----------------------|:----------------------|:------------------------|
| param_model                                  | LinearSVR()           | LinearRegression()    | ElasticNet()            |
| mean_test_explained_variance                 | 0.588476901144899     | 0.9552505238742874    | -4.4408920985006264e-17 |
| mean_test_max_error                          | -47.12521666251071    | -12.885384177866964   | -59.042982295824494     |
| mean_test_neg_mean_absolute_error            | -0.7977794794891      | -0.774383149145812    | -2.60789601094455       |
| mean_test_neg_mean_squared_error             | -13.469313906153996   | -1.3651483742524575   | -31.72179123935563      |
| mean_test_neg_root_mean_squared_error        | -3.623444821949145    | -1.1666727661532619   | -5.59829206621284       |
| mean_test_neg_mean_squared_log_error         | -0.015071546338370222 | -0.017231085206728444 | -0.1252436648787946     |
| mean_test_neg_median_absolute_error          | -0.3170677197231667   | -0.5545123470663698   | -1.9570177041755117     |
| mean_test_r2                                 | 0.5843989372009901    | 0.9552230455552027    | -0.0012306629602100382  |
| mean_test_neg_mean_absolute_percentage_error | -0.07370062737561704  | -0.10971789677582569  | -0.3376518485587495     |