We import make_regression from sklearn.datasets because it will help us create the dataset for todayâ€™s regression problem (recall that up to now, we have no dataset ðŸ™‚ )
From sklearn.multioutput we import MultiOutputRegressor â€“ itâ€™s the wrapper we discussed in the previous section.
As we will convert an SVR model into a multioutput regressor, we must import SVR from sklearn.svm.
After generating the dataset with make_regression, we must split it into train/test sets. We can do so using sklearn.model_selectionâ€˜s train_test_split.
Finally, we import mean_squared_error and mean_absolute_error from sklearn.metrics for evaluating our model. Those are default error functions for regression problems.

In [77]:
import sqlite3
from sklearn import metrics
from scipy.sparse import hstack
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm 
from sklearn.metrics import accuracy_score
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns
from sklearn.datasets import make_regression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [78]:
text=pd.read_csv("tokenized.csv")

In [79]:
len(text)

94718

In [80]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(text[["title_final",'text_final',"side"]],text[['gender','religion','ethnicity','ideology']],test_size=0.3)

#encoding the X value
Encoder = LabelEncoder()
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(text["title_final"])
Train_X_Tfidf_title = Tfidf_vect.transform(Train_X["title_final"])
Test_X_Tfidf_title = Tfidf_vect.transform(Test_X["title_final"])
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(text["text_final"])
Train_X_Tfidf_text = Tfidf_vect.transform(Train_X["text_final"])
Test_X_Tfidf_text = Tfidf_vect.transform(Test_X["text_final"])
Train_X_1_pos = Encoder.fit_transform(Train_X['side'])
Test_X_1_pos = Encoder.fit_transform(Test_X['side'])
mat_train= hstack([Train_X_Tfidf_title,Train_X_Tfidf_text])
X_train_tfidf = hstack([mat_train, Train_X_1_pos.reshape(-1, 1)])
mat_test= hstack([Test_X_Tfidf_title,Test_X_Tfidf_text])
X_test_tfidf = hstack([mat_test, Test_X_1_pos.reshape(-1, 1)])

In [81]:
#encoding Y

#gender
Train_Y_gender = Encoder.fit_transform(Train_Y['gender'])
Test_Y_gender = Encoder.fit_transform(Test_Y['gender'])

#religion
Train_Y_rel = Encoder.fit_transform(Train_Y['religion'])
Test_Y_rel = Encoder.fit_transform(Test_Y['religion'])

#ethnicity
Train_Y_ethn = Encoder.fit_transform(Train_Y['ethnicity'])
Test_Y_ethn = Encoder.fit_transform(Test_Y['ethnicity'])

#ideology
Train_Y_id = Encoder.fit_transform(Train_Y['ideology'])
Test_Y_id = Encoder.fit_transform(Test_Y['ideology'])

#stacking into a vector
y_train=np.stack((Train_Y_gender,Train_Y_rel,Train_Y_ethn,Train_Y_id),axis=1)
y_test=np.stack((Test_Y_gender,Test_Y_rel,Test_Y_ethn,Test_Y_id),axis=1)

We can then move forward and construct the SVR regressor:

Here, we set the value for Ïµ (epsilon) to 0.2. It specifies the width of the â€˜error tubeâ€™ where no penalty is assigned to mispredictions, effectively allowing us to take values close to the edges of the error tube as support vectors.
If we want to apply regularization, we can also apply values for C â€“ more information here.

In [82]:
# Create the SVR regressor
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
# Create the Multioutput Regressor
mor = MultiOutputRegressor(SVM)

Finally, we can fit the training data (X_train) and y_train) to our MultiOutputRegressor. This starts the training process. Once fitting the data is complete, we can generate y_pred prediction values for our testing inputs X_test. Using the mean squared error and mean absolute error, we can then evaluate model performance:

In [83]:
# Train the regressor
mor = mor.fit(X_train_tfidf, y_train)

# Generate predictions for testing data
y_pred = mor.predict(X_test_tfidf)



In [84]:
# Evaluate the regressor
mse_one = mean_squared_error(y_test[:,0], y_pred[:,0])
mse_two = mean_squared_error(y_test[:,1], y_pred[:,1])
mse_three = mean_squared_error(y_test[:,2], y_pred[:,2])
mse_four = mean_squared_error(y_test[:,3], y_pred[:,3])
print(f'MSE for gender: {mse_one} - religion: {mse_two} - ethnicity: {mse_three} - ideology: {mse_four}')


MSE for gender: 0.44721283783783783 - religion: 1.7058699324324325 - ethnicity: 7.722233952702703 - ideology: 1.5932573198198199


In [85]:
mae_one = mean_absolute_error(y_test[:,0], y_pred[:,0])
mae_two = mean_absolute_error(y_test[:,1], y_pred[:,1])
mae_three = mean_absolute_error(y_test[:,2], y_pred[:,2])
mae_four = mean_absolute_error(y_test[:,3], y_pred[:,3])
print(f'MAE for gender: {mae_one} - religion: {mae_two} - ethnicity: {mae_three} - ideology: {mae_four}')


MSE for gender: 0.22832207207207209 - religion: 0.8036317567567568 - ethnicity: 1.1828195382882882 - ideology: 0.7105152027027027


In [86]:
print("SVM Metrics for Gender")
print(metrics.classification_report(y_pred[:,0],y_test[:,0],target_names=["Female","LGBTQIAPK+","Male"]))

SVM Metrics for Gender
              precision    recall  f1-score   support

      Female       0.05      0.75      0.10       236
  LGBTQIAPK+       0.23      0.91      0.37        86
        Male       1.00      0.88      0.94     28094

    accuracy                           0.88     28416
   macro avg       0.43      0.84      0.47     28416
weighted avg       0.99      0.88      0.93     28416



In [87]:
print("SVM Metrics for Religion")
print(metrics.classification_report(y_pred[:,1],y_test[:,1],target_names=["Agnostic","Atheist","Christian","Muslim","Other"]))

SVM Metrics for Religion
              precision    recall  f1-score   support

    Agnostic       0.31      0.51      0.39      2675
     Atheist       0.49      0.51      0.50      7314
   Christian       0.77      0.56      0.64     15675
      Muslim       0.27      0.71      0.39       180
       Other       0.28      0.52      0.37      2572

    accuracy                           0.54     28416
   macro avg       0.43      0.56      0.46     28416
weighted avg       0.61      0.54      0.56     28416



In [88]:
print("SVM Metrics for Ethnicity")
print(metrics.classification_report(y_pred[:,2],y_test[:,2],target_names=['Asian','Black','East Indian','Latino','Middle Eastern','Native American','Other','Pacific Islander','White']))

SVM Metrics for Ethnicity
                  precision    recall  f1-score   support

           Asian       0.16      0.70      0.26       588
           Black       0.08      0.77      0.15        93
     East Indian       0.43      0.92      0.59       131
          Latino       0.12      0.71      0.21       150
  Middle Eastern       0.14      0.82      0.24        44
 Native American       0.26      0.84      0.40       130
           Other       0.11      0.70      0.19       284
Pacific Islander       0.35      0.93      0.51        42
           White       0.99      0.78      0.87     26954

        accuracy                           0.78     28416
       macro avg       0.29      0.80      0.38     28416
    weighted avg       0.95      0.78      0.84     28416



In [89]:
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score Ideology")
print(metrics.classification_report(y_pred[:,3],y_test[:,3],target_names=["Center","Left","Other","Right"]))

SVM Accuracy Score Ideology
              precision    recall  f1-score   support

      Center       0.19      0.52      0.28      1418
        Left       0.23      0.60      0.33      1323
       Other       0.27      0.59      0.37      2352
       Right       0.93      0.63      0.75     23323

    accuracy                           0.62     28416
   macro avg       0.40      0.58      0.43     28416
weighted avg       0.80      0.62      0.68     28416

