We import make_regression from sklearn.datasets because it will help us create the dataset for todayâ€™s regression problem (recall that up to now, we have no dataset ðŸ™‚ )
From sklearn.multioutput we import MultiOutputRegressor â€“ itâ€™s the wrapper we discussed in the previous section.
As we will convert an SVR model into a multioutput regressor, we must import SVR from sklearn.svm.
After generating the dataset with make_regression, we must split it into train/test sets. We can do so using sklearn.model_selectionâ€˜s train_test_split.
Finally, we import mean_squared_error and mean_absolute_error from sklearn.metrics for evaluating our model. Those are default error functions for regression problems.

In [63]:
import sqlite3
from sklearn import metrics
from scipy.sparse import hstack
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm 
from sklearn.metrics import accuracy_score
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns
from sklearn.datasets import make_regression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [64]:
text=pd.read_csv("tokenized.csv")

In [65]:
len(text)

94718

In [66]:
text_1=text.sample(n=7000)

In [67]:
text_1=text_1.reset_index()

In [26]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(text_1[["title_final",'text_final',"side"]],text_1[['gender','religion','ethnicity','ideology']],test_size=0.3)

#encoding the X value
Encoder = LabelEncoder()
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(text_1["title_final"])
Train_X_Tfidf_title = Tfidf_vect.transform(Train_X["title_final"])
Test_X_Tfidf_title = Tfidf_vect.transform(Test_X["title_final"])
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(text_1["text_final"])
Train_X_Tfidf_text = Tfidf_vect.transform(Train_X["text_final"])
Test_X_Tfidf_text = Tfidf_vect.transform(Test_X["text_final"])
Train_X_1_pos = Encoder.fit_transform(Train_X['side'])
Test_X_1_pos = Encoder.fit_transform(Test_X['side'])
mat_train= hstack([Train_X_Tfidf_title,Train_X_Tfidf_text])
X_train_tfidf = hstack([mat_train, Train_X_1_pos.reshape(-1, 1)])
mat_test= hstack([Test_X_Tfidf_title,Test_X_Tfidf_text])
X_test_tfidf = hstack([mat_test, Test_X_1_pos.reshape(-1, 1)])

In [27]:
#encoding Y

#gender
Train_Y_gender = Encoder.fit_transform(Train_Y['gender'])
Test_Y_gender = Encoder.fit_transform(Test_Y['gender'])

#religion
Train_Y_rel = Encoder.fit_transform(Train_Y['religion'])
Test_Y_rel = Encoder.fit_transform(Test_Y['religion'])

#ethnicity
Train_Y_ethn = Encoder.fit_transform(Train_Y['ethnicity'])
Test_Y_ethn = Encoder.fit_transform(Test_Y['ethnicity'])

#ideology
Train_Y_id = Encoder.fit_transform(Train_Y['ideology'])
Test_Y_id = Encoder.fit_transform(Test_Y['ideology'])

#stacking into a vector
y_train=np.stack((Train_Y_gender,Train_Y_rel,Train_Y_ethn,Train_Y_id),axis=1)
y_test=np.stack((Test_Y_gender,Test_Y_rel,Test_Y_ethn,Test_Y_id),axis=1)

We can then move forward and construct the SVR regressor:

Here, we set the value for Ïµ (epsilon) to 0.2. It specifies the width of the â€˜error tubeâ€™ where no penalty is assigned to mispredictions, effectively allowing us to take values close to the edges of the error tube as support vectors.
If we want to apply regularization, we can also apply values for C â€“ more information here.

In [43]:
# Create the SVR regressor
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
# Create the Multioutput Regressor
mor = MultiOutputRegressor(SVM)

Finally, we can fit the training data (X_train) and y_train) to our MultiOutputRegressor. This starts the training process. Once fitting the data is complete, we can generate y_pred prediction values for our testing inputs X_test. Using the mean squared error and mean absolute error, we can then evaluate model performance:

In [44]:
# Train the regressor
mor = mor.fit(X_train_tfidf, y_train)

# Generate predictions for testing data
y_pred = mor.predict(X_test_tfidf)



In [45]:
# Evaluate the regressor
mse_one = mean_squared_error(y_test[:,0], y_pred[:,0])
mse_two = mean_squared_error(y_test[:,1], y_pred[:,1])
mse_three = mean_squared_error(y_test[:,2], y_pred[:,2])
mse_four = mean_squared_error(y_test[:,3], y_pred[:,3])
print(f'MSE for gender: {mse_one} - religion: {mse_two} - ethnicity: {mse_three} - ideology: {mse_four}')


MSE for gender: 0.43666666666666665 - religion: 2.07 - ethnicity: 9.338571428571429 - ideology: 1.9619047619047618


In [46]:
mae_one = mean_absolute_error(y_test[:,0], y_pred[:,0])
mae_two = mean_absolute_error(y_test[:,1], y_pred[:,1])
mae_three = mean_absolute_error(y_test[:,2], y_pred[:,2])
mae_four = mean_absolute_error(y_test[:,3], y_pred[:,3])
print(f'MSE for gender: {mae_one} - religion: {mae_two} - ethnicity: {mae_three} - ideology: {mae_four}')


MSE for gender: 0.22333333333333333 - religion: 0.9966666666666667 - ethnicity: 1.3947619047619049 - ideology: 0.8657142857142858


In [73]:
print("SVM Metrics for Gender")
print(metrics.classification_report(y_pred[:,0],y_test[:,0],target_names=["Female","LGBTQIAPK+","Male"]))

SVM Metrics for Gender
              precision    recall  f1-score   support

      Female       0.06      0.41      0.10        32
  LGBTQIAPK+       0.00      0.00      0.00         0
        Male       0.99      0.89      0.94      2068

    accuracy                           0.88      2100
   macro avg       0.35      0.43      0.35      2100
weighted avg       0.98      0.88      0.92      2100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [76]:
print("SVM Metrics for Religion")
print(metrics.classification_report(y_pred[:,1],y_test[:,1],target_names=["Agnostic","Atheist","Christian","Muslim","Other"]))

SVM Metrics for Religion
              precision    recall  f1-score   support

    Agnostic       0.21      0.31      0.25       205
     Atheist       0.36      0.38      0.37       547
   Christian       0.64      0.46      0.54      1123
      Muslim       0.10      1.00      0.18         3
       Other       0.21      0.34      0.26       222

    accuracy                           0.42      2100
   macro avg       0.30      0.50      0.32      2100
weighted avg       0.48      0.42      0.44      2100



In [72]:
print("SVM Metrics for Ethnicity")
print(metrics.classification_report(y_pred[:,2],y_test[:,2],target_names=['Asian','Black','East Indian','Latino','Middle Eastern','Native American','Other','Pacific Islander','White']))

SVM Metrics for Ethnicity
                  precision    recall  f1-score   support

           Asian       0.04      0.32      0.07        25
           Black       0.04      1.00      0.07         2
     East Indian       0.22      1.00      0.36         6
          Latino       0.02      0.25      0.04         4
  Middle Eastern       0.00      0.00      0.00         0
 Native American       0.05      0.25      0.08         4
           Other       0.03      0.45      0.06        11
Pacific Islander       0.00      0.00      0.00         1
           White       0.98      0.75      0.85      2047

        accuracy                           0.74      2100
       macro avg       0.15      0.45      0.17      2100
    weighted avg       0.96      0.74      0.83      2100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [75]:
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score Ideology")
print(metrics.classification_report(y_pred[:,3],y_test[:,3],target_names=["Center","Left","Other","Right"]))

SVM Accuracy Score Ideology
              precision    recall  f1-score   support

      Center       0.09      0.23      0.12       109
        Left       0.14      0.48      0.22        84
       Other       0.18      0.39      0.24       171
       Right       0.88      0.58      0.70      1736

    accuracy                           0.55      2100
   macro avg       0.32      0.42      0.32      2100
weighted avg       0.75      0.55      0.61      2100

