# Using Support Vector Classification and Regression for Sentiment Analysis

In [1]:
import sys
import os
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
import pandas as pd
import pickle

### Load pre-trained models and TfidfVectorizer

In [2]:
# Load the pre-trained vectorizer and both SVM models from the working directory.
# Each file is opened in a `with` block so its handle is closed as soon as the
# object has been unpickled.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load pickle files that you created yourself or otherwise trust.
with open("vectorizer.pickle", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
with open("sentiment_SVR.pickle", "rb") as svr_file:
    SVR = pickle.load(svr_file)  # Regression model
with open("sentiment_SVC.pickle", "rb") as svc_file:
    SVC = pickle.load(svc_file)  # Classification model

### Make some predictions for sentiment

In [3]:
from svm_utils import predict_sentiment

In [4]:
# Classify two short, clearly negative sentences with the pre-trained SVC;
# one prediction is printed per sentence, in the order listed.
for sentence in (
    'aspect_term suffered enormous losses this quarter',
    "aspect_term has gone bankrupt",
):
    print(predict_sentiment(SVC, sentence, vectorizer))

['[1.0, 0.0]']
['[1.0, 0.0]']


In [5]:
# Two longer, headline-style sentences: a record loss, then a lawsuit win.
# The pound sign is written as a real "£"; the text previously carried the
# mis-decoded byte pair "Â£" (UTF-8 read as Latin-1).
print(predict_sentiment(SVC, "Engine maker aspect_term has revealed it slumped to a record loss of £4 million", vectorizer))
print(predict_sentiment(SVC, "aspect_term won a lawsuit against Intel marking a win for the tech giant", vectorizer))

['[1.0, 0.0]']
['[0.0, 1.0]']


In [6]:
# The same lawsuit sentence with two companies in it: first with the
# aspect_term placeholder as the party that was sued, then as the party
# that won.
for sentence in (
    "Apple won a lawsuit against aspect_term marking a win for the tech giant",
    "aspect_term won a lawsuit against Samsung marking a win for the tech giant",
):
    print(predict_sentiment(SVC, sentence, vectorizer))

['[1.0, 0.0]']
['[1.0, 0.0]']


As we can see from the outputs, the classifier generally performs quite well on simpler sentences; however, sentences with multiple aspects can be challenging for the model to classify.

In [7]:
# Compare the classifier and the regressor on the same inputs: for each
# sentence the SVC prediction is printed first, then the SVR prediction.
# NOTE(review): "bankruptsy" is misspelled in the last input; it is kept as-is
# because the recorded output below was produced from this exact string.
for sentence in (
    "Samsung has recorded record losses",
    "Samsung has recorded record profits",
    "Samsung files for bankruptsy.",
):
    for model in (SVC, SVR):
        print(predict_sentiment(model, sentence, vectorizer))

['[1.0, 0.0]']
[0.54261226]
['[0.0, 1.0]']
[0.83887178]
['[0.7, 0.3]']
[0.62456279]


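The trained SVR model struggles to recognize negative sentiment, even in simple sentences: the scores for the two negative sentences (0.54 and 0.62) are not far below the score for the positive one (0.84).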

### Train a new model from pickled data

In [8]:
from svm_utils import train_model
from svm_utils import create_vectorizer

In [9]:
# Build a fresh vectorizer for training. This rebinds `vectorizer`, replacing
# the pre-trained one loaded from "vectorizer.pickle" earlier in the notebook.
# NOTE(review): the option names match scikit-learn's TfidfVectorizer;
# presumably create_vectorizer forwards them to it — confirm in svm_utils.
vectorizer_options = dict(
    min_df=3,
    max_df=1.0,
    sublinear_tf=True,
    use_idf=True,
)
vectorizer = create_vectorizer(**vectorizer_options)

In [10]:
# Train a classification model (SVC) with the vectorizer created above and
# print its evaluation report. The SVC is trained on string class labels,
# hence the plain "train_labels" / "test_labels" pickles.
SVC = train_model(
    model_type='svc',
    print_results=True,
    vectorizer=vectorizer,
    train_data="train_data.pickle",
    train_labels="train_labels.pickle",
    test_data="test_data.pickle",
    test_labels="test_labels.pickle",
)

Results for LinearSVC()
Training time: 0.820324s; Prediction time: 0.006179s
             precision    recall  f1-score   support

 [0.0, 1.0]       0.73      0.69      0.71       664
 [0.3, 0.7]       0.77      0.84      0.80      1185
 [0.5, 0.5]       0.64      0.36      0.46        80
 [0.7, 0.3]       0.82      0.73      0.77       666
 [1.0, 0.0]       0.86      0.89      0.88      1063

avg / total       0.80      0.80      0.79      3658



In [11]:
# Train a regression model (SVR) and print its error metrics. The SVR is
# trained on float labels, hence the "*_SVR" label pickles.
# NOTE(review): vectorizer=None makes train_model load a default vectorizer
# (see the "No vectorizer specified, loading default" message in the output),
# so this SVR may not correspond to the `vectorizer` variable created above —
# confirm before pairing the two.
SVR = train_model(
    model_type='svr',
    print_results=True,
    vectorizer=None,
    train_data="train_data.pickle",
    train_labels="train_labels_SVR.pickle",
    test_data="test_data.pickle",
    test_labels="test_labels_SVR.pickle",
)

No vectorizer specified, loading default
Results for LinearSVR()
Training time: 4.435170s; Prediction time: 0.000921s
Mean squared error:  0.04233757775290541
r2 score:  0.6922808038520507


### Save our models and vectorizer for future use

In [None]:
# Persist the newly trained models and the vectorizer. The "_01" filenames
# differ from the pre-trained pickles loaded at the top of the notebook, so
# those files are not overwritten.
# Each file is written inside a `with` block so the handle is flushed and
# closed as soon as the dump finishes.
# NOTE(review): the SVR above was trained with vectorizer=None (a default
# vectorizer), so the `vectorizer` saved here may not be the one it was
# trained with — confirm before loading the two together.
for artifact, filename in (
    (SVR, "sentiment_SVR_01.pickle"),
    (SVC, "sentiment_SVC_01.pickle"),
    (vectorizer, "vectorizer_01.pickle"),
):
    with open(filename, "wb") as out_file:
        pickle.dump(artifact, out_file)
