# Using Support Vector Classification and Regression for Sentiment Analysis

In [71]:
import sys
import os
import time
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
import pandas as pd
import pickle

### Load pre-trained models and TfidfVectorizer

In [22]:
# Load the pre-trained vectorizer and both models from disk.
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# files that were produced by this project and are kept in a trusted location.
# Each file is opened in a `with` block so its handle is closed as soon as the
# object has been read, instead of lingering until garbage collection.
with open("vectorizer.pickle", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
with open("sentiment_SVR.pickle", "rb") as svr_file:
    SVR = pickle.load(svr_file)  # Regression model
with open("sentiment_SVC.pickle", "rb") as svc_file:
    SVC = pickle.load(svc_file)  # Classification model

### Make some predictions for sentiment

In [23]:
from svm_utils import predict_sentiment

In [4]:
# Score two short sentences with the classification model and print each result.
for sentence in (
    'aspect_term suffered enormous losses this quarter',
    "aspect_term has gone bankrupt",
):
    print(predict_sentiment(SVC, sentence, vectorizer))

['[1.0, 0.0]']
['[1.0, 0.0]']


In [10]:
# Score two longer, headline-style sentences with the classification model.
for sentence in (
    "Engine maker aspect_term has revealed it slumped to a record loss of £4 million",
    "aspect_term wins lawsuit against Intel marking a win for the tech giant",
):
    print(predict_sentiment(SVC, sentence, vectorizer))

['[1.0, 0.0]']
['[0.3, 0.7]']


In [12]:
# Same lawsuit story phrased three ways: the placeholder `aspect_term` is the
# loser in the first sentence and the winner in the other two.
for sentence in (
    "Apple wins lawsuit against aspect_term",
    "aspect_term wins lawsuit against Samsung",
    "aspect_term won a lawsuit against Samsung",
):
    print(predict_sentiment(SVC, sentence, vectorizer))

['[1.0, 0.0]']
['[0.3, 0.7]']
['[0.7, 0.3]']


As the outputs show, the classifier generally performs well on simple sentences; however, sentences that mention multiple aspects can be challenging for the model to classify.

In [7]:
# Print the classifier (SVC) result followed by the regressor (SVR) result for
# each sentence, so the two models can be compared line by line.
# NOTE(review): "bankruptsy" is misspelled; it is kept verbatim so the recorded
# outputs stay reproducible — confirm whether "bankruptcy" was intended.
for sentence in (
    "Samsung has recorded record losses",
    "Samsung has recorded record profits",
    "Samsung files for bankruptsy.",
):
    for model in (SVC, SVR):
        print(predict_sentiment(model, sentence, vectorizer))

['[1.0, 0.0]']
[0.54261226]
['[0.0, 1.0]']
[0.83887178]
['[0.7, 0.3]']
[0.62456279]


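The trained SVR model struggles to recognize negative sentiment, even in simple sentences: the two clearly negative examples above score 0.54 and 0.62, not far below the 0.84 given to the positive one.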

### Train a new model from pickled data

In [8]:
from svm_utils import train_model
from svm_utils import create_vectorizer

In [9]:
# Build a fresh vectorizer from explicit settings. This rebinds `vectorizer`,
# replacing whatever object the name referred to before this cell ran.
vectorizer_settings = dict(
    min_df=3,
    max_df=1.0,
    sublinear_tf=True,
    use_idf=True,
)
vectorizer = create_vectorizer(**vectorizer_settings)

In [10]:
# Train a classification model. SVC requires strings as labels
svc_training_args = dict(
    model_type='svc',
    print_results=True,
    vectorizer=vectorizer,
    train_data="train_data.pickle",
    train_labels="train_labels.pickle",
    test_data="test_data.pickle",
    test_labels="test_labels.pickle",
)
SVC = train_model(**svc_training_args)

Results for LinearSVC()
Training time: 0.820324s; Prediction time: 0.006179s
             precision    recall  f1-score   support

 [0.0, 1.0]       0.73      0.69      0.71       664
 [0.3, 0.7]       0.77      0.84      0.80      1185
 [0.5, 0.5]       0.64      0.36      0.46        80
 [0.7, 0.3]       0.82      0.73      0.77       666
 [1.0, 0.0]       0.86      0.89      0.88      1063

avg / total       0.80      0.80      0.79      3658



In [11]:
# Train a regression model. SVR requires floats as labels
# NOTE(review): vectorizer=None makes train_model report "No vectorizer
# specified, loading default" (see the recorded output), so this model is not
# trained with the `vectorizer` object passed to the SVC training call.
# Confirm the default vectorizer is the one this model should be paired with.
svr_training_args = dict(
    model_type='svr',
    print_results=True,
    vectorizer=None,
    train_data="train_data.pickle",
    train_labels="train_labels_SVR.pickle",
    test_data="test_data.pickle",
    test_labels="test_labels_SVR.pickle",
)
SVR = train_model(**svr_training_args)

No vectorizer specified, loading default
Results for LinearSVR()
Training time: 4.435170s; Prediction time: 0.000921s
Mean squared error:  0.04233757775290541
r2 score:  0.6922808038520507


### Save our models and vectorizer for future use

In [None]:
# Persist both models and the vectorizer. Writing inside `with` blocks
# guarantees every file is flushed and closed, even if pickling raises.
with open("sentiment_SVR_01.pickle", "wb") as svr_file:
    pickle.dump(SVR, svr_file)
with open("sentiment_SVC_01.pickle", "wb") as svc_file:
    pickle.dump(SVC, svc_file)
with open("vectorizer_01.pickle", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)


# Take outputs from the summarizer and perform sentiment classification

In [19]:
# Load the pickled summarizer output; the `with` block closes the file handle.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
with open("summarizer_output1.pickle", "rb") as summarizer_file:
    summarizer_output = pickle.load(summarizer_file)

In [26]:
# Convert the loaded object into a DataFrame (keys become columns, the default
# orientation spelled out explicitly) and preview the first five rows.
summarizer_output = pd.DataFrame.from_dict(summarizer_output, orient='columns')
summarizer_output.head()

Unnamed: 0,ids,stories,summaries_3sent,summaries_more_coverage,summaries_no_coverage,summaries_some_coverage
0,3AA7765F8C65D4F162EDDC96D45F9921,Net management fees fell by 9pc over the cours...,Net management fees fell by 9pc over the cours...,Net management fees fell by 9pc over the year ...,the company prefers to focus on adjusted pre-t...,Net management fees fell by 9pc over the cours...
1,80D13E3FFA11AA74CDCD4C583D283BB2,"Tue, 17th Jan 2017 07:07(ShareCast News) - Bri...","Tue, 17th Jan 2017 07:07(ShareCast News) - Bri...",British American tobacco took the plastic film...,the FTSE 100 company said the deal has been un...,British American tobacco took the plastic film...
2,D1E2864466E893B47EA1A8CBF0E973D0,LONDON-- Glencore PLC rode a wave of surging c...,LONDON-- Glencore PLC rode a wave of surging c...,Glencore PLC rode a wave of surging commodity ...,Glencore PLC rode a wave of surging commodity ...,Glencore PLC rode a wave of surging commodity ...
3,F4BEE88A2BF951CC7850CFBB45418F86,"SANTA CLARA, Calif. (AP) -- Intel will buy Isr...","SANTA CLARA, Calif. (AP) -- Intel will buy Isr...",Intel will buy Israel's Mobileye in a deal val...,Intel will buy Israel's Mobileye in a deal val...,Intel Corp. will pay $63.54 for each share of ...
4,487FFE96C5959CFAC14DD77CF53E0D1E,Stalwarts such as Sears (SHLD) and JCPenney (J...,Stalwarts such as Sears (SHLD) and JCPenney (J...,Amazon has the AmazonFresh grocery delivery se...,on area that Amazon had n't ventured too far i...,"Macy's -lrb- M -rrb-, Kohl's -lrb- KSS -rrb- a..."


In [36]:
def classify_text(input_items, model, vectorizer):
    """Label every text in ``input_items`` using ``predict_sentiment``.

    Parameters
    ----------
    input_items : pandas.Series or other iterable of str
        Texts to classify. A Series' ``name`` becomes the name of the text
        column; an unnamed Series or a plain iterable falls back to ``'text'``.
    model
        Model object, passed through unchanged to ``predict_sentiment``.
    vectorizer
        Vectorizer object, passed through unchanged to ``predict_sentiment``.

    Returns
    -------
    pandas.DataFrame
        Two columns in this order: ``<name>`` with the input texts and
        ``<name>_label`` with the first element of each ``predict_sentiment``
        result. Rows are numbered from 0 in input order; the index of
        ``input_items`` is not carried over.
    """
    text_column = getattr(input_items, 'name', None)
    if text_column is None:
        # An unnamed Series or a list has no usable name; without a fallback
        # the label column name could not be built (None + '_label' fails).
        text_column = 'text'
    label_column = str(text_column) + '_label'

    # Materialise once so texts and labels are guaranteed to line up.
    texts = list(input_items)
    labels = [predict_sentiment(model, text, vectorizer)[0] for text in texts]

    # Passing `columns` pins the column order regardless of dict ordering.
    return pd.DataFrame(
        {text_column: texts, label_column: labels},
        columns=[text_column, label_column],
    )

In [37]:
def _classify_column(column_name):
    """Run classify_text with the SVC model on one column of summarizer_output."""
    return classify_text(summarizer_output[column_name], SVC, vectorizer)


# One labelled frame per text column: the full stories and each summary variant.
stories_sent = _classify_column('stories')
summaries_3sent_sent = _classify_column('summaries_3sent')
summaries_no_coverage_sent = _classify_column('summaries_no_coverage')
summaries_some_coverage_sent = _classify_column('summaries_some_coverage')
summaries_more_coverage_sent = _classify_column('summaries_more_coverage')

In [39]:
stories_sent.head()

Unnamed: 0,stories,stories_label
0,Net management fees fell by 9pc over the cours...,"[0.7, 0.3]"
1,"Tue, 17th Jan 2017 07:07(ShareCast News) - Bri...","[0.0, 1.0]"
2,LONDON-- Glencore PLC rode a wave of surging c...,"[0.0, 1.0]"
3,"SANTA CLARA, Calif. (AP) -- Intel will buy Isr...","[0.0, 1.0]"
4,Stalwarts such as Sears (SHLD) and JCPenney (J...,"[0.0, 1.0]"


In [41]:
summaries_3sent_sent.head()

Unnamed: 0,summaries_3sent,summaries_3sent_label
0,Net management fees fell by 9pc over the cours...,"[1.0, 0.0]"
1,"Tue, 17th Jan 2017 07:07(ShareCast News) - Bri...","[0.0, 1.0]"
2,LONDON-- Glencore PLC rode a wave of surging c...,"[0.0, 1.0]"
3,"SANTA CLARA, Calif. (AP) -- Intel will buy Isr...","[0.0, 1.0]"
4,Stalwarts such as Sears (SHLD) and JCPenney (J...,"[0.3, 0.7]"


In [42]:
summaries_no_coverage_sent.head()

Unnamed: 0,summaries_no_coverage,summaries_no_coverage_label
0,the company prefers to focus on adjusted pre-t...,"[1.0, 0.0]"
1,the FTSE 100 company said the deal has been un...,"[0.3, 0.7]"
2,Glencore PLC rode a wave of surging commodity ...,"[0.0, 1.0]"
3,Intel will buy Israel's Mobileye in a deal val...,"[0.3, 0.7]"
4,on area that Amazon had n't ventured too far i...,"[0.0, 1.0]"


In [43]:
summaries_some_coverage_sent.head()

Unnamed: 0,summaries_some_coverage,summaries_some_coverage_label
0,Net management fees fell by 9pc over the cours...,"[1.0, 0.0]"
1,British American tobacco took the plastic film...,"[0.3, 0.7]"
2,Glencore PLC rode a wave of surging commodity ...,"[0.0, 1.0]"
3,Intel Corp. will pay $63.54 for each share of ...,"[0.3, 0.7]"
4,"Macy's -lrb- M -rrb-, Kohl's -lrb- KSS -rrb- a...","[1.0, 0.0]"


In [44]:
summaries_more_coverage_sent.head()

Unnamed: 0,summaries_more_coverage,summaries_more_coverage_label
0,Net management fees fell by 9pc over the year ...,"[0.7, 0.3]"
1,British American tobacco took the plastic film...,"[0.3, 0.7]"
2,Glencore PLC rode a wave of surging commodity ...,"[0.0, 1.0]"
3,Intel will buy Israel's Mobileye in a deal val...,"[0.3, 0.7]"
4,Amazon has the AmazonFresh grocery delivery se...,"[0.3, 0.7]"


In [52]:
# Put the five labelled frames side by side (axis=1); pandas aligns them on
# their row index.
sentiment_frames = [
    stories_sent,
    summaries_3sent_sent,
    summaries_no_coverage_sent,
    summaries_some_coverage_sent,
    summaries_more_coverage_sent,
]
summaries = pd.concat(sentiment_frames, axis=1)

In [57]:
summaries

Unnamed: 0,stories,stories_label,summaries_3sent,summaries_3sent_label,summaries_no_coverage,summaries_no_coverage_label,summaries_some_coverage,summaries_some_coverage_label,summaries_more_coverage,summaries_more_coverage_label
0,Net management fees fell by 9pc over the cours...,"[0.7, 0.3]",Net management fees fell by 9pc over the cours...,"[1.0, 0.0]",the company prefers to focus on adjusted pre-t...,"[1.0, 0.0]",Net management fees fell by 9pc over the cours...,"[1.0, 0.0]",Net management fees fell by 9pc over the year ...,"[0.7, 0.3]"
1,"Tue, 17th Jan 2017 07:07(ShareCast News) - Bri...","[0.0, 1.0]","Tue, 17th Jan 2017 07:07(ShareCast News) - Bri...","[0.0, 1.0]",the FTSE 100 company said the deal has been un...,"[0.3, 0.7]",British American tobacco took the plastic film...,"[0.3, 0.7]",British American tobacco took the plastic film...,"[0.3, 0.7]"
2,LONDON-- Glencore PLC rode a wave of surging c...,"[0.0, 1.0]",LONDON-- Glencore PLC rode a wave of surging c...,"[0.0, 1.0]",Glencore PLC rode a wave of surging commodity ...,"[0.0, 1.0]",Glencore PLC rode a wave of surging commodity ...,"[0.0, 1.0]",Glencore PLC rode a wave of surging commodity ...,"[0.0, 1.0]"
3,"SANTA CLARA, Calif. (AP) -- Intel will buy Isr...","[0.0, 1.0]","SANTA CLARA, Calif. (AP) -- Intel will buy Isr...","[0.0, 1.0]",Intel will buy Israel's Mobileye in a deal val...,"[0.3, 0.7]",Intel Corp. will pay $63.54 for each share of ...,"[0.3, 0.7]",Intel will buy Israel's Mobileye in a deal val...,"[0.3, 0.7]"
4,Stalwarts such as Sears (SHLD) and JCPenney (J...,"[0.0, 1.0]",Stalwarts such as Sears (SHLD) and JCPenney (J...,"[0.3, 0.7]",on area that Amazon had n't ventured too far i...,"[0.0, 1.0]","Macy's -lrb- M -rrb-, Kohl's -lrb- KSS -rrb- a...","[1.0, 0.0]",Amazon has the AmazonFresh grocery delivery se...,"[0.3, 0.7]"
5,08:45 31 January 2017Ravender SembhyAround 400...,"[0.3, 0.7]",08:45 31 January 2017Ravender SembhyAround 400...,"[0.3, 0.7]",Yui Mok/PA WireRoyal WireRoyal WireRoyal WireR...,"[0.3, 0.7]",08:45 31 January 2017ravender SembhyAround 400...,"[0.3, 0.7]",Shell of assets in the North Sea to Chrysaor i...,"[0.3, 0.7]"
6,Tesco has been hit with a £129 million fine fr...,"[1.0, 0.0]",Tesco has been hit with a £129 million fine fr...,"[1.0, 0.0]",the supermarket giant said its subsidiary - Te...,"[1.0, 0.0]",Tesco has reached a Deferred prosecution Agree...,"[1.0, 0.0]",Tesco has reached a Deferred prosecution Agree...,"[1.0, 0.0]"
7,Hewlett Packard Enterprise (HPE) announced Tue...,"[0.3, 0.7]",Hewlett Packard Enterprise (HPE) announced Tue...,"[0.3, 0.7]",Nimble Storage has gapped open dramatically hi...,"[0.0, 1.0]",Hewlett Packard Enterprise announced Tuesday t...,"[0.3, 0.7]",Hewlett Packard Enterprise -lrb- HPE -rrb- ann...,"[0.3, 0.7]"
8,Hindustan Motors has sold off its iconic Ambas...,"[0.3, 0.7]",Hindustan Motors has sold off its iconic Ambas...,"[0.7, 0.3]",Hindustan Motors has sold off its iconic Ambas...,"[0.7, 0.3]",Hindustan Motors has sold off its iconic Ambas...,"[0.7, 0.3]",Hindustan Motors has formed an alliance with t...,"[0.3, 0.7]"
9,"10:54 am January 27, 2017 By Roland Hutchinson...","[0.3, 0.7]","10:54 am January 27, 2017 By Roland Hutchinson...","[0.3, 0.7]",Ford SmartLink will surprise and delight owner...,"[0.3, 0.7]","10:54 am January 27, 2017 by Roland Hutchinson...","[0.3, 0.7]","10:54 am January 27, 2017 by Roland Hutchinson...","[0.3, 0.7]"


In [55]:
# Save the comparison table as a pickle and as CSV. The `with` block makes sure
# the pickle file is flushed and closed even if serialisation fails.
with open("summaries_sentiment_comparison.pickle", "wb") as comparison_file:
    pickle.dump(summaries, comparison_file)
summaries.to_csv("summaries_sentiment_comparison.csv")

# Take outputs from the NER and perform sentiment classification

In [58]:
# Read the aspect file into a list of lines; each line keeps its trailing newline.
# NOTE(review): no later cell visible here reads `content`, and classify_aspects
# opens its input file itself — confirm whether this cell is still needed.
with open("Aspect_3Sent.txt") as aspect_file:
    content = list(aspect_file)

In [76]:
def classify_aspects(model, file_name, input_source_name, vectorizer):
    """Classify the sentences of an aspect file and tabulate them with their aspects.

    The file is consumed in records of three lines: the first line is the
    sentence, the second is the aspect term, and the third is ignored
    (presumably a separator line — confirm against the NER output format).

    Parameters
    ----------
    model
        Model object, passed through unchanged to ``predict_sentiment``.
    file_name : str
        Path of the text file to read.
    input_source_name
        Suffix appended to the three output column names.
    vectorizer
        Vectorizer object, passed through unchanged to ``predict_sentiment``.

    Returns
    -------
    pandas.DataFrame
        Columns ``text_<suffix>`` (sentence lines exactly as read, trailing
        newline included), ``aspects_<suffix>`` (aspect lines with newlines
        removed) and ``labels_<suffix>`` (first element of each
        ``predict_sentiment`` result), in that order. An empty file yields an
        empty frame with the same three columns.
    """
    with open(file_name) as aspect_file:
        lines = aspect_file.readlines()

    sentences = []
    aspects = []
    labels = []
    # A record only needs its sentence and aspect lines. Stopping at
    # len(lines) - 1 keeps the final record even when the file does not end
    # with a third (separator) line; the previous bound silently dropped it.
    for start in range(0, len(lines) - 1, 3):
        sentence = lines[start]
        sentences.append(sentence)
        aspects.append(lines[start + 1].replace('\n', ''))
        labels.append(predict_sentiment(model, sentence, vectorizer)[0])

    suffix = str(input_source_name)
    text_column = 'text_' + suffix
    aspect_column = 'aspects_' + suffix
    label_column = 'labels_' + suffix

    # Passing `columns` pins the column order regardless of dict ordering.
    return pd.DataFrame(
        {text_column: sentences, aspect_column: aspects, label_column: labels},
        columns=[text_column, aspect_column, label_column],
    )

In [77]:
def _aspects_with_svc(file_name, source_name):
    """Run classify_aspects on one aspect file using the SVC model."""
    return classify_aspects(SVC, file_name, source_name, vectorizer)


# One labelled frame per aspect file; the second argument becomes the column suffix.
aspect_3sent = _aspects_with_svc("Aspect_3Sent.txt", '3Sent')
aspect_no_coverage = _aspects_with_svc("Aspect_NoCoverage.txt", 'NoCoverage')
aspect_some_coverage = _aspects_with_svc("Aspect_SomeCoverage.txt", 'SomeCoverage')
aspect_more_coverage = _aspects_with_svc("Aspect_MoreCoverage.txt", 'MoreCoverage')

In [79]:
# Put the four aspect frames side by side (axis=1). pandas aligns them on row
# index only, so a given row need not describe the same sentence in each source.
aspect_frames = [
    aspect_3sent,
    aspect_no_coverage,
    aspect_some_coverage,
    aspect_more_coverage,
]
aspects_output = pd.concat(aspect_frames, axis=1)

In [82]:
# Save the combined aspect table as CSV and as a pickle. The `with` block makes
# sure the pickle file is flushed and closed even if serialisation fails.
aspects_output.to_csv("aspects_output.csv")
with open("aspects_output.pickle", "wb") as aspects_file:
    pickle.dump(aspects_output, aspects_file)

In [83]:
aspects_output

Unnamed: 0,aspects_3Sent,labels_3Sent,text_3Sent,aspects_NoCoverage,labels_NoCoverage,text_NoCoverage,aspects_SomeCoverage,labels_SomeCoverage,text_SomeCoverage,aspects_MoreCoverage,labels_MoreCoverage,text_MoreCoverage
0,Reynolds,"[0.3, 0.7]",8% of aspect_term American it doesn't already ...,Reynolds,"[0.3, 0.7]",the FTSE 100 company said the deal has been un...,BAT,"[0.3, 0.7]","5260 aspect_term ordinary shares, with the asp...",''Chief,"[0.7, 0.3]","Chief executive Luke Ellis, who took the rein..."
1,Reynolds,"[0.0, 1.0]",The FTSE 100 company said the deal has been u...,Reynolds,"[0.3, 0.7]",the FTSE 100 company said the deal has been u...,BAT,"[0.3, 0.7]","5260 aspect_term ordinary shares, with the asp...",BAT,"[0.0, 1.0]",the FTSE 100 company said the deal has been u...
2,Reynolds,"[0.0, 1.0]",The FTSE 100 company said the deal has been u...,Reynolds,"[0.3, 0.7]","under the offer, aspect_term shareholders wil...",Reynolds,"[0.3, 0.7]",aspect_term shareholders will receive $29\n,BAT,"[0.3, 0.7]",5260 aspect_term ordinary shares for each Reyn...
3,Reynolds,"[0.3, 0.7]","Under the offer, aspect_term shareholders wil...",Reynolds,"[0.3, 0.7]",5260 BAT ordinary shares for each aspect_term ...,Reynolds,"[0.3, 0.7]",64 per aspect_term share\n,Reynolds,"[0.0, 1.0]",the FTSE 100 company said the deal has been u...
4,Reynolds,"[0.3, 0.7]",5260 BAT ordinary shares for each aspect_term ...,BAT,"[0.3, 0.7]",5260 aspect_term ordinary shares for each Reyn...,JCPenney,"[0.3, 0.7]","Macy's -lrb- M -rrb-, Kohl's -lrb- KSS -rrb- a...",Reynolds,"[0.3, 0.7]",aspect_term shareholders will receive $29\n
5,BAT,"[0.0, 1.0]",The FTSE 100 company said the deal has been u...,Glencore,"[0.3, 0.7]",aspect_term PLC rode a wave of surging commodi...,Chrysaor,"[0.3, 0.7]",08:45 31 January 2017ravender SembhyAround 400...,Reynolds,"[0.3, 0.7]",5260 BAT ordinary shares for each aspect_term ...
6,BAT,"[0.0, 1.0]",The FTSE 100 company said the deal has been u...,Glencore,"[0.3, 0.7]",aspect_term PLC rode a wave of surging commod...,Chrysaor,"[1.0, 0.0]",around 400 staff are expected to transfer to ...,Mobileye,"[0.0, 1.0]",Intel will buy Israel's aspect_term in a deal ...
7,BAT,"[0.3, 0.7]",5260 aspect_term ordinary shares for each Reyn...,Glencore,"[0.0, 1.0]",Glasenberg's emphasis on dividends is ``a shi...,Chrysaor,"[1.0, 0.0]",around 400 staff are expected to transfer to ...,Intel,"[0.0, 1.0]",aspect_term will buy Israel's Mobileye in a de...
8,BAT,"[0.3, 0.7]",5260 aspect_term ordinary shares for each Reyn...,Mobileye,"[0.0, 1.0]",Intel will buy Israel's aspect_term in a deal ...,Chrysaor,"[1.0, 0.0]",around 400 staff are expected to transfer to ...,Shell,"[0.3, 0.7]",aspect_term of assets in the North Sea to Chry...
9,Mobileye,"[0.0, 1.0]",(AP) -- Intel will buy Israel's aspect_term i...,Intel,"[0.0, 1.0]",aspect_term will buy Israel's Mobileye in a de...,Tesco,"[1.0, 0.0]",aspect_term has reached a Deferred prosecution...,Shell,"[0.3, 0.7]",aspect_term will pocket an initial 3bn US dol...
