# SVM

### $$f(x) = \mathbf{w}^T \mathbf{x} + b$$

### Hard Margin
$$\begin{align}
\min_{w,b} \quad & \frac{1}{2} \|w\|^2 \\
\quad & y_i(\mathbf{w}^T \mathbf{x}_i + b) \geq 1 \quad \forall i
\end{align}$$

### Soft Margin
$$
\min_{w, b} \quad \frac{1}{2} \|w\|^2 + C\sum_{i = 1}^{n}{\max(0, 1 - y_i(w^Tx_i + b))}
$$

In [80]:
import pandas as pd
import numpy as np
import sklearn
import nltk
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

In [6]:
data = pd.read_csv("SVM.csv")

data.head(5)

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [18]:
data['Sentence'] = [sen.lower() for sen in data['Sentence']]
data = data[data['Sentiment'] != 'neutral']
data = data.reset_index(drop=True)

In [19]:
data

Unnamed: 0,Sentence,Sentiment
0,the geosolutions technology will leverage bene...,positive
1,"$esi on lows, down $1.50 to $2.50 bk a real po...",negative
2,"for the last quarter of 2010 , componenta 's n...",positive
3,$spy wouldn't be surprised to see a green close,positive
4,shell's $70 billion bg deal meets shareholder ...,negative
...,...,...
464,the company 's consolidated operating profit a...,positive
465,astrazeneca to buy zs pharma for $2.7 billion,positive
466,finnish waste management and cleaning group la...,negative
467,helsinki thomson financial - shares in cargote...,negative


In [20]:
mp = {'positive': 1, 'negative': -1}
data['Sentiment'] = data['Sentiment'].map(mp)

In [21]:
data

Unnamed: 0,Sentence,Sentiment
0,the geosolutions technology will leverage bene...,1
1,"$esi on lows, down $1.50 to $2.50 bk a real po...",-1
2,"for the last quarter of 2010 , componenta 's n...",1
3,$spy wouldn't be surprised to see a green close,1
4,shell's $70 billion bg deal meets shareholder ...,-1
...,...,...
464,the company 's consolidated operating profit a...,1
465,astrazeneca to buy zs pharma for $2.7 billion,1
466,finnish waste management and cleaning group la...,-1
467,helsinki thomson financial - shares in cargote...,-1


In [30]:
def pre_processing_data(data):
    sentences = []
    for sentence in data:
        sentences.append(sentence.split())
    for i in range(len(sentences)):
        sentences[i] = [word for word in sentences[i] if word.isalnum()]
    return sentences

In [31]:
sentences = pre_processing_data(data['Sentence'])
sentences[0]

['the',
 'geosolutions',
 'technology',
 'will',
 'leverage',
 'benefon',
 'gps',
 'solutions',
 'by',
 'providing',
 'location',
 'based',
 'search',
 'technology',
 'a',
 'communities',
 'platform',
 'location',
 'relevant',
 'multimedia',
 'content',
 'and',
 'a',
 'new',
 'and',
 'powerful',
 'commercial',
 'model']

In [32]:
data_train, data_test = train_test_split(data, test_size=0.2)

In [33]:
data_test

Unnamed: 0,Sentence,Sentiment
208,finnish financial group aktia reports operatin...,1
327,`` i 'm pleased to receive the nomination comm...,1
304,current holdings via options - long: $aapl $v ...,1
297,`` the announced investment of the carmaker hy...,1
313,finnish-swedish tietoenator is expanding its b...,1
...,...,...
44,"in september 2010 , the finnish group agreed t...",1
48,$tsla recall,-1
449,netapp downgraded by macquarie to underperform...,-1
222,a helsinki : eliiv today reported eps of eur1 ...,1


In [34]:
data_train

Unnamed: 0,Sentence,Sentiment
447,under this agreement biohit becomes a focus su...,1
283,$lscc accumulation chart continues to make new...,1
35,most bullish stocks on twitter during this dip...,1
188,@chessnwine: $iwm 30-minute chart. small caps ...,-1
183,"with the measures , suominen corporation aims ...",1
...,...,...
350,"ebit excluding non-recurring items , totalled ...",1
399,"operating profit totaled eur 3.8 mn , down fro...",-1
429,"eps for the quarter was eur0 .00 , as compared...",1
319,the total headcount reduction will be 50 perso...,-1


In [35]:
X_train = [' '.join(sentence) for sentence in pre_processing_data(data_train['Sentence'])]
X_train

['under this agreement biohit becomes a focus supplier of pipettors and disposable pipettor tips to vwr customers throughout europe',
 'accumulation chart continues to make new highs showing accumulation continuing to take place',
 'most bullish stocks on twitter during this',
 'small caps threatening descending triangle breakdown under',
 'with the measures suominen corporation aims to ensure the competitiveness of the flexible packaging business in the long term',
 'the six breweries recorded a percent growth in domestic beer sales last year to million liters from million liters sold in 2005',
 'rolling over for a new intraday bottom pickers continue to get destroyed',
 'see some higher highs and higher lows rest of the day slow and steady',
 'standard rbs escape capital raising in stress test',
 'marimekko has today 20 november signed a license agreement with the swedish chain whereby marimekko will license some of its popular patterns from the 1950s 1960s and 1970s to',
 'breaking 

In [36]:
X_test = [' '.join(sentence) for sentence in pre_processing_data(data_test['Sentence'])]

X_test

['finnish financial group aktia reports operating profit of eur mn in 2009 up from eur mn in the corresponding period in 2008',
 'i pleased to receive the nomination committee request and confidence says jon risfelt',
 'current holdings via options',
 'the announced investment of the carmaker hyundai for example sounds optimistically for us as of course new cars mean new tires',
 'tietoenator is expanding its business quickly in russia',
 'the company profit totaled ls in h1 2007 down',
 'finnlines estimated in its annual general meeting that 2008 will be financially a tough year due to large investments',
 'billion bg deal meets shareholder skepticism',
 'tesco sales rise shows tentative recovery continues',
 'profitability ebit was compared to in the period',
 'tesco and morrisons will all cut petrol prices as oil falls',
 'retailers kingfisher and sports direct rise in share index',
 'the move was triggered by weak demand for forestry equipment and the uncertain market situation',
 

In [39]:
y_train = data_train['Sentiment'].values
y_test = data_test['Sentiment'].values
y_train

array([ 1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,
       -1, -1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1, -1, -1, -1,
        1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1, -1,
       -1, -1, -1,  1, -1,  1,  1, -1,  1,  1,  1, -1,  1, -1,  1,  1,  1,
        1, -1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1, -1,
        1,  1,  1, -1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1,  1,  1,
       -1,  1, -1, -1,  1,  1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1,
        1, -1, -1, -1,  1,  1, -1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1,
       -1, -1, -1,  1,  1,  1, -1,  1, -1,  1,  1,  1, -1,  1,  1,  1,  1,
        1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,
        1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1,  1, -1,
       -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
       -1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1,
        1,  1,  1,  1,  1

In [85]:
tfidf = TfidfVectorizer(max_features=300)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.fit_transform(X_test)



In [76]:
print(X_train_tfidf.toarray())


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [92]:
def convert_to_dense_if_sparse(X):
    if hasattr(X, 'toarray'):
        return X.toarray()
    return X
class SVM_classifier():
    def __init__(self, learning_rate = 0.01, C = 10, iteration = 1000):
        self.learning_rate = learning_rate
        self.C = C
        self.iteration = iteration
        self.W = None
        self.B = None
    def compute_hingle_loss(self, W, B, X_batch, y_batch):
        N = X_batch.shape[0]
        distance = []
        for idx, x in enumerate(X_batch):
            distance.append(max(0, 1 - y_batch[idx]*np.dot(x, W) + B))
        distance = np.array(distance)
        hingle_loss = np.dot(W, W)/2 + self.C*(sum(distance)/N)
        return hingle_loss
    def gradient_descent(self, W, B, X_batch, y_batch):
        distance = []
        for idx, x in enumerate(X_batch):
            distance.append(1 - y_batch[idx] * (np.dot(x, W) + B))
        dW = np.zeros(len(W))
        dB = 0
        for idx, d in enumerate(distance):
            if max(0, d) == 0:
                dW += W
                dB += 0
            else:
                dW += (W - self.C*y_batch[idx]*X_batch[idx])
                dB += 0 - self.C*y_batch[idx]
        dW = dW / len(y_batch)
        dB = dB / len(y_batch)
        return dW, dB
    def fit(self, features, outputs) -> bool:
        features = convert_to_dense_if_sparse(features)

        max_epochs = self.iteration
        weights = np.zeros(features.shape[1])
        bias = 0
        nth = 0
        cost_threshold = 0.01
        prev_cost = float("inf")
        for epoch in range(1, max_epochs):
            gradW, gradB = self.gradient_descent(weights, bias, features, outputs)
            if epoch == 2 ** nth or epoch == max_epochs - 1:
                cost = self.compute_hingle_loss(weights, bias, features, outputs)
                print("Epoch is:{} and Cost is: {}".format(epoch, cost))
                if abs(prev_cost - cost) < cost_threshold * prev_cost:
                    self.W = weights
                    self.B = bias
                    return True
                prev_cost = cost
                nth += 1
            prev_cost = cost
            weights = weights - (self.learning_rate * gradW)
            bias = bias - (self.learning_rate * gradB)
        self.W = weights
        self.B = bias
        return True

    def predict(self, X):
        X = convert_to_dense_if_sparse(X)
        prediction = []
        for x in X:
            prediction.append(np.dot(x, self.W) + self.B)
        return np.sign(prediction)
    

In [93]:
svm = SVM_classifier(learning_rate=0.01, C=100)
svm.fit(X_train_tfidf, y_train)

Epoch is:1 and Cost is: 100.0
Epoch is:2 and Cost is: 139.918788656214
Epoch is:4 and Cost is: 219.84025411202856
Epoch is:8 and Cost is: 196.71965983307456
Epoch is:16 and Cost is: 191.1428376989759
Epoch is:32 and Cost is: 183.3576606662524
Epoch is:64 and Cost is: 175.72822400962676
Epoch is:128 and Cost is: 166.47337515617667
Epoch is:256 and Cost is: 163.54263361981592
Epoch is:512 and Cost is: 162.80432026731754


True

In [94]:
y_pred = svm.predict(X_test_tfidf)
print(y_pred)

[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1. -1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
 -1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.]


In [96]:
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)


0.6914893617021277
