# Support Vector Machine Classification

&nbsp;

In [1]:
# Importing Libraries
import numpy as np
import pandas as pd

In [2]:
# Loading DataFrame

# Required Library
import pickle

# NOTE: pickle.load can execute arbitrary code embedded in the file,
# so only load pickle files that come from a trusted source.
file = "pklFiles/DATAFRAME.pkl"

# The with-block closes the file handle even if unpickling raises
with open(file, 'rb') as fileobj:
    df = pickle.load(fileobj)

# Quick sanity check: object type and the first few rows
print(type(df))
df.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,TITLE,CATEGORY,ENCODED_CATEGORY
0,fed offici say weak data caus weather slow taper,Business,0
1,fed charl plosser see high bar chang pace taper,Business,0
2,us open stock fall fed offici hint acceler taper,Business,0
3,fed risk fall 'behind curv charl plosser say,Business,0
4,fed plosser nasti weather curb job growth,Business,0


In [3]:
# Features : news headlines (TITLE column)
# Target   : encoded news category (ENCODED_CATEGORY column)
X, y = df['TITLE'], df['ENCODED_CATEGORY']

In [4]:
# Train / test split of the headlines and their encoded categories

# Required Library
from sklearn.model_selection import train_test_split

# 25% of the samples are held out for testing, the remaining 75% are used for training.
# A fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=51,
)

In [5]:
# Shapes of the full feature / target series
print(f"Shape of X : {X.shape}")
print(f"Shape of y : {y.shape}")

# Blank separator, then the shapes of each split
print("\n")
print(f"Shape of X_train : {X_train.shape}")
print(f"Shape of y_train : {y_train.shape}")
print(f"Shape of X_test  : {X_test.shape}")
print(f"Shape of y_test  : {y_test.shape}")

Shape of X : (422419,)
Shape of y : (422419,)


Shape of X_train : (316814,)
Shape of y_train : (316814,)
Shape of X_test  : (105605,)
Shape of y_test  : (105605,)


&nbsp;

### Feature Extraction : TF-IDF Approach

In [6]:
# Feature Extraction

# Required Library
from sklearn.feature_extraction.text import TfidfVectorizer

# Loading Fitted TfidfVectorizer
# The with-block closes the pickle file as soon as the vectorizer is loaded
# (the previous bare open() left the handle open until garbage collection).
# NOTE: only unpickle files from a trusted source.
with open("pklFiles/tfidf_vectorizer.pkl", "rb") as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

# Transforming Training Data (X_train)
# transform() only applies the loaded vectorizer; it is not re-fitted here
tfidf_X_train = tfidf_vectorizer.transform(X_train.values)

# Transforming Testing Data (X_test)
tfidf_X_test = tfidf_vectorizer.transform(X_test.values)

In [7]:
# Support Vector Machine

# Required Library
from sklearn.svm import SVC

# Instantiating SVM Classifier with Regularization Parameter, C = 1.0
# NOTE: gamma is only used by the 'rbf', 'poly' and 'sigmoid' kernels, so it has
# no effect with kernel='linear'; it is kept so the saved model's parameters stay unchanged.
# NOTE: SVC fit time grows at least quadratically with the number of samples;
# scikit-learn suggests LinearSVC for large datasets.
svm_classifier = SVC(C=1.0, kernel='linear', gamma='auto')

# Fitting svm_classifier to Training Data
svm_classifier.fit(tfidf_X_train, y_train)

# Saving svm_classifier for tfidf_vectorizer
# The with-block flushes and closes the file deterministically, so the pickle
# on disk is complete as soon as this cell finishes.
with open("pklFiles/svm_classifier_for_tfidf_vectorizer.pkl", "wb") as model_file:
    pickle.dump(svm_classifier, model_file)

In [8]:
# Prediction
# Predicts the encoded category of every test headline from its TF-IDF features,
# using the classifier fitted on tfidf_X_train.
pred = svm_classifier.predict(tfidf_X_test)

In [9]:
# Evaluation : Accuracy Score & Confusion Matrix (TF-IDF features)

# Required Library
from sklearn import metrics

print("Support Vector Machine : (TF-IDF Approach) \n")

# Accuracy : fraction of test samples whose predicted label equals the true label,
# reported as a percentage with two decimals
a_score = metrics.accuracy_score(y_test, pred)
print(f"Accuracy : {a_score * 100:.2f}", '%')

print("\n")

# Confusion Matrix
# Labels : 0(Business), 1(Entertainment), 2(Health), 3(Science & Technology)
# Rows are the true labels and columns are the predicted labels;
# by default both run over the sorted labels, i.e. 0 to 3.
confusion_matrix = metrics.confusion_matrix(y_test, pred)

print("Confusion Matrix : ")
print(confusion_matrix)

Support Vector Machine : (TF-IDF Approach) 

Accuracy : 94.59 %


Confusion Matrix : 
[[27032   410   228  1521]
 [  456 37190   128   273]
 [  425   268 10500   137]
 [ 1338   432    99 25168]]


&nbsp;

### Feature Extraction : Bag of Words (BoW) Approach

In [10]:
# Feature Extraction

# Required Library
from sklearn.feature_extraction.text import CountVectorizer

# Loading Fitted CountVectorizer
# The with-block closes the pickle file as soon as the vectorizer is loaded
# (the previous bare open() left the handle open until garbage collection).
# NOTE: only unpickle files from a trusted source.
with open("pklFiles/count_vectorizer.pkl", "rb") as vectorizer_file:
    count_vectorizer = pickle.load(vectorizer_file)

# Transforming Training Data (X_train)
# transform() only applies the loaded vectorizer; it is not re-fitted here
count_X_train = count_vectorizer.transform(X_train.values)

# Transforming Testing Data (X_test)
count_X_test = count_vectorizer.transform(X_test.values)

In [11]:
# Support Vector Machine

# Required Library
from sklearn.svm import SVC

# Instantiating SVM Classifier with Regularization Parameter, C = 1.0
# NOTE: this rebinds svm_classifier, so any model previously held under that
# name is afterwards reachable only through its own pickle file.
# NOTE: gamma is only used by the 'rbf', 'poly' and 'sigmoid' kernels, so it has
# no effect with kernel='linear'; it is kept so the saved model's parameters stay unchanged.
# NOTE: SVC fit time grows at least quadratically with the number of samples;
# scikit-learn suggests LinearSVC for large datasets.
svm_classifier = SVC(C=1.0, kernel='linear', gamma='auto')

# Fitting svm_classifier to Training Data
svm_classifier.fit(count_X_train, y_train)

# Saving svm_classifier for count_vectorizer
# The with-block flushes and closes the file deterministically, so the pickle
# on disk is complete as soon as this cell finishes.
with open("pklFiles/svm_classifier_for_count_vectorizer.pkl", "wb") as model_file:
    pickle.dump(svm_classifier, model_file)

In [12]:
# Prediction
# Predicts the encoded category of every test headline from its bag-of-words counts,
# using the classifier fitted on count_X_train. This rebinds `pred`, replacing any
# predictions stored under that name earlier.
pred = svm_classifier.predict(count_X_test)

In [13]:
# Evaluation : Accuracy Score & Confusion Matrix (bag-of-words features)

# Required Library
from sklearn import metrics

print("Support Vector Machine : (BOG Approach) \n")

# Accuracy : fraction of test samples whose predicted label equals the true label,
# reported as a percentage with two decimals
a_score = metrics.accuracy_score(y_test, pred)
print(f"Accuracy : {a_score * 100:.2f}", '%')

print("\n")

# Confusion Matrix
# Labels : 0(Business), 1(Entertainment), 2(Health), 3(Science & Technology)
# Rows are the true labels and columns are the predicted labels;
# by default both run over the sorted labels, i.e. 0 to 3.
confusion_matrix = metrics.confusion_matrix(y_test, pred)

print("Confusion Matrix :")
print(confusion_matrix)

Support Vector Machine : (BOG Approach) 

Accuracy : 94.44 %


Confusion Matrix :
[[27009   402   267  1513]
 [  533 37083   151   280]
 [  437   245 10520   128]
 [ 1372   431   111 25123]]
