# Given a dataset which contains a excerpts of text written by some author and thecorresponding author tag, implement an SVM classifier to predict the author tag of the test text excerpts

In [31]:
import numpy as np
import pandas as pd
from numpy.random import RandomState
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import svm
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
import re
from nltk.corpus import stopwords

# Accuracy calculation

In [32]:
def accuracy(y_real, y_pred):
    accuracy = np.sum(y_real == y_pred) / len(y_real)
    return accuracy

# Loading the data

In [33]:
df = pd.read_csv('/media/indranil/New Volume1/second sem/SMAI/Assignment 2/q5/data/Train(1).csv')
df = df.iloc[:,1:]
rng = RandomState()
train = df.sample(frac=0.8,random_state = rng)
validation = df.loc[~df.index.isin(train.index)]
X_train,Y_train_temp = train.iloc[:,:], train.iloc[:,:]
X_validation,Y_validation_temp = validation.iloc[:, :], validation.iloc[:,:]
Y_train = []
Y_validation = []
for i in np.array(Y_train_temp):
    Y_train.append(i[0])
for i in np.array(Y_validation_temp):
    Y_validation.append(i[0])

# Support vector machine with the parameter for the soft margin cost function C=1000

In [313]:
svc1 = Pipeline([('vect', TfidfVectorizer()),
                     ('clf', SVC(C=1000)),
])
svc1.fit(X_train.text, X_train.author)
predicted1 = svc1.predict(X_validation.text)

In [314]:
np.mean(predicted1 == X_validation.author)

0.8155122885413342

In [310]:
print(confusion_matrix(X_validation.author, predicted1))

[[1018   88   84]
 [ 156  743   41]
 [ 198   58  747]]


In [311]:
print(classification_report(X_validation.author, predicted1))

              precision    recall  f1-score   support

         EAP       0.74      0.86      0.79      1190
         HPL       0.84      0.79      0.81       940
         MWS       0.86      0.74      0.80      1003

    accuracy                           0.80      3133
   macro avg       0.81      0.80      0.80      3133
weighted avg       0.81      0.80      0.80      3133



# Support vector machine with the parameter for the soft margin cost function C=100

In [363]:
svc2 = Pipeline([('vect', TfidfVectorizer()),
                     ('clf', SVC(C=100)),
])
svc2.fit(X_train.text, X_train.author)
predicted2 = svc2.predict(X_validation.text)

In [364]:
np.mean(predicted2 == X_validation.author)

0.8078518991382062

In [365]:
print(confusion_matrix(X_validation.author, predicted2))

[[1070   97   71]
 [ 158  707   55]
 [ 166   55  754]]


In [366]:
print(classification_report(X_validation.author, predicted2))

              precision    recall  f1-score   support

         EAP       0.77      0.86      0.81      1238
         HPL       0.82      0.77      0.79       920
         MWS       0.86      0.77      0.81       975

    accuracy                           0.81      3133
   macro avg       0.82      0.80      0.81      3133
weighted avg       0.81      0.81      0.81      3133



# Support vector machine with the parameter for the soft margin cost function C=10

In [346]:
svc3 = Pipeline([('vect', TfidfVectorizer()),
                     ('clf', SVC(C=10)),
])
svc3.fit(X_train.text, X_train.author)
predicted3 = svc3.predict(X_validation.text)

In [347]:
np.mean(predicted3 == X_validation.author)

0.8078518991382062

In [348]:
print(confusion_matrix(X_validation.author, predicted3))

[[1070   97   71]
 [ 158  707   55]
 [ 166   55  754]]


In [349]:
print(classification_report(X_validation.author, predicted3))

              precision    recall  f1-score   support

         EAP       0.77      0.86      0.81      1238
         HPL       0.82      0.77      0.79       920
         MWS       0.86      0.77      0.81       975

    accuracy                           0.81      3133
   macro avg       0.82      0.80      0.81      3133
weighted avg       0.81      0.81      0.81      3133



# Support vector machine using Linear Kernel

In [350]:
svc4 = Pipeline([('vect', TfidfVectorizer()),
                     ('clf', SVC(kernel='linear')),
])
svc4.fit(X_train.text, X_train.author)
predicted4 = svc4.predict(X_validation.text)

In [351]:
np.mean(predicted4 == X_validation.author)

0.7992339610596872

In [352]:
print(confusion_matrix(X_validation.author, predicted4))

[[1041  113   84]
 [ 155  711   54]
 [ 168   55  752]]


In [353]:
print(classification_report(X_validation.author, predicted))

              precision    recall  f1-score   support

         EAP       0.70      0.75      0.73      1238
         HPL       0.77      0.68      0.72       920
         MWS       0.72      0.73      0.73       975

    accuracy                           0.73      3133
   macro avg       0.73      0.72      0.73      3133
weighted avg       0.73      0.73      0.73      3133



# Support vector machine using Polynomial Kernel

In [354]:
svc5 = Pipeline([('vect', TfidfVectorizer()),
                     ('clf', SVC( kernel='poly')),
])
svc5.fit(X_train.text, X_train.author)
predicted5 = svc5.predict(X_validation.text)

In [355]:
np.mean(predicted5 == X_validation.author)

0.42642834344079156

In [356]:
print(confusion_matrix(X_validation.author, predicted5))

[[1223    4   11]
 [ 892   23    5]
 [ 885    0   90]]


In [357]:
print(classification_report(X_validation.author, predicted5))

              precision    recall  f1-score   support

         EAP       0.41      0.99      0.58      1238
         HPL       0.85      0.03      0.05       920
         MWS       0.85      0.09      0.17       975

    accuracy                           0.43      3133
   macro avg       0.70      0.37      0.26      3133
weighted avg       0.68      0.43      0.29      3133



# Support vector machine using RBF Kernel

In [359]:
svc6 = Pipeline([('vect', TfidfVectorizer()),
                     ('clf', SVC( kernel='rbf')),
])
svc6.fit(X_train.text, X_train.author)
predicted6 = svc6.predict(X_validation.text)

In [360]:
np.mean(predicted6 == X_validation.author)

0.7931694861155442

In [361]:
print(confusion_matrix(X_validation.author, predicted6))

[[1080   86   72]
 [ 182  689   49]
 [ 202   57  716]]


In [362]:
print(classification_report(X_validation.author, predicted6))

              precision    recall  f1-score   support

         EAP       0.74      0.87      0.80      1238
         HPL       0.83      0.75      0.79       920
         MWS       0.86      0.73      0.79       975

    accuracy                           0.79      3133
   macro avg       0.81      0.79      0.79      3133
weighted avg       0.80      0.79      0.79      3133



# Support vector machine applied with some Data pre processing. All the cases have been lower cases. Digits and punctuation have been removed. Stop words like and, this, or, so in, to have been removed.

In [34]:
df.loc[:,"text"] = df.text.apply(lambda x : str.lower(x))
df.loc[:,"text"] = df.text.apply(lambda x : " ".join(re.findall('[\w]+',x)))
df = df.replace(to_replace =["and","And","this","This","or","So","in","In","to","To"],value ="") 

# Loading the data

In [35]:
rng = RandomState()
train = df.sample(frac=0.8,random_state = rng)
validation = df.loc[~df.index.isin(train.index)]
X_train,Y_train_temp = train.iloc[:,:], train.iloc[:,:]
X_validation,Y_validation_temp = validation.iloc[:, :], validation.iloc[:,:]
Y_train = []
Y_validation = []
for i in np.array(Y_train_temp):
    Y_train.append(i[0])
for i in np.array(Y_validation_temp):
    Y_validation.append(i[0])

# After applying SVM on pre processed data the result has been improved. The unnecessary words like or,and so, the has been removed which carry no value to our result. All the words are lower cases so e.g. The and the are same now. Punctuation and digits have been removed which carry no meaning in our result. As a result of this the result has been imporved.

In [37]:
text_clf = Pipeline([('vect', TfidfVectorizer()),
                     ('clf', SVC(C=1000)),
])
text_clf.fit(X_train.text, X_train.author)
predicted = text_clf.predict(X_validation.text)
np.mean(predicted == X_validation.author)

0.8276412384296201

In [39]:
print(confusion_matrix(X_validation.author, predicted))

[[1123   70   90]
 [ 151  709   44]
 [ 122   63  761]]


In [41]:
print(classification_report(X_validation.author, predicted))

              precision    recall  f1-score   support

         EAP       0.80      0.88      0.84      1283
         HPL       0.84      0.78      0.81       904
         MWS       0.85      0.80      0.83       946

    accuracy                           0.83      3133
   macro avg       0.83      0.82      0.83      3133
weighted avg       0.83      0.83      0.83      3133

