In [12]:
import sklearn
import numpy as np
import pandas as pd
import csv
import os
import joblib
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Add the Data using pandas
Corpus = pd.read_csv(r"C:\\Users\\INTEL\\Desktop\\Deep Learning\\preprocesstext_final_V1.csv",encoding='latin-1')

In [3]:
print(Corpus['Text'].head())

0    ses cac eared put hh last aspen beats eadinglo...
1                                      x bos fs st art
2    ste took ears make caldgirene tobacco flavor p...
3                               ny asean mon uu buried
4    ren halos ban dan els takes healthy nerves cha...
Name: Text, dtype: object


In [4]:
#Explore null cells
Corpus.isnull()

Unnamed: 0,Text,Class
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
5,False,False
6,False,False
7,False,False
8,False,False
9,False,False


In [5]:
#View total of null values by column
Corpus.isnull().sum()

Text     0
Class    0
dtype: int64

In [6]:
# Split the model into Train and Test Data set
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['Text'],Corpus['Class'],train_size=0.8)

In [7]:
# Label encode the target variable  - This is done to transform Categorical data of string type in the data set into numerical values
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [8]:
# Vectorize the words by using TF-IDF Vectorizer - This is done to find how important a word in document is in comaprison to the corpus
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['Text'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [9]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)


SVM Accuracy Score ->  86.87068114511352


In [10]:
predictions_SVM

array([4, 0, 4, ..., 4, 4, 1], dtype=int64)

In [11]:
print(confusion_matrix(Test_Y,predictions_SVM))
print(classification_report(Test_Y,predictions_SVM))
print(accuracy_score(Test_Y,predictions_SVM))

[[649  28  10  35  12]
 [ 50 439  24  50   9]
 [ 16  21 562  25   1]
 [ 17  16  17 421   3]
 [ 39  15   1  10 569]]
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       734
           1       0.85      0.77      0.80       572
           2       0.92      0.90      0.91       625
           3       0.78      0.89      0.83       474
           4       0.96      0.90      0.93       634

    accuracy                           0.87      3039
   macro avg       0.87      0.87      0.87      3039
weighted avg       0.87      0.87      0.87      3039

0.8687068114511353


In [13]:
#save vectorizer
joblib.dump(Tfidf_vect, "C:\\Users\\INTEL\\Desktop\\Deployment\\SVM_vectorizer.pkl")

['C:\\Users\\INTEL\\Desktop\\Deployment\\SVM_vectorizer.pkl']

In [14]:
#save model
joblib.dump(SVM, "C:\\Users\\INTEL\\Desktop\\Deployment\\svm_model.pkl")

['C:\\Users\\INTEL\\Desktop\\Deployment\\svm_model.pkl']

In [15]:
model = joblib.load('C:\\Users\\INTEL\\Desktop\\Deployment\\svm_model.pkl') 

In [16]:
vectorizer = joblib.load('C:\\Users\\INTEL\\Desktop\\Deployment\\SVM_vectorizer.pkl')

In [17]:
data = ['Mumbai is the best city for bollywood']
class_dict = {0: 'letter', 1: 'form', 2: 'invoice', 3: 'advertisement', 4:'email'}
vect = vectorizer.transform(data).toarray()
my_prediction = model.predict(vect)

In [18]:
my_prediction

array([3], dtype=int64)

In [19]:
result = class_dict[my_prediction[0]]

In [20]:
print('Predicted Document is {}'.format(result))

Predicted Document is advertisement
