In [15]:
import sklearn
import numpy as np
import pandas as pd
import csv
import os
import joblib
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Add the Data using pandas
Corpus = pd.read_csv(r"C:\\Users\\INTEL\\Desktop\\Deep Learning\\preprocesstext_final_V1.csv",encoding='latin-1')

In [3]:
print(Corpus['Text'].head())

0    ses cac eared put hh last aspen beats eadinglo...
1                                      x bos fs st art
2    ste took ears make caldgirene tobacco flavor p...
3                               ny asean mon uu buried
4    ren halos ban dan els takes healthy nerves cha...
Name: Text, dtype: object


In [4]:
#Explore null cells
Corpus.isnull()

Unnamed: 0,Text,Class
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
5,False,False
6,False,False
7,False,False
8,False,False
9,False,False


In [5]:
#View total of null values by column
Corpus.isnull().sum()

Text     0
Class    0
dtype: int64

In [6]:
# Split the model into Train and Test Data set
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['Text'],Corpus['Class'],train_size=0.8)

In [7]:
# Label encode the target variable  - This is done to transform Categorical data of string type in the data set into numerical values
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [8]:
# Vectorize the words by using TF-IDF Vectorizer - This is done to find how important a word in document is in comaprison to the corpus
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['Text'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [18]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)


SVM Accuracy Score ->  87.62750904902929


In [10]:
predictions_SVM

array([3, 0, 2, ..., 1, 1, 0], dtype=int64)

In [11]:
print(confusion_matrix(Test_Y,predictions_SVM))
print(classification_report(Test_Y,predictions_SVM))
print(accuracy_score(Test_Y,predictions_SVM))

[[691  36  16  27  14]
 [ 42 449  26  34   5]
 [ 10  23 524  26   1]
 [ 16  19  22 434   2]
 [ 19  17   1  20 565]]
              precision    recall  f1-score   support

           0       0.89      0.88      0.88       784
           1       0.83      0.81      0.82       556
           2       0.89      0.90      0.89       584
           3       0.80      0.88      0.84       493
           4       0.96      0.91      0.93       622

    accuracy                           0.88      3039
   macro avg       0.87      0.87      0.87      3039
weighted avg       0.88      0.88      0.88      3039

0.8762750904902928


In [13]:
lin_svm = svm.LinearSVC()
lin_svm.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_linearSVM = lin_svm.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("Linear SVM Accuracy Score -> ",accuracy_score(predictions_linearSVM, Test_Y)*100)

Linear SVM Accuracy Score ->  86.93649226719316


In [14]:
print(confusion_matrix(Test_Y,predictions_linearSVM))
print(classification_report(Test_Y,predictions_linearSVM))
print(accuracy_score(Test_Y,predictions_linearSVM))

[[685  41  18  21  19]
 [ 40 443  32  34   7]
 [  9  18 532  24   1]
 [ 21  25  23 417   7]
 [ 19  21   2  15 565]]
              precision    recall  f1-score   support

           0       0.89      0.87      0.88       784
           1       0.81      0.80      0.80       556
           2       0.88      0.91      0.89       584
           3       0.82      0.85      0.83       493
           4       0.94      0.91      0.93       622

    accuracy                           0.87      3039
   macro avg       0.87      0.87      0.87      3039
weighted avg       0.87      0.87      0.87      3039

0.8693649226719316


In [16]:
dt = DecisionTreeClassifier()
dt.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_dt = dt.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("Decision Tree Accuracy Score -> ",accuracy_score(predictions_dt, Test_Y)*100)

Decision Tree Accuracy Score ->  76.50542941757156


In [17]:
rf = RandomForestClassifier(n_estimators = 150)
rf.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_rf = rf.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("Random Forest Accuracy Score -> ",accuracy_score(predictions_rf, Test_Y)*100)

Decision Tree Accuracy Score ->  86.54162553471537


In [19]:
from sklearn.model_selection import GridSearchCV

#try tuning the parameters. For LinearSVC, C is a tunable parameter
params = {'C': [0.1, 1, 10, 100, 1000]}

#use the GridSearchCV class. The core arguments are (classifier, parameters) for which we want to run a best param search.
grid = GridSearchCV(svm.SVC(), params, refit=True, verbose=3)

#fit the training data to our grid_search model, and check predictions on our test set
grid.fit(Train_X_Tfidf,Train_Y)
predictions_grid = grid.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("Tuned Model Accuracy Score -> ",accuracy_score(predictions_grid, Test_Y)*100)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] C=0.1 ...........................................................

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.



[CV] ............................... C=0.1, score=0.739, total= 4.7min
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.7min remaining:    0.0s


[CV] ............................... C=0.1, score=0.735, total= 4.5min
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  9.1min remaining:    0.0s


[CV] ............................... C=0.1, score=0.724, total= 4.7min
[CV] C=0.1 ...........................................................
[CV] ............................... C=0.1, score=0.733, total= 4.0min
[CV] C=0.1 ...........................................................
[CV] ............................... C=0.1, score=0.725, total= 3.3min
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.872, total= 3.0min
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.884, total= 3.5min
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.877, total= 3.4min
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.878, total= 3.4min
[CV] C=1 .............................................................
[CV] .

[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed: 101.0min finished


Tuned Model Accuracy Score ->  88.2856202698256


In [20]:
#save vectorizer
joblib.dump(Tfidf_vect, "C:\\Users\\INTEL\\Desktop\\Deployment\\SVM_vectorizer.pkl")

['C:\\Users\\INTEL\\Desktop\\Deployment\\SVM_vectorizer.pkl']

In [27]:
#save model
joblib.dump(grid, "C:\\Users\\INTEL\\Desktop\\Deployment\\svm_model.pkl")

['C:\\Users\\INTEL\\Desktop\\Deployment\\svm_model.pkl']

In [28]:
model = joblib.load('C:\\Users\\INTEL\\Desktop\\Deployment\\svm_model.pkl') 

In [29]:
vectorizer = joblib.load('C:\\Users\\INTEL\\Desktop\\Deployment\\SVM_vectorizer.pkl')

In [67]:
data = ['Billed To : Ashwini Items: Lahenga Price : $800 Payment Date : 28.9.2019']
class_dict = {0: 'letter', 1: 'form', 2: 'invoice', 3: 'advertisement', 4:'email'}
vect = vectorizer.transform(data).toarray()
my_prediction = model.predict(vect)

In [68]:
my_prediction

array([2], dtype=int64)

In [69]:
result = class_dict[my_prediction[0]]

In [70]:
print('Predicted Document is {}'.format(result))

Predicted Document is invoice
