In [1]:
# Import the necessary libraries at first 
import pandas as pd 
import sklearn
import numpy as np
import re
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from numpy import array
from scipy.sparse import coo_matrix
from scipy.sparse import hstack
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
#Load the csv file
# encoding= "unicode_escape" used to escape Non-ASII characters from data file, otherwise it will through error   

df =   pd.read_csv("C:/Users/umesh.kumar/udemy/parsely_7.csv",encoding= "unicode_escape")
df.tail(10)

Unnamed: 0,Donor,Tag,num_tag
525602,Vulcan Materials Company,b,1
525603,"Weil, Gotshal & Manges LLP",b,1
525604,"Wespac Construction, Inc.",b,1
525605,Western Alliance Bank,b,1
525606,Westfield Capital Management,b,1
525607,Whitney Bank,b,1
525608,Winston & Strawn,b,1
525609,Wintrust Financial Corporation,b,1
525610,World Bank Community Connections Fund,fd,3
525611,Zachry Corporation,b,1


In [3]:
#Check the shape of dataset
df.shape

(525612, 3)

In [4]:
# Add New blank column in your dataset

df["clean"]= ""

In [5]:
%%timeit

#Decode the Non-ASII characters from dataset
#Remove the unwanted special characters from dataset

def is_ascii_changed(text):
    text_1 =text.encode("ascii", "ignore")
    text_2 =text_1.decode()
    text_2 = " ".join(re.findall("[A-z*&.]+", text))
    return text_2

df["clean"]= df["Donor"].apply(lambda x :is_ascii_changed(x))

975 ms ± 11 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
# Add new cloumns for characters length & also count the space in each word
df["string_len"]= df["Donor"].str.len()
df["string_space"]= df["Donor"].str.count(' ')

In [7]:
#View the dataset
df.head(7)

Unnamed: 0,Donor,Tag,num_tag,clean,string_len,string_space
0,The 355 Group Inc.,b,1,The Group Inc.,18,3
1,A.B. Hatchery & Garden Center,o,6,A.B. Hatchery & Garden Center,29,4
2,Mr. Sigurd D. Aanonsen,p,0,Mr. Sigurd D. Aanonsen,22,3
3,Mrs. Roma W. Abbott '46,p,0,Mrs. Roma W. Abbott,23,4
4,Mr. Munib S. Abdulrehman '02,p,0,Mr. Munib S. Abdulrehman,28,4
5,Mr. A. Beecher Abeles,p,0,Mr. A. Beecher Abeles,21,3
6,Mrs. Robin L. Abeles '87,p,0,Mrs. Robin L. Abeles,24,4


In [8]:
# Initialize and fit CountVectorizer with given Names
vectorizer = CountVectorizer().fit(df["clean"])

# use the vectorizer to transform the names into word count vectors (Sparse)
word_mat = vectorizer.transform(df["clean"])

In [9]:
#Check the shape of dataset after vectorization
word_mat.shape

(525612, 138236)

In [10]:
#Add created features in vectorization dataset, using scipy library to convert into sparse matrix

word_mat = hstack((word_mat,np.array(df["string_len"])[:,None]))
word_mat = hstack((word_mat,np.array(df["string_space"])[:,None]))

In [11]:
#Check the shape again of dataset whether feature column added into dataset
word_mat.shape

(525612, 138238)

In [12]:
#convert matrix into csr_matrix

word_mat= coo_matrix((word_mat)).tocsr()
type(word_mat)

scipy.sparse.csr.csr_matrix

In [13]:
# LabelEncoder can be re-used to encode and decode the labels
encoder = LabelEncoder().fit(df["num_tag"])

# using the encoder to encode the entire dataset
y = encoder.transform(df["num_tag"])

In [14]:
#Slipt dataset into train & test dataset for training & validating the model

x_train, x_test, y_train, y_test = train_test_split(word_mat, y, test_size=0.3)

In [15]:
%%time
# instantiate the model as clf(classifier) and train it
np.random.seed(20)
clf = MultinomialNB()
clf.fit(x_train, y_train)

Wall time: 128 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
#check the score of new created model
clf.score(x_test, y_test)

0.9664582329215393

In [17]:
#Classify the test dataset, using predict function from sklearn
y_clf_pred = clf.predict(x_test)

y_clf_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [18]:
#Print the classification result
print(classification_report(y_test, y_clf_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98    152230
           1       0.98      0.08      0.15      1935
           2       0.00      0.00      0.00        79
           3       0.00      0.00      0.00       249
           4       1.00      0.00      0.01      1154
           5       0.00      0.00      0.00        19
           6       0.75      0.00      0.00      1628
           7       0.00      0.00      0.00       213
           8       0.00      0.00      0.00       177

    accuracy                           0.97    157684
   macro avg       0.41      0.12      0.13    157684
weighted avg       0.96      0.97      0.95    157684



  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
#Print the confusion_matrix results
print(confusion_matrix(y_test, y_clf_pred))

[[152230      0      0      0      0      0      0      0      0]
 [  1777    158      0      0      0      0      0      0      0]
 [    79      0      0      0      0      0      0      0      0]
 [   246      2      0      0      0      0      1      0      0]
 [  1150      0      0      0      4      0      0      0      0]
 [    19      0      0      0      0      0      0      0      0]
 [  1624      1      0      0      0      0      3      0      0]
 [   213      0      0      0      0      0      0      0      0]
 [   177      0      0      0      0      0      0      0      0]]


In [22]:
#Save the classification model into a pickle file
pickle.dump(clf, open("TextClassification_Regex_Feature creation.pickle","wb"))

In [23]:
#Load the classification model to future use
load_model_rf = pickle.load(open("TextClassification_Regex_Feature creation.pickle","rb"))