# Import data

https://thinkingneuron.com/how-to-classify-text-using-word2vec/

In [1]:
import pandas as pd

In [2]:
org_df = pd.read_csv('QI NERs.csv')

In [3]:
display(print(org_df.groupby('Class').size()))
display(org_df.head())

Class
Government Agency     19
National Lab          66
Quantum_comp         428
group_centre         229
investor             459
university           186
dtype: int64


None

Unnamed: 0,name,Class
0,Advanced Materials And Process Engineering Lab...,group_centre
1,Advancing Quantum Architecture Group,group_centre
2,Ag Quantenoptik,group_centre
3,Institute of Materials Research and Engineering,group_centre
4,Air Force Research Laboratory Quantum Group,group_centre


In [4]:
test = org_df.groupby("Class").sample(frac=0.2, random_state=2)
print(test.groupby('Class').size())
train = org_df.drop(test.index)
print(train.groupby('Class').size())

Class
Government Agency     4
National Lab         13
Quantum_comp         86
group_centre         46
investor             92
university           37
dtype: int64
Class
Government Agency     15
National Lab          53
Quantum_comp         342
group_centre         183
investor             367
university           149
dtype: int64


In [5]:
# Count vectorization of text
from sklearn.feature_extraction.text import CountVectorizer
 
# Ticket Data
corpus = org_df['name'].values
 
# Creating the vectorizer
vectorizer = CountVectorizer(stop_words='english')
 
# Converting the text to numeric data
X = vectorizer.fit_transform(corpus)
 
#print(vectorizer.get_feature_names())
 
# Preparing Data frame For machine learning
# Priority column acts as a target variable and other columns as predictors
CountVectorizedData=pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
CountVectorizedData['Priority']=org_df['Class']
print(CountVectorizedData.shape)
CountVectorizedData.head()

(1387, 1592)


Unnamed: 0,1517,180,1qbit,32,3i,415,500,5y,7percent,8vc,...,zhongtian,zte,zu,zurich,zy4,zyvex,zürcher,zürich,école,Priority
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,group_centre
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,group_centre
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,group_centre
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,group_centre
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,group_centre


# Model Construction

In [6]:
import gensim

In [7]:
GoogleModel = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [8]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [9]:
# Defining a function which takes text input and returns one vector for each sentence
def FunctionText2Vec(inpTextData):
    # Converting the text to numeric data
    X = vectorizer.transform(inpTextData)
    CountVecData=pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
    
    # Creating empty dataframe to hold sentences
    W2Vec_Data=pd.DataFrame()
    
    # Looping through each row for the data
    for i in range(CountVecData.shape[0]):

        # initiating a sentence with all zeros
        Sentence = np.zeros(300)

        # Looping thru each word in the sentence and if its present in 
        # the Word2Vec model then storing its vector
        for word in WordsVocab[CountVecData.iloc[i,:] >=1]:
            #print(word)
            if word in GoogleModel:    
                Sentence=Sentence+GoogleModel[word]
        # Appending the sentence to the dataframe
        W2Vec_Data=W2Vec_Data.append(pd.DataFrame([Sentence]))
    return(W2Vec_Data)

In [10]:
# Creating the list of words which are present in the Document term matrix
WordsVocab=CountVectorizedData.columns[:-1]
 
# Printing sample words
WordsVocab[0:10]

Index(['1517', '180', '1qbit', '32', '3i', '415', '500', '5y', '7percent',
       '8vc'],
      dtype='object')

In [11]:
# Calling the function to convert all the text data to Word2Vec Vectors
W2Vec_Data=FunctionText2Vec(org_df['name'])
 
# Checking the new representation for sentences
W2Vec_Data.shape

(1387, 300)

In [12]:
CountVectorizedData.shape

(1387, 1592)

# Prep Data for ML

In [13]:
# Adding the target variable
W2Vec_Data.reset_index(inplace=True, drop=True)
W2Vec_Data['Priority']=CountVectorizedData['Priority']
 
# Assigning to DataForML variable
DataForML=W2Vec_Data
DataForML.head()
# DataForML = DataForML.dropna()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,Priority
0,-0.483643,0.641449,0.570801,0.213623,-0.491699,0.324738,0.412842,-0.856934,-0.431641,-0.374695,...,-0.22937,-0.914062,1.14624,-0.153809,-0.227905,-0.166504,-0.301758,0.162598,0.095703,group_centre
1,-0.007812,0.004578,0.252075,0.221313,-0.105732,-0.313965,0.133789,-1.097656,0.235565,0.316406,...,0.057129,-0.526611,-0.153076,0.240723,-0.08374,0.252686,-0.244141,0.192261,0.280457,group_centre
2,-0.148438,-0.010559,-0.066895,0.213867,-0.192383,-0.120117,0.063477,0.0625,-0.038086,-0.208008,...,0.022949,0.118164,0.078125,0.121094,0.267578,-0.126953,0.057373,0.1875,0.034668,group_centre
3,-0.337646,0.159912,0.450195,0.348389,0.128418,0.15625,0.491943,-0.708008,-0.116211,-0.68457,...,-0.256592,-0.881836,1.051758,0.207886,-0.454651,-0.1492,-0.305176,0.189453,0.131042,group_centre
4,-0.052002,-0.010101,0.633667,-0.110474,-0.142597,-0.339111,-0.01123,-1.04834,0.583954,0.00293,...,-0.097961,-0.705078,0.359421,-0.032288,-0.517883,-0.020676,0.191406,-0.044678,-0.373657,group_centre


In [14]:
CountVectorizedData.Priority

0       group_centre
1       group_centre
2       group_centre
3       group_centre
4       group_centre
            ...     
1382    Quantum_comp
1383    Quantum_comp
1384    Quantum_comp
1385    Quantum_comp
1386    Quantum_comp
Name: Priority, Length: 1387, dtype: object

In [15]:
DataForML.Priority

0       group_centre
1       group_centre
2       group_centre
3       group_centre
4       group_centre
            ...     
1382    Quantum_comp
1383    Quantum_comp
1384    Quantum_comp
1385    Quantum_comp
1386    Quantum_comp
Name: Priority, Length: 1387, dtype: object

In [16]:
# Separate Target Variable and Predictor Variables
TargetVariable=DataForML.columns[-1]
Predictors=DataForML.columns[:-1]
 
X=DataForML[Predictors].values
y=DataForML[TargetVariable].values
 
# Split the data into training and testing set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Sanity check for the sampled data
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(1109, 300)
(1109,)
(278, 300)
(278,)


In [17]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Choose either standardization or Normalization
# On this data Min Max Normalization is used because we need to fit Naive Bayes
 
# Choose between standardization and MinMAx normalization
#PredictorScaler=StandardScaler()
PredictorScaler=MinMaxScaler()
 
# Storing the fit object for later reference
PredictorScalerFit=PredictorScaler.fit(X)
 
# Generating the standardized values of X
X=PredictorScalerFit.transform(X)
 
# Split the data into training and testing set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
 
# Sanity check for the sampled data
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(1109, 300)
(1109,)
(278, 300)
(278,)


# Naive Bayes

In [18]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB, MultinomialNB
 
# GaussianNB is used in Binomial Classification
# MultinomialNB is used in multi-class classification
#clf = GaussianNB()
clf = MultinomialNB()
 
# Printing all the parameters of Naive Bayes
# print(clf)
 
NB=clf.fit(X_train,y_train)
prediction=NB.predict(X_test)
 
# Measuring accuracy on Testing Data
from sklearn import metrics
print(metrics.classification_report(y_test, prediction))
print(metrics.confusion_matrix(y_test, prediction))
 
# Printing the Overall Accuracy of the model
F1_Score=metrics.f1_score(y_test, prediction, average='weighted')
print('Accuracy of the model on Testing Sample Data:', round(F1_Score,2))
 
# Importing cross validation function from sklearn
from sklearn.model_selection import cross_val_score
 
# Running 10-Fold Cross validation on a given algorithm
# Passing full data X and y because the K-fold will split the data and automatically choose train/test
Accuracy_Values=cross_val_score(NB, X , y, cv=10, scoring='f1_weighted')
print('\nAccuracy values for 10-fold Cross Validation:\n',Accuracy_Values)
print('\nFinal Average Accuracy of the model:', round(Accuracy_Values.mean(),2))

              precision    recall  f1-score   support

National Lab       1.00      0.06      0.12        16
Quantum_comp       0.58      0.78      0.66        94
group_centre       0.64      0.71      0.67        49
    investor       0.80      0.69      0.74        87
  university       0.95      0.62      0.75        32

    accuracy                           0.68       278
   macro avg       0.79      0.57      0.59       278
weighted avg       0.73      0.68      0.67       278

[[ 1  9  3  3  0]
 [ 0 73 14  7  0]
 [ 0 11 35  2  1]
 [ 0 25  2 60  0]
 [ 0  8  1  3 20]]
Accuracy of the model on Testing Sample Data: 0.67

Accuracy values for 10-fold Cross Validation:
 [0.60449126 0.70438548 0.65423947 0.69113329 0.64253307 0.65258799
 0.79026783 0.73554916 0.64297133 0.69716073]

Final Average Accuracy of the model: 0.68


# KNN

In [19]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=15)
 
# Printing all the parameters of KNN
print(clf)
 
# Creating the model on Training Data
KNN=clf.fit(X_train,y_train)
prediction=KNN.predict(X_test)
 
# Measuring accuracy on Testing Data
from sklearn import metrics
print(metrics.classification_report(y_test, prediction))
print(metrics.confusion_matrix(y_test, prediction))
 
# Printing the Overall Accuracy of the model
F1_Score=metrics.f1_score(y_test, prediction, average='weighted')
print('Accuracy of the model on Testing Sample Data:', round(F1_Score,2))
 
# Importing cross validation function from sklearn
from sklearn.model_selection import cross_val_score


# Running 10-Fold Cross validation on a given algorithm
# Passing full data X and y because the K-fold will split the data and automatically choose train/test
Accuracy_Values=cross_val_score(KNN, X , y, cv=10, scoring='f1_weighted')
print('\nAccuracy values for 10-fold Cross Validation:\n',Accuracy_Values)
print('\nFinal Average Accuracy of the model:', round(Accuracy_Values.mean(),2))
 
# Plotting the feature importance for Top 10 most important columns
# There is no built-in method to get feature importance in KNN


KNeighborsClassifier(n_neighbors=15)
              precision    recall  f1-score   support

National Lab       0.83      0.31      0.45        16
Quantum_comp       0.57      0.99      0.73        94
group_centre       0.81      0.61      0.70        49
    investor       1.00      0.56      0.72        87
  university       0.92      0.69      0.79        32

    accuracy                           0.72       278
   macro avg       0.83      0.63      0.68       278
weighted avg       0.80      0.72      0.71       278

[[ 5  7  4  0  0]
 [ 0 93  1  0  0]
 [ 0 18 30  0  1]
 [ 1 35  1 49  1]
 [ 0  9  1  0 22]]
Accuracy of the model on Testing Sample Data: 0.71

Accuracy values for 10-fold Cross Validation:
 [0.62208234 0.66877961 0.65063642 0.67941357 0.71114647 0.72464419
 0.64749022 0.63049949 0.61051064 0.58333714]

Final Average Accuracy of the model: 0.65


# Making predictions on New cases

In [20]:

# Defining a function which converts words into numeric vectors for prediction
def FunctionPredictUrgency(inpText):
    
    # Generating the Glove word vector embeddings
    X=FunctionText2Vec(inpText)
    #print(X)
    
    # If standardization/normalization was done on training
    # then the above X must also be converted to same platform
    # Generating the normalized values of X
    X=PredictorScalerFit.transform(X)
    
    # Generating the prediction using Naive Bayes model and returning
    Prediction=NB.predict(X)
    Result=pd.DataFrame(data=inpText, columns=['Name'])
    Result['Prediction']=Prediction
    return(Result)

In [21]:
NewTicket=["wilfrid laurier university", "Nokia Bell Lab","Brookfield investments", "Alphabet","Arqit Quantum Inc","Acme"]
FunctionPredictUrgency(inpText=NewTicket)

Unnamed: 0,Name,Prediction
0,wilfrid laurier university,university
1,Nokia Bell Lab,investor
2,Brookfield investments,investor
3,Alphabet,Quantum_comp
4,Arqit Quantum Inc,Quantum_comp
5,Acme,Quantum_comp


In [22]:
# org_df.to_csv('NE_vectors3.tsv', sep="\t")