##### AIT 726 - Final Project Naive Bayes Code
##### By Japneet Kohli

####  The code is executed in the following order.

1.   Load data.
2.   Select only translation writer and line translations data from the list of attributes.
3.   Assign labels of translation writer attribute binary numeric values (0,1).
4.   Preprocess text using regex.
5.   Split data into train, validation, and test.
6.   Create pipeline with functions including Multinomial Naive Bayes, Count Vectorizer, and TF-IDF Transformer, and assign range of hyerparameters to be tuned.
7.   Perform 10 fold cross validation using GridSearchCV to find best values for hyperparameters. 
8.   Fit the model with the tuned parameters on training data.
9.   Predict values for validation and test data and compute accuracy using classfication report and confusion matrix.
10.  Save the validation prediction results for offline analysis.

In [1]:
# import packages
import os
import re
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
import logging

# use logging to save the results
logging.basicConfig(filename='baselinemodel.log', level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler())
print('Done')

Done


In [2]:
# assign stemmer
stemmer = PorterStemmer()
print('Done')

Done


In [71]:
# read data
directory = r'csv_output_files'
dfpath = directory + "\\" + 'datadf.csv'
data=pd.read_csv(dfpath)
print('Done')

Done


In [73]:
# display data
data.head()

Unnamed: 0.1,Unnamed: 0,writer,lines_translations_english_writer,lines_translations_english,lines_type
0,1,Guru Nanak Dev Ji,Dr. Sant Singh Khalsa,One Universal Creator God. The Name Is Truth. ...,Manglacharan
1,2,Guru Nanak Dev Ji,Bhai Manmohan Singh,"There is but one God. True is His Name, creati...",Manglacharan
2,5,Guru Nanak Dev Ji,Dr. Sant Singh Khalsa,Chant And Meditate:,Sirlekh
3,6,Guru Nanak Dev Ji,Bhai Manmohan Singh,Embrace His meditation.,Sirlekh
4,9,Guru Nanak Dev Ji,Dr. Sant Singh Khalsa,True In The Primal Beginning. True Throughout ...,Pankti


In [74]:
# keep only listed columns in dataframe
columns_to_keep = ['writer','lines_translations_english_writer','lines_translations_english','lines_type']
newdata=data[columns_to_keep]

In [13]:
# display data
newdata.head()

Unnamed: 0,writer,lines_translations_english_writer,lines_translations_english,lines_type
0,Guru Nanak Dev Ji,Dr. Sant Singh Khalsa,One Universal Creator God. The Name Is Truth. ...,Manglacharan
1,Guru Nanak Dev Ji,Bhai Manmohan Singh,"There is but one God. True is His Name, creati...",Manglacharan
2,Guru Nanak Dev Ji,Dr. Sant Singh Khalsa,Chant And Meditate:,Sirlekh
3,Guru Nanak Dev Ji,Bhai Manmohan Singh,Embrace His meditation.,Sirlekh
4,Guru Nanak Dev Ji,Dr. Sant Singh Khalsa,True In The Primal Beginning. True Throughout ...,Pankti


In [14]:
# describe data
newdata.describe()

Unnamed: 0,writer,lines_translations_english_writer,lines_translations_english,lines_type
count,121020,121020,121020,121020
unique,35,2,112029,4
top,Guru Arjan Dev Ji,Bhai Manmohan Singh,One Universal Creator God. By The Grace Of The...,Pankti
freq,49746,60510,525,100400


In [15]:
# display data info
newdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121020 entries, 0 to 121019
Data columns (total 4 columns):
writer                               121020 non-null object
lines_translations_english_writer    121020 non-null object
lines_translations_english           121020 non-null object
lines_type                           121020 non-null object
dtypes: object(4)
memory usage: 3.7+ MB


In [16]:
# display writer column by grouping count values
newdata.writer.value_counts()

Guru Arjan Dev Ji       49746
Guru Nanak Dev Ji       24078
Guru Amardas Ji         20230
Guru Ramdas Ji          13086
Bhagat Kabir Ji          6770
Bhagat Namdev Ji         1502
Guru Tegh Bahadur Ji     1182
Bhagat Ravidas Ji         924
Sheikh Farid Ji           636
Guru Angad Dev Ji         614
Bhatt Kalh Sahar          566
Satta and Balwand         180
Bhagat Beni Ji            178
Bhatt Nalh                168
Bhatt Gayand              160
Bhatt Mathura             128
Poet Alam                 124
Bhagat Trilochan Ji       114
Bhatt Keerat               80
Baba Sundar                76
Bhatt Jaalap               62
Bhagat Dhanna Ji           62
Bhatt Balh                 52
Bhagat Jaidev Ji           46
Bhatt Bhikha               38
Bhagat Ramanand Ji         32
Bhagat Bheekhan Ji         32
Bhatt Harbans              24
Bhatt Salh                 24
Bhagat Sadhna Ji           24
Bhagat Sain Ji             22
Bhagat Parmanand Ji        20
Bhagat Surdas Ji           18
Bhagat Pip

In [17]:
# display translator data by grouping count values
newdata.lines_translations_english_writer.value_counts()

Bhai Manmohan Singh      60510
Dr. Sant Singh Khalsa    60510
Name: lines_translations_english_writer, dtype: int64

In [42]:
# keep data for only listed columns. here, its the label (translator writer) and text (translations) 
translator_columns=['lines_translations_english_writer','lines_translations_english']
translator_data=newdata[translator_columns]

In [43]:
# display data
translator_data.head()

Unnamed: 0,lines_translations_english_writer,lines_translations_english
0,Dr. Sant Singh Khalsa,One Universal Creator God. The Name Is Truth. ...
1,Bhai Manmohan Singh,"There is but one God. True is His Name, creati..."
2,Dr. Sant Singh Khalsa,Chant And Meditate:
3,Bhai Manmohan Singh,Embrace His meditation.
4,Dr. Sant Singh Khalsa,True In The Primal Beginning. True Throughout ...


In [44]:
# display data shape
translator_data.shape

(121020, 2)

In [45]:
# describe data
translator_data.describe()

Unnamed: 0,lines_translations_english_writer,lines_translations_english
count,121020,121020
unique,2,112029
top,Bhai Manmohan Singh,One Universal Creator God. By The Grace Of The...
freq,60510,525


In [46]:
# display data info
translator_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121020 entries, 0 to 121019
Data columns (total 2 columns):
lines_translations_english_writer    121020 non-null object
lines_translations_english           121020 non-null object
dtypes: object(2)
memory usage: 1.8+ MB


In [47]:
# display data at random row
translator_data.lines_translations_english[14]

'Hundreds of thousands of clever tricks, but not even one of them will go along with you in the end.'

In [48]:
# display data for translations
translator_data["lines_translations_english"]

0         One Universal Creator God. The Name Is Truth. ...
1         There is but one God. True is His Name, creati...
2                                       Chant And Meditate:
3                                   Embrace His meditation.
4         True In The Primal Beginning. True Throughout ...
                                ...                        
121015    These are the names of the sons of Maygh Raag....
121016    So all together, they sing the six Raagas and ...
121017    So all together, they sing the six Raagas and ...
121018    and all the forty-eight sons of the Raagas. ||...
121019    and all the forty-eight sons of the Raagas. ||...
Name: lines_translations_english, Length: 121020, dtype: object

In [49]:
# display values for translator writer
translator_data.lines_translations_english_writer.values

array(['Dr. Sant Singh Khalsa', 'Bhai Manmohan Singh',
       'Dr. Sant Singh Khalsa', ..., 'Bhai Manmohan Singh',
       'Dr. Sant Singh Khalsa', 'Bhai Manmohan Singh'], dtype=object)

In [50]:
# replace translator writer label with binary 0 and 1
translator_data["lines_translations_english_writer"].replace({"Dr. Sant Singh Khalsa": "0", "Bhai Manmohan Singh": "1"}, inplace=True)

In [51]:
# display data to see if binary values were correctly placed
translator_data.lines_translations_english_writer.values

array(['0', '1', '0', ..., '1', '0', '1'], dtype=object)

In [52]:
# display value counts for label
translator_data.lines_translations_english_writer.value_counts()

1    60510
0    60510
Name: lines_translations_english_writer, dtype: int64

In [53]:
# preprocess data to remove punctuation, html tags, non ascii characters, etc.
def preprocess_text(txt):
        txt = re.sub(r'(?:<[^>]+>)', '', txt)#Html tags
        txt = re.sub('[^\x00-\x7F]+', ' ', txt) ##remove non ascii charcters
        txt = re.sub('#\S+', '', txt)
        txt = re.sub('_', '', txt)
        txt = re.sub('-',' ',txt)
        txt = re.sub('/','',txt)
        txt = re.sub('~','',txt)
        txt = re.sub('|','',txt)
        txt = re.sub('\s+', ' ', txt) ## remove double spaces
        txt = re.sub(r'([A-Z][a-z]+)',lambda x:x.group().lower(),txt)##Lower case captilized words only but not words with all capitals
        return txt.strip()
data = [preprocess_text(t) for t in translator_data["lines_translations_english"]]
labels= translator_data.lines_translations_english_writer.values
print('Done')

Done


In [35]:
data

["one universal creator god. the name is truth. creative being personified. no fear. no hatred. image of the undying, beyond birth, self existent. by guru's grace",
 "there is but one god. true is his name, creative his personality and immortal his form. he is without fear sans enmity, unborn and self illumined. by the guru's grace he is obtained.",
 'chant and meditate:',
 'embrace his meditation.',
 'true in the primal beginning. true throughout the ages.',
 'true in the prime, true in the beginning of ages,',
 'true here and now. O nanak, forever and ever true. ||1||',
 'true he is even now and true he verily, shall be, O nanak!',
 'by thinking, he cannot be reduced to thought, even by thinking hundreds of thousands of times.',
 'by pondering on god, man cannot have a conception of him, even though he may ponder over lacs of times.',
 'by remaining silent, inner silence is not obtained, even by remaining lovingly absorbed deep within.',
 "even though one be silent and remains absorb

In [54]:
labels

array(['0', '1', '0', ..., '1', '0', '1'], dtype=object)

In [55]:
# import required packages for NB and split data into train, test, validation
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.20, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.20, random_state=42)
print('Done')

Done


In [56]:
x_train

['one whose heart is filled with jealousy of others, never comes to any good.',
 'the creator himself yokes man to his service.',
 'the treasure of the naam, the name of the lord, is within all, but how rare are those who, as gurmukh, obtain it. ||5||',
 'any other task would be worthless to them you have enjoined them to your true service.',
 'you are infinite, eternal and incomprehensible; all this is your creation.',
 'he is one alone, O brother. he is one in one. pause.',
 'one whom you inspire to understand the truth, is easily absorbed into the naam. ||8||1||23||',
 "sorat'h, fifth mehl:",
 'without the true guru, the faithless cynics rot away and die. the noose of death is around the necks of those who have no guru. ||11||',
 'they, who are pleasing to their god, attain to this wealth.',
 'pondering over the lord master, my soul has obtained peace.',
 "there is but one god. by the true guru's grace, he is obtained.",
 'who would cut his throat, to have meat with his bread? ||188

In [57]:
y_train

array(['0', '1', '0', ..., '0', '1', '1'], dtype=object)

In [58]:
# assign pipeine with functions and hyperparameters for turing
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     #('to_dense', DenseTransformer()),
                     ('clf', MultinomialNB())])

tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': [1e-1, 1e-2, 1e-3]
}
print('Done')

Done


In [59]:
df_y_train= pd.DataFrame(y_train, columns=['y_train'])

In [60]:
df_y_train.y_train.value_counts()

0    38798
1    38654
Name: y_train, dtype: int64

In [61]:
len(x_val)

19364

In [62]:
len(x_test)

24204

In [84]:
# tune hyperparameters and get best value using 10-fold cross validation
# fit model on training data and print mean, standard deviation values for hyperparameters tuned
from sklearn.metrics import classification_report

score = 'f1_micro'
print("# Tuning hyper-parameters for %s" % score)
print()
np.errstate(divide='ignore')
clf = GridSearchCV(text_clf, tuned_parameters, cv=10, scoring=score)
clf.fit(x_train, y_train)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
for mean, std, params in zip(clf.cv_results_['mean_test_score'], 
                             clf.cv_results_['std_test_score'], 
                             clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")

# Tuning hyper-parameters for f1_micro

Best parameters set found on development set:

{'clf__alpha': 0.1, 'tfidf__norm': 'l2', 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}

Grid scores on development set:

0.888 (+/-0.006) for {'clf__alpha': 0.1, 'tfidf__norm': 'l1', 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}
0.917 (+/-0.005) for {'clf__alpha': 0.1, 'tfidf__norm': 'l1', 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
0.896 (+/-0.005) for {'clf__alpha': 0.1, 'tfidf__norm': 'l1', 'tfidf__use_idf': True, 'vect__ngram_range': (2, 2)}
0.887 (+/-0.007) for {'clf__alpha': 0.1, 'tfidf__norm': 'l1', 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}
0.917 (+/-0.006) for {'clf__alpha': 0.1, 'tfidf__norm': 'l1', 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}
0.899 (+/-0.004) for {'clf__alpha': 0.1, 'tfidf__norm': 'l1', 'tfidf__use_idf': False, 'vect__ngram_range': (2, 2)}
0.887 (+/-0.006) for {'clf__alpha': 0.1, 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect

In [64]:
# predict training data
y_train_pred = clf.predict(x_train)

In [65]:
# classification report for training data
print(classification_report(y_train, y_train_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9701    0.9679    0.9690     38798
           1     0.9678    0.9700    0.9689     38654

    accuracy                         0.9689     77452
   macro avg     0.9689    0.9689    0.9689     77452
weighted avg     0.9689    0.9689    0.9689     77452



In [66]:
# predict validation data
y_val_pred= clf.predict(x_val)

In [67]:
# classification report for validation data
print(classification_report(y_val, y_val_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9190    0.9152    0.9171      9706
           1     0.9151    0.9189    0.9170      9658

    accuracy                         0.9171     19364
   macro avg     0.9171    0.9171    0.9171     19364
weighted avg     0.9171    0.9171    0.9171     19364



In [68]:
# confusion matrix for validation data
from sklearn import metrics
print(metrics.confusion_matrix(y_val, y_val_pred))

[[8883  823]
 [ 783 8875]]


In [75]:
# predict test data
y_test_pred = clf.predict(x_test)

In [76]:
# classification report for test data
print(classification_report(y_test, y_test_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9173    0.9213    0.9193     12006
           1     0.9222    0.9183    0.9202     12198

    accuracy                         0.9198     24204
   macro avg     0.9198    0.9198    0.9198     24204
weighted avg     0.9198    0.9198    0.9198     24204



In [77]:
# confusion matrix for test data 
print(metrics.confusion_matrix(y_test, y_test_pred))

[[11061   945]
 [  997 11201]]


In [78]:
# create dataframe of validation data
df_y_val_true=pd.DataFrame(y_val, columns=['y_true']) 
df_y_val_true

Unnamed: 0,y_true
0,0
1,1
2,0
3,1
4,0
...,...
19359,1
19360,1
19361,1
19362,0


In [79]:
# add validation predictions in df
df_y_val_true['y_pred']=y_val_pred
df_y_val_true

Unnamed: 0,y_true,y_pred
0,0,0
1,1,1
2,0,0
3,1,1
4,0,0
...,...,...
19359,1,1
19360,1,1
19361,1,1
19362,0,0


In [80]:
# rename
df_val=df_y_val_true

In [81]:
# add text column (translations)
df_val['lines_translations_english'] = x_val

In [82]:
# display data
df_val.head(5)

Unnamed: 0,y_true,y_pred,lines_translations_english
0,0,0,"that wealth, for which they slander others, do..."
1,1,1,for the man of wisdom of mouthful is of the gn...
2,0,0,"you lick the grinding bowl, and eat the flour."
3,1,1,the longing of my soul and body is for god and...
4,0,0,"that palace is beautiful, in which the kirtan ..."


In [83]:
# export data
df_val.to_excel(r'validation_nb.xlsx')