In [1]:
import os
import string
import nltk
import numpy as np
import pandas as pd
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score , cohen_kappa_score

In [2]:
# Define path to the folder containing the text files
path = r"D:\D_MLpractice\implementation\imp_author\ref_author"
path1 = r"D:\D_MLpractice\implementation\imp_author\test_author"

In [3]:
# Define a list of stop words
stop_words = set(stopwords.words("english"))

In [4]:
# Define a list of punctuation marks
punctuations = set(string.punctuation)
print(punctuations)

{'`', '@', '|', '*', '{', '=', '!', ':', '\\', '$', '-', '_', '+', '#', '/', '~', ';', '>', ')', '%', '(', '<', '[', "'", '}', '^', '.', ',', '&', ']', '?', '"'}


In [5]:
# Define a function to preprocess the text data
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    # Convert to lowercase
    tokens = [token.lower() for token in tokens]
    # Remove stop words and punctuation marks
    tokens = [token for token in tokens if token not in stop_words and token not in punctuations]
    # Compute the frequency distribution of the tokens
    freq_dist = FreqDist(tokens)
    # Return the 100 most common tokens
    return " ".join([token for token, freq in freq_dist.most_common(100)])


In [6]:
# Read in the text files and preprocess the data
documents = []
authors = []
for filename in os.listdir(path):
    if filename.endswith(".txt"):
        with open(os.path.join(path, filename), "r") as f:
            text = f.read()
            processed_text = preprocess_text(text)
            documents.append(processed_text)
            authors.append(filename.split("_")[0])


In [7]:
new_data=pd.DataFrame(documents, columns=['doc'])
new_data['auth']=authors
new_data.head(7)

Unnamed: 0,doc,auth
0,-- way successful model response would look li...,a
1,american economic average family better americ...,a
2,us choices integrative thinking business schoo...,a
3,change social book martin osberg entrepreneurs...,a
4,strategy roger time p g three decades work mar...,a
5,problems insecurity thoughts clients distress ...,b
6,viewed feeling characteristic feel differences...,b


In [8]:
X_train, X_test, y_train, y_test = train_test_split(new_data.doc, new_data.auth, test_size=0.15, random_state=10)

In [9]:
len(X_train), len(X_test), X_train.shape, y_train.shape, X_test.shape, y_test.shape

(26, 5, (26,), (26,), (5,), (5,))

In [10]:
# Vectorize the text data using the TF-IDF algorithm
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [11]:
# Train a Naive Bayes classifier`
nb_classifier = MultinomialNB()

In [12]:
nb_classifier.fit(X_train, y_train)

In [13]:
y_predict= nb_classifier.predict(X_test)
y_predict

array(['d', 'f', 'e', 'f', 'f'], dtype='<U1')

In [14]:
y_predict_p= nb_classifier.predict_proba(X_test)
y_predict_p

array([[0.10939093, 0.11162351, 0.17372784, 0.21931586, 0.1966331 ,
        0.18930876],
       [0.1056727 , 0.1647051 , 0.17528865, 0.1779386 , 0.18059978,
        0.19579517],
       [0.10761362, 0.1287263 , 0.19074481, 0.18530265, 0.19385722,
        0.1937554 ],
       [0.12852098, 0.11937349, 0.17312534, 0.1894403 , 0.18275398,
        0.20678591],
       [0.14234116, 0.12019753, 0.18203776, 0.17715614, 0.18810535,
        0.19016206]])

 Convert the test data to a feature matrix
test_features = vectorizer.transform(test_features)

In [15]:
z1 = cohen_kappa_score(y_test, y_predict)

In [16]:
np.unique(authors)

array(['a', 'b', 'c', 'd', 'e', 'f'], dtype='<U1')

In [17]:
z=pd.DataFrame((y_predict_p)*100, columns=np.unique(authors), index = y_test)
z

Unnamed: 0_level_0,a,b,c,d,e,f
auth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
d,10.939093,11.162351,17.372784,21.931586,19.66331,18.930876
b,10.56727,16.47051,17.528865,17.79386,18.059978,19.579517
b,10.761362,12.87263,19.074481,18.530265,19.385722,19.37554
a,12.852098,11.937349,17.312534,18.94403,18.275398,20.678591
a,14.234116,12.019753,18.203776,17.715614,18.810535,19.016206


In [18]:

rounded_df=np.around(z,decimals = 4)

rounded_df.to_csv('Prob_values.csv')
rounded_df


Unnamed: 0_level_0,a,b,c,d,e,f
auth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
d,10.9391,11.1624,17.3728,21.9316,19.6633,18.9309
b,10.5673,16.4705,17.5289,17.7939,18.06,19.5795
b,10.7614,12.8726,19.0745,18.5303,19.3857,19.3755
a,12.8521,11.9373,17.3125,18.944,18.2754,20.6786
a,14.2341,12.0198,18.2038,17.7156,18.8105,19.0162


In [19]:

accuracy = accuracy_score(y_test, y_predict)
precision = precision_score(y_test, y_predict, average="weighted")
recall = recall_score(y_test, y_predict, average="weighted")
f1 = f1_score(y_test, y_predict, average="weighted")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)




Accuracy: 0.2
Precision: 0.2
Recall: 0.2
F1 Score: 0.2


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Test the document author in real time implemetation

In [20]:
# Read in the text files and preprocess the data
doc_p = []
auth_p = []
for filename in os.listdir(path1):
    if filename.endswith(".txt"):
        with open(os.path.join(path, filename), "r") as f:
            text = f.read()
            processed_text = preprocess_text(text)
            doc_p.append(processed_text)
            auth_p.append(filename.split("_")[0])


In [21]:
doc_p

["problems insecurity thoughts clients distress minds makes source counterproductive behavior periodically pass dismiss remain secure ideal selves easygoing joyful compassionate wise harbor end state analyzing expert n't change you.counselors ask list step vivid thereby lowering spirits delvelop problem detail.problems seem formidable discouraged",
 "viewed feeling characteristic feel differences partners low moods every little problem looks like tip iceberg maintaining sense well-being takes make relationship enjoyable easy warm respectful toward even hard time.when respect complementary.the discontent willmake seem incompatible makes difference.the negative call `` incompatibility '' change heart think good unimportant would compatible",
 "thoughts negative drunk state 're low feelings positive relationship communication pipe pass uplifted couple 's level closeness drop.what say think minde angry mind.so happy mind special feelings.when mood tempted communicate let others know down.b

In [22]:
X_val=vectorizer.transform(doc_p)
X_val

<7x1498 sparse matrix of type '<class 'numpy.float64'>'
	with 419 stored elements in Compressed Sparse Row format>

In [23]:
y1_predict=nb_classifier.predict(X_val)
y1_predict

array(['e', 'b', 'f', 'b', 'b', 'c', 'f'], dtype='<U1')

In [24]:
y1_predict_p= nb_classifier.predict_proba(X_val)
y1_predict_p

array([[0.10761362, 0.1287263 , 0.19074481, 0.18530265, 0.19385722,
        0.1937554 ],
       [0.09737502, 0.26732407, 0.15461961, 0.16142334, 0.15810968,
        0.16114827],
       [0.1056727 , 0.1647051 , 0.17528865, 0.1779386 , 0.18059978,
        0.19579517],
       [0.09882403, 0.27683145, 0.14724213, 0.16585241, 0.15235688,
        0.1588931 ],
       [0.09482552, 0.26533986, 0.15402512, 0.15776591, 0.15815063,
        0.16989295],
       [0.09537051, 0.09774873, 0.36273551, 0.15098104, 0.14628498,
        0.14687923],
       [0.09213595, 0.09638131, 0.13434066, 0.14812215, 0.13671935,
        0.39230058]])

In [25]:
np.unique(authors)

array(['a', 'b', 'c', 'd', 'e', 'f'], dtype='<U1')

In [26]:

z1=pd.DataFrame((y1_predict_p)*100, columns=np.unique(authors))#, index = auth_p)
rounded_df=np.around(z1,decimals = 4)
rounded_df.to_csv('Prob_values_predicted.csv')
rounded_df



Unnamed: 0,a,b,c,d,e,f
0,10.7614,12.8726,19.0745,18.5303,19.3857,19.3755
1,9.7375,26.7324,15.462,16.1423,15.811,16.1148
2,10.5673,16.4705,17.5289,17.7939,18.06,19.5795
3,9.8824,27.6831,14.7242,16.5852,15.2357,15.8893
4,9.4826,26.534,15.4025,15.7766,15.8151,16.9893
5,9.5371,9.7749,36.2736,15.0981,14.6285,14.6879
6,9.2136,9.6381,13.4341,14.8122,13.6719,39.2301


In [27]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([('vectorizer', vectorizer), ('multinomialNB', nb_classifier)])
pipe.fit(X_train,y_train)

AttributeError: lower not found

In [None]:
y_pred2 = pipe.predict(X_test)
ac2 = accuracy_score(y_test, y_pred2)
print("Accuracy is :",ac2)

Accuracy is : 0.2


In [None]:
with open('03_student_auth.pkl', 'wb') as f:
    pickle.dump(pipe,f)

In [None]:
text = "Hello, how are you?"
y = pipe.predict([text])
y

array(['f'], dtype='<U1')