In [6]:
import pandas as pd 
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import csv

In [28]:
try :
    with open ('edgar_allan_poe.txt','r', encoding='utf-8')as allan ,open ('robert_frost.txt','r', encoding='utf-8')as robert,open ('csv_file.csv','w',newline='')as csv_file:
        writer=csv.DictWriter(csv_file,fieldnames=["text","label"],delimiter=',')
        writer.writeheader()
        for line in allan :
            if line.strip() == '':
                continue  
            else:
                writer.writerow({'text':line.strip(),'label':0})
        for line in robert :
            if line=='\\n':
                continue  
            else:
                writer.writerow({'text':line,'label':1})
except FileNotFoundError as error:
    print(error)
        

In [29]:
import pandas as pd

try:
    # Try reading the file with a different encoding, like 'ISO-8859-1'
    csv_data = pd.read_csv('csv_file.csv', encoding='ISO-8859-1')
    
    X = csv_data['text']  # Features (texts)
    y = csv_data['label']  # Labels
    
except UnicodeDecodeError as e:
    print(f"Encoding error: {e}")
train_text, test_text, Ytrain, Ytest = train_test_split(X, y)


In [30]:
len(Ytrain), len(Ytest)

(1724, 575)

In [31]:
idx = 1
word2idx = {'<unk>': 0}

In [32]:
# populate word2idx
for text in train_text:
    tokens = text.split()
    for token in tokens:
        if token not in word2idx:
            word2idx[token] = idx
            idx += 1

In [33]:
word2idx

{'<unk>': 0,
 'What': 1,
 'signify': 2,
 'a': 3,
 "donkey's": 4,
 'cars': 5,
 'and': 6,
 'bottle,': 7,
 'They': 8,
 'follow': 9,
 'me-': 10,
 'they': 11,
 'lead': 12,
 'me': 13,
 'through': 14,
 'the': 15,
 'years.': 16,
 "He's": 17,
 'after': 18,
 'an': 19,
 'open': 20,
 'door': 21,
 'to': 22,
 'get': 23,
 'out-doors.': 24,
 'With': 25,
 'gourd': 26,
 'grape': 27,
 'luxuriant': 28,
 'grew.': 29,
 "Haven't": 30,
 'as': 31,
 'good.': 32,
 "don't": 33,
 'go': 34,
 'with': 35,
 'farm.': 36,
 'And': 37,
 'every': 38,
 'gentle': 39,
 'air': 40,
 'that': 41,
 'dallied,': 42,
 'You': 43,
 'said': 44,
 "you'd": 45,
 'seen': 46,
 'stone': 47,
 'baptismal': 48,
 'font.': 49,
 "'Take": 50,
 'all': 51,
 'you': 52,
 "want.'": 53,
 "'I": 54,
 'shall': 55,
 'suspect-': 56,
 '-': 57,
 "'": 58,
 'Was': 59,
 'counting': 60,
 'winter': 61,
 'dinners,': 62,
 'one': 63,
 'hill,': 64,
 'Would': 65,
 'have': 66,
 'been': 67,
 'Starks,': 68,
 'doubtless': 69,
 'here': 70,
 "to-day.'": 71,
 'I': 72,
 'thought'

In [34]:
len(word2idx)


3721

In [35]:
# convert data into integer format
train_text_int = []
test_text_int = []

for text in train_text:
    tokens = text.split()
    line_as_int = [word2idx[token] for token in tokens]
    train_text_int.append(line_as_int)

for text in test_text:
    tokens = text.split()
    line_as_int = [word2idx.get(token, 0) for token in tokens]
    test_text_int.append(line_as_int)

In [36]:
# initialize A and pi matrices - for both classes
V = len(word2idx)

A0 = np.ones((V, V))
pi0 = np.ones(V)

A1 = np.ones((V, V))
pi1 = np.ones(V)

In [43]:
# compute counts for A and pi
def compute_counts(text_as_int, A, pi):
    for tokens in text_as_int:
        last_idx = None
    for idx in tokens:
        if last_idx is None:
        # it's the first word in a sentence
            pi[idx] += 1
        else:
        # the last word exists, so count a transition
            A[last_idx, idx] += 1

      # update last idx
        last_idx = idx


compute_counts([t for t, y in zip(train_text_int, Ytrain) if y == 0], A0, pi0)
compute_counts([t for t, y in zip(train_text_int, Ytrain) if y == 1], A1, pi1)

In [44]:
# normalize A and pi so they are valid probability matrices
# convince yourself that this is equivalent to the formulas shown before
A0 /= A0.sum(axis=1, keepdims=True)
pi0 /= pi0.sum()

A1 /= A1.sum(axis=1, keepdims=True)
pi1 /= pi1.sum()

In [45]:
# log A and pi since we don't need the actual probs
logA0 = np.log(A0)
logpi0 = np.log(pi0)

logA1 = np.log(A1)
logpi1 = np.log(pi1)

In [46]:
# compute priors
count0 = sum(y == 0 for y in Ytrain)
count1 = sum(y == 1 for y in Ytrain)
total = len(Ytrain)
p0 = count0 / total
p1 = count1 / total
logp0 = np.log(p0)
logp1 = np.log(p1)
p0, p1

(0.3091647331786543, 0.6908352668213457)

In [57]:
# build a classifier
class Classifier:
    def __init__(self, logAs, logpis, logpriors):
        self.logAs = logAs
        self.logpis = logpis
        self.logpriors = logpriors
        self.K = len(logpriors) # number of classes
  
    def _compute_log_likelihood(self, input_, class_):
        logA = self.logAs[class_]
        logpi = self.logpis[class_]

        last_idx = None
        logprob = 0
        for idx in input_:
            if last_idx is None:
        # it's the first token
                logprob += logpi[idx]
            else:
                logprob += logA[last_idx, idx]
      
      # update last_idx
            last_idx = idx
    
        return logprob
  
    def predict(self, inputs):
        predictions = np.zeros(len(inputs))
        for i, input_ in enumerate(inputs):
            posteriors = [self._compute_log_likelihood(input_, c) + self.logpriors[c] \
                 for c in range(self.K)]
            pred = np.argmax(posteriors)
            predictions[i] = pred
        return predictions

In [58]:
clf = Classifier([logA0, logA1], [logpi0, logpi1], [logp0, logp1])


In [62]:
from sklearn.metrics import confusion_matrix, f1_score

f1_score(Ytrain, Ptrain)


0.8177136972193615

In [63]:
f1_score(Ytest, Ptest)


0.8082901554404144