In [410]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pickle

In [411]:
x_train_file = open("x_train.csv")
x_train = []
for idx, i in enumerate(x_train_file):
    if idx == 0:
        continue
    x = i.strip().split(",")
    x = x[0] + ": " + x[1]
    x_train.append(x)

y_train_file = open("y_train.csv")
y_train = []
for idx, i in enumerate(y_train_file):
    if idx == 0:
        continue
    y_train.append(i)

y_train = np.asarray(y_train,dtype=int).reshape(len(y_train),1)


vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
vectorizer.get_feature_names_out()
corpus_size = x_train.shape[1]
x_train = x_train.toarray()


x_val, y_val = None, None 
print(y_train.shape)
print(x_train.shape)
print(f'corpus size {corpus_size}')

(2400, 1)
(2400, 3631)
corpus size 3631


In [412]:
indices = np.random.permutation(x_train.shape[0])
training_idx, test_idx = indices[:2000], indices[2000:] #16% Validation, 84% Training
x_train, x_val = x_train[training_idx,:], x_train[test_idx,:]
y_train, y_val = y_train[training_idx,:], y_train[test_idx,:]

print(x_train.shape, x_val.shape)
print(y_train.shape, y_val.shape)



(2000, 3631) (400, 3631)
(2000, 1) (400, 1)


In [413]:
class LG_Model():
    def __init__(self, corpus_size) -> None:
        self.W_T = np.random.rand(corpus_size,1)
        self.B = 0
        self.learning_rate = 0.2
    
    def sigmoid_activation(self, r):
        return 1/(1+np.exp(-r))


    def forward(self, X, Y):
        #print("Y shape", Y.shape)
        #print("X shape", X.shape)
        #print("W shape", self.W_T.shape)
        
        m = X.shape[0]
        h_x = self.sigmoid_activation(np.dot(X, self.W_T)+self.B)
        dw = np.dot(X.T, (h_x - Y))/m
        db = np.sum(h_x - Y)/m
       
        return dw, db
    
    def predict_prob(self,X):
        h_x = self.sigmoid_activation(np.dot(X, self.W_T)+self.B)
        return h_x


    def predict(self,X):
        m = X.shape[0]
        h_x = self.sigmoid_activation(np.dot(X, self.W_T)+self.B)
        y_pred = np.zeros((m,1))
        for i in range(m):
            if h_x[i] > 0.5:
                y_pred[i] = 1
        return y_pred

    def save_model(self):
        model = {"W":self.W_T, "B": self.B}
        with open("model.pkl", "wb") as f:
            pickle.dump(model,f)

    def load_model(self):
        with open("model.pkl", "rb") as f:
            model = pickle.load(f)
        self.W_T = model["W"]
        self.B = model["B"]

    def train(self,X_train, Y_train, X_val=None, Y_val=None, iterations=10):
   
        for e in range(iterations):
            dw, db = self.forward(X_train,Y_train)

            #print(dw.shape, db.shape)
            self.W_T -= self.learning_rate * dw
            self.B -= self.learning_rate * db

            if e%100 == 0:
                print(f'Epoch {e} / {iterations}')
                if  isinstance(X_val,np.ndarray)  and isinstance(Y_val,np.ndarray):
                    y_pred = self.predict(X_val)
                    m = X_val.shape[0]
                    acc = np.sum(y_pred == Y_val)/m
                    print("Validation ACC: ", acc)
                y_pred = self.predict(X_train)
                m = X_train.shape[0]
                acc = np.sum(y_pred == Y_train)/m
                print("Train ACC: ", acc)
                print()

                self.save_model()




In [414]:
print(x_train.shape, y_train.shape)
model = LG_Model(corpus_size)
model.train(X_train=x_train, Y_train=y_train, X_val=x_val, Y_val=y_val, iterations=3000)

(2000, 3631) (2000, 1)
Epoch 0 / 3000
Validation ACC:  0.48
Train ACC:  0.504

Epoch 100 / 3000
Validation ACC:  0.475
Train ACC:  0.5075

Epoch 200 / 3000
Validation ACC:  0.485
Train ACC:  0.5255

Epoch 300 / 3000
Validation ACC:  0.5075
Train ACC:  0.546

Epoch 400 / 3000
Validation ACC:  0.51
Train ACC:  0.5635

Epoch 500 / 3000
Validation ACC:  0.535
Train ACC:  0.5765

Epoch 600 / 3000
Validation ACC:  0.5525
Train ACC:  0.5915

Epoch 700 / 3000
Validation ACC:  0.565
Train ACC:  0.61

Epoch 800 / 3000
Validation ACC:  0.5775
Train ACC:  0.625

Epoch 900 / 3000
Validation ACC:  0.5825
Train ACC:  0.638

Epoch 1000 / 3000
Validation ACC:  0.5825
Train ACC:  0.645

Epoch 1100 / 3000
Validation ACC:  0.6
Train ACC:  0.6555

Epoch 1200 / 3000
Validation ACC:  0.6075
Train ACC:  0.664

Epoch 1300 / 3000
Validation ACC:  0.6125
Train ACC:  0.6695

Epoch 1400 / 3000
Validation ACC:  0.6225
Train ACC:  0.6745

Epoch 1500 / 3000
Validation ACC:  0.625
Train ACC:  0.6835

Epoch 1600 / 3000

In [None]:
print(x_val[0])

In [415]:
x_test_file = open("x_test.csv")
x_test = []
for idx, i in enumerate(x_test_file):
    if idx == 0:
        continue
    x = i.strip().split(",")
    x = x[0] + ": " + x[1]
    x_test.append(x)
x_test = vectorizer.transform(x_test).toarray()

In [416]:
model = LG_Model(corpus_size)
model.load_model()

if  isinstance(x_val,np.ndarray)  and isinstance(y_val,np.ndarray):
        y_pred = model.predict(x_val)
        m = x_val.shape[0]
        acc = np.sum(y_pred == y_val)/m
        print("Testing Validation ACC: ", acc)

y_pred = model.predict(x_train)
m = x_train.shape[0]
acc = np.sum(y_pred == y_train)/m
print("Testing Train ACC: ", acc)


yprob_test = model.predict_prob(x_test)
yprob_test.shape

np.savetxt('yprob_test.txt',yprob_test, delimiter=',')
print(yprob_test)

Testing Validation ACC:  0.6675
Testing Train ACC:  0.7715
[[0.51055266]
 [0.6262346 ]
 [0.46332037]
 [0.27911102]
 [0.43043685]
 [0.33779967]
 [0.27103559]
 [0.35609782]
 [0.510062  ]
 [0.63122819]
 [0.34832411]
 [0.47810235]
 [0.3099395 ]
 [0.26394491]
 [0.54870379]
 [0.33443672]
 [0.28979178]
 [0.25101635]
 [0.44468465]
 [0.42470182]
 [0.51701235]
 [0.4163143 ]
 [0.35549512]
 [0.24644516]
 [0.48470201]
 [0.60807674]
 [0.56554683]
 [0.47448211]
 [0.35831797]
 [0.31102313]
 [0.47105921]
 [0.39572884]
 [0.48544887]
 [0.27161251]
 [0.5156316 ]
 [0.56883649]
 [0.39991264]
 [0.15099081]
 [0.48605419]
 [0.49949325]
 [0.27791854]
 [0.34515711]
 [0.3409223 ]
 [0.4794213 ]
 [0.48286522]
 [0.24182564]
 [0.27586477]
 [0.27768682]
 [0.44396813]
 [0.35831797]
 [0.32452169]
 [0.31520529]
 [0.7688538 ]
 [0.39410865]
 [0.34553802]
 [0.27524402]
 [0.41317472]
 [0.39430913]
 [0.5082919 ]
 [0.38878686]
 [0.50659487]
 [0.40065834]
 [0.34795196]
 [0.28435745]
 [0.34289921]
 [0.36546116]
 [0.29016759]
 [0