# Implementare (cod propriu) kMeans pt clusterizare

In [23]:
from numpy.random import choice
from math import sqrt

In [24]:
class KMeans():
    def __init__(self, numarCentroizi) -> None:
        self.numarCentroizi = numarCentroizi
        self.centroizi = []

    def alegereCentroizi(self, input):
        pozitii = [i for i in range(input.shape[0])]
        pozitii_centroizi = choice(pozitii, self.numarCentroizi)
        self.centroizi = [input[i] for i in pozitii_centroizi]

    def distantaDintreDouaPuncte(self, punct1, punct2):
        x = [(punct1[0, i] - punct2[0, i]) **2 for i in range(punct1.shape[1])]
        distanta = sqrt(sum(x))
        return distanta

    def closeCentroidForAPoint(self, punct):
        ind = 0
        distantaMinima = self.distantaDintreDouaPuncte(punct, self.centroizi[0])

        for i in range(len(self.centroizi)):
            distanta = self.distantaDintreDouaPuncte(punct, self.centroizi[i])
            if distanta < distantaMinima:
                distantaMinima = distanta
                ind = i
        return ind

    def _sumaPuncte(self, input, c, indiceCentroid):
        return sum([input[i] for i in range(input.shape[0]) if c[i] == indiceCentroid])

    def _numarPuncte(self, c, indiceCentroid):
        return c.count(indiceCentroid)
    
    def train(self, trainingInput):
        self.alegereCentroizi(trainingInput)
        convergent = False

        while not convergent:
            c = []
            for i in range(trainingInput.shape[0]):
                punct = trainingInput[i]
                ind = self.closeCentroidForAPoint(punct)
                c.append(ind)
            
            schimbarePozitieCentroidMaxima = -1
            for indiceCentroid in range(0, self.numarCentroizi):
                centroidNou = self._sumaPuncte(trainingInput, c, indiceCentroid) / self._numarPuncte(c, indiceCentroid)
                distanta = self.distantaDintreDouaPuncte(self.centroizi[indiceCentroid], centroidNou)

                if distanta > schimbarePozitieCentroidMaxima:
                    schimbarePozitieCentroidMaxima = distanta
                self.centroizi[indiceCentroid] = centroidNou
            
            if schimbarePozitieCentroidMaxima < 0.05:
                convergent = True
    
    def predict(self, input):
        return [self.closeCentroidForAPoint(i) for i in input]

In [25]:
import pandas as pd
import numpy as np

def readData(filePath:str):
    df = pd.read_csv(filePath)
    return df

def getTrainingAndValidationSets(dataFrame, trainingSize=0.8):
    dataSize = dataFrame.shape[0]
    
    trainingIndexSet = np.random.choice(range(dataSize), size=int(trainingSize * dataSize), replace=False)
    validationIndexSet = [i for i in range(dataSize) if i not in trainingIndexSet]

    trainingInputSet = [dataFrame["Text"].iloc[index] for index in trainingIndexSet]
    trainingOutputSet = [dataFrame["Sentiment"].iloc[index] for index in trainingIndexSet]

    validationInputSet = [dataFrame["Text"].iloc[index] for index in validationIndexSet]
    validationOutputSet = [dataFrame["Sentiment"].iloc[index] for index in validationIndexSet]

    return trainingInputSet, trainingOutputSet, validationInputSet, validationOutputSet


def clasificationPerformance(ground_truth, computed_values, positive_label):
    """
    Returneaza TN (True Negative), FP(False Positive), FN(False Negative), TP(True Positive)
    """
    TN = 0
    FP = 0
    FN = 0
    TP = 0
    
    for i in range(0, len(ground_truth)):
        if ground_truth[i] == positive_label:
            if computed_values[i] == positive_label:
                TP += 1
            else:
                FP += 1
        else:
            if computed_values[i] != positive_label:
                TN += 1
            else:
                FN += 1
    return TN, FP, FN, TP


def getAccuracy(TN, FP, FN, TP):
    """ 
    accuracy represents the overall performance of classification model:
    (TP+TN)/(TN+FP+FN+TP)
    """
    if (TN+FP+FN+TP) == 0:
        return 0
    return (TP+TN)/(TN+FP+FN+TP)

def getPrecision(FP, TP):
    """
    precision indicates how accurate the positive predictions are 
    TP/(TP+FP)
    """
    if (TP+FP) == 0:
        return 0
    return TP/(TP+FP)

def getRecall(TP, FN):
    """ 
    recall indicates the coverage of actual positive sample
    TP/(TP+FN)
    """
    if (TP+FN) == 0:
        return 0
    return TP/(TP+FN)


def getClassifier(trainingFeatures, numberClusters):
    clasificator = KMeans(numberClusters)
    clasificator.train(trainingFeatures)
    return clasificator

def testClassifier(classifier, validationInput, validationOutput, labels, positiveLabel):
    computedIndexes = classifier.predict(validationInput)
    computedOutput = [labels[index] for index in computedIndexes]
    TN, FP, FN, TP = clasificationPerformance(validationOutput, computedOutput, positiveLabel)
    accuracy = getAccuracy(TN, FP, FN, TP)
    precision = getPrecision(FP, TP)
    recall = getRecall(TP, FN)
    print("Accuracy: {}\nPrecision: {}\nRecall: {}".format(accuracy, precision, recall))


In [26]:
from sklearn.feature_extraction.text import CountVectorizer

def bagOfWords(trainingInput, validationInput):
    v = CountVectorizer()
    featuresForTraining = v.fit_transform(trainingInput)
    featuresForValidation = v.transform(validationInput)
    return featuresForTraining, featuresForValidation


In [28]:
datas = readData("reviews_mixed.csv")
trainingInputSet, trainingOutputSet, validationInputSet, validationOutputSet = getTrainingAndValidationSets(datas)
trainingFeatures, validationFeatures = bagOfWords(trainingInputSet, validationInputSet)
labels = [label for label in set(trainingOutputSet)]
clasificator = getClassifier(trainingFeatures, len(labels))
testClassifier(clasificator, validationFeatures, validationOutputSet, labels, 'positive')

Accuracy: 0.2857142857142857
Precision: 1.0
Recall: 0.2857142857142857
