# **Importing**

In [475]:
import numpy as np
import pandas  as pd
import math
import keras
import sklearn.model_selection
from sklearn.cluster import KMeans
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **Tokenization**

In [476]:
def tokenization(data):

  vocabulary = {}
  finalDocumentIndices = []

  for i in data.keys():
    text = data[i]
    text.translate(str.maketrans('', '', string.punctuation))

    tokensText = nltk.word_tokenize(text) # tokenizacija dokumenata
    stopwords=nltk.corpus.stopwords.words('english') # micanje stopwords (the, at ...)

    tokensText=[w for w in tokensText if w.lower() not in stopwords]
    tokensText=[w.lower() for w in tokensText] # lower case slova

    tokensText=[w for w in tokensText if len(w)>2] # samo tokeni duljine >2
    p= PorterStemmer() # uzimanje korijena rijeci algoritmom Porter Stemmer
    tokensText = [p.stem(w) for w in tokensText]   

    tokenIndices = []
    counter = len(vocabulary) - 1

    for token in tokensText:
      if token not in vocabulary:
        counter = counter + 1

        vocabulary[token] = counter
        tokenIndices.append(counter)
      else:
        tokenIndices.append(vocabulary[token])
      
    finalDocumentIndices.append(tokenIndices)
   
  return vocabulary, finalDocumentIndices 

def tokenizationWithVocabulary(data, vocabulary):
  finalDocumentIndices = []

  for i in data.keys():
    text = data[i]
    text.translate(str.maketrans('', '', string.punctuation))

    tokensText = nltk.word_tokenize(text) 
    stopwords=nltk.corpus.stopwords.words('english') 

    tokensText=[w for w in tokensText if w.lower() not in stopwords]
    tokensText=[w.lower() for w in tokensText] 

    tokensText=[w for w in tokensText if len(w)>2] 
    p= PorterStemmer() 
    tokensText = [p.stem(w) for w in tokensText]   

    tokenIndices = []

    for token in tokensText:
      if token in vocabulary:
        tokenIndices.append(vocabulary[token])
    finalDocumentIndices.append(tokenIndices)
  return finalDocumentIndices

# **Feature extraction and TFIDF matrix construction**

In [477]:
def features(finalDocumentIndices, frequencyTreshold, dataframeClass):
  
  wordFrequency = {}
  hamCounter = 0
  spamCounter = 0

  for k,document in enumerate(finalDocumentIndices):
    k = k + dataframeClass.keys().start
    if dataframeClass[k] == "ham":
      hamCounter = hamCounter + 1 

    if dataframeClass[k] == "ham":
      continue

    for index in document:
      if index in wordFrequency:
        wordFrequency[index] = wordFrequency[index] + 1
      else:
        wordFrequency[index] = 1

      mostFrequentWords = []

      for tokenIndex in wordFrequency.keys():
        if wordFrequency[tokenIndex] > frequencyTreshold:
            mostFrequentWords.append(tokenIndex)

  mostFrequentWordsDict = {}
  i = 0

  for tokenIndex in mostFrequentWords:
    mostFrequentWordsDict[i] = tokenIndex
    i = i + 1

  return mostFrequentWordsDict
  

def createTfidfMatrix(mostFrequentWordsDict, finalDocumentIndices):
  featureIndices = mostFrequentWordsDict.values()

  rows = []
  rows = list(featureIndices)
  columns = []

  for listIndices in finalDocumentIndices:
    featureVector = [0] * (len(featureIndices))

    for i in listIndices:
      if i in rows:
        featureVector[rows.index(i)] = listIndices.count(i)

    columns.append(featureVector)

  tfidf = TfidfTransformer(norm=False,use_idf=True,sublinear_tf=True, smooth_idf=True)
  tfidf.fit(columns)
  tfidfMatrix = tfidf.transform(columns)

  return tfidfMatrix

# **Reading data**

In [478]:
dataframe = pd.read_csv('spam.csv', encoding= "ISO-8859-1")

dataframe = dataframe.iloc[:,:2]

dataframe.columns = ["Class","Content"]
dataframe["Class_number"] = pd.factorize(dataframe["Class"])[0]

#char -> int
def contentToIntegers(dataframeRow):
  return [ord(character) for character in dataframeRow["Content"]]

dataframe["Content_integers"] = dataframe.apply(contentToIntegers, axis=1)

#naci maxInteger iz Content_integers za skaliranje na [0,1]
maxInteger = -math.inf
for i in range(len(dataframe["Content_integers"])):
  for j in range(len(dataframe["Content_integers"][i])):
    if dataframe["Content_integers"][i][j] > maxInteger:
      maxInteger = dataframe["Content_integers"][i][j]

X = dataframe["Content_integers"].values

for i in range(len(dataframe)):
  X[i] = np.asarray(X[i])

y = dataframe["Class_number"].values
y = y.reshape(len(y), 1)

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.3)

maxLength = 100
X_train = keras.preprocessing.sequence.pad_sequences(X_train, maxlen=maxLength) #ujednacavanje duljine poruka
X_test = keras.preprocessing.sequence.pad_sequences(X_test, maxlen=maxLength)

X_train = X_train / maxInteger #skaliranje na [0,1]
X_test = X_test / maxInteger

# reshape
X_test = X_test.T
X_train = X_train.T
y_test = y_test.T
y_train = y_train.T

# **K-means**

In [479]:
def kMeans(X_train,y_train, X_test, y_test):
  print('Kmeans algoritam:')
  X_train = X_train.T
  X_test = X_test.T
  kmeans = KMeans(n_clusters = 2, init = 'k-means++', max_iter = 1000, tol = 1e-5, copy_x = False).fit(X_train)
  trainLabels = kmeans.labels_ #(procitaj oznake)
  
  testPredict = kmeans.predict(X_test)

  numberOfZeros = 0
  numberOfOnes = 0
  for i in range(X_test.shape[0]):
    if testPredict[i] == 0:
      if y_test[0][i] == 0:
        numberOfZeros = numberOfZeros + 1
      else:
        numberOfOnes = numberOfOnes + 1

  print("Prva klasa:")
  print('Broj "ham" SMS poruka:', numberOfZeros)
  print('Broj "spam" SMS poruka:', numberOfOnes)

  numberOfZeros = 0
  numberOfOnes = 0

  for i in range(X_test.shape[0]):
    if testPredict[i] == 1:
      if y_test[0][i] == 0:
        numberOfZeros = numberOfZeros + 1
      else:
        numberOfOnes = numberOfOnes + 1
  print("Druga klasa:")
  print('Broj "ham" SMS poruka:', numberOfZeros)
  print('Broj "spam" SMS poruka:', numberOfOnes)



# **K-nearest neighbors**

In [480]:
def kNearestNeighbors(X_train,y_train, X_test, y_test):
  print('KNN algoritam:')
  X_train = X_train.T
  y_train = np.squeeze(y_train)
  X_test = X_test.T
  y_test = np.squeeze(y_test)
  knn = KNeighborsClassifier(n_neighbors = 2).fit(X_train, y_train)

  trainLabels = []
  testPredict = knn.predict(X_test)

  numberOfZeros = 0
  numberOfOnes = 0
  for i in range(X_test.shape[0]):
    if testPredict[i] == 0:
      if y_test[i] == 0:
        numberOfZeros = numberOfZeros + 1
      else:
        numberOfOnes = numberOfOnes + 1
  print("Prva klasa:")
  print('Broj "ham" SMS poruka:', numberOfZeros)
  print('Broj "spam" SMS poruka:', numberOfOnes)

  numberOfZeros = 0
  numberOfOnes = 0

  for i in range(X_test.shape[0]):
    if testPredict[i] == 1:
      if y_test[i] == 0:
        numberOfZeros = numberOfZeros + 1
      else:
        numberOfOnes = numberOfOnes + 1
  print("Druga klasa:")
  print('Broj "ham" SMS poruka:', numberOfZeros)
  print('Broj "spam" SMS poruka:', numberOfOnes)


# **Testing**

In [481]:
# Randomiziranje uzorka

numberOfMessages = dataframe['Class'].shape[0]
def randomChoices(number, trainPercent):

  percentage = int(trainPercent * number)

  randomTrain = np.random.choice(number, percentage, replace = False)

  randomTest = []
  for i in range(number):
    if i not in randomTrain:
      randomTest.append(i)
  return randomTrain, randomTest

randomTrain, randomTest = randomChoices(numberOfMessages, 0.7)
randomizedOrder = np.concatenate((randomTrain,randomTest))

tempDataframe = dataframe.copy()
tempy = y.copy()

for i in range(numberOfMessages):
  dataframe.iloc[i] = tempDataframe.iloc[randomizedOrder[i]]
  y[i][0] = tempy[randomizedOrder[i]][0]

vocabulary, finalDocumentIndices = tokenization(dataframe["Content"][0:3899])

mostFrequentWordsDict = features(finalDocumentIndices, 12, dataframe["Class"][0:3899])

matrix = createTfidfMatrix(mostFrequentWordsDict, finalDocumentIndices)


finalDocumentIndicesTest = tokenizationWithVocabulary(dataframe["Content"][3899:], vocabulary)

matrixTest = createTfidfMatrix(mostFrequentWordsDict, finalDocumentIndicesTest)

matrix = matrix.T
y = y.T
yTest = y[0][3899:].copy()
yTest = yTest.T
yTrain = y[0][:3899].copy()
yTrain = yTrain.T
matrixTest = matrixTest.T

yTest = np.expand_dims(yTest, axis = 0)
yTrain = np.expand_dims(yTrain, axis = 0)

matrixOutput = matrix.toarray()
matrixTestOutput = matrixTest.toarray()
np.savetxt("X_trainTokenized.csv", matrixOutput, delimiter=",")
np.savetxt("X_testTokenized.csv", matrixTestOutput, delimiter=",")
np.savetxt("ShuffledLabels.csv",y,delimiter=",")

kMeans(X_train, y_train, X_test, y_test)
print("\n")
kMeans(matrix, yTrain, matrixTest, yTest)
print("\n")
kNearestNeighbors(X_train, y_train, X_test, y_test)
print("\n")
kNearestNeighbors(matrix, yTrain, matrixTest, yTest)

Kmeans algoritam:
Prva klasa:
Broj "ham" SMS poruka: 882
Broj "spam" SMS poruka: 12
Druga klasa:
Broj "ham" SMS poruka: 558
Broj "spam" SMS poruka: 220


Kmeans algoritam:
Prva klasa:
Broj "ham" SMS poruka: 13
Broj "spam" SMS poruka: 148
Druga klasa:
Broj "ham" SMS poruka: 1426
Broj "spam" SMS poruka: 86


KNN algoritam:
Prva klasa:
Broj "ham" SMS poruka: 1427
Broj "spam" SMS poruka: 146
Druga klasa:
Broj "ham" SMS poruka: 13
Broj "spam" SMS poruka: 86


KNN algoritam:
Prva klasa:
Broj "ham" SMS poruka: 1435
Broj "spam" SMS poruka: 108
Druga klasa:
Broj "ham" SMS poruka: 4
Broj "spam" SMS poruka: 126
