<a href="https://colab.research.google.com/github/kk412027247/nlp/blob/main/SMS_Spam_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

%tensorflow_version 2.x
import tensorflow as tf
import os
import io
import pandas as pd
import re
tf.__version__

!pip install stopwordsiso
!pip install stanfordnlp
!pip install stanza
# !pip install git+git://github.com/stanfordnlp/stanza.git@dev


import stanfordnlp as snlp
import stopwordsiso as stopwords
import stanza



# Download the UCI SMS Spam Collection archive through the Keras file helper.
path_to_zip = tf.keras.utils.get_file("smsspamcollection.zip",
                                      origin = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip",extract=True)
# IPython shell escape: unpack the archive into ./data, which is where the
# parsing code below expects to find data/SMSSpamCollection.
!unzip $path_to_zip -d data
# Fetch the English models for both stanfordnlp and stanza.
# NOTE(review): `en` is rebound to a stanza Pipeline further down before it is
# ever read, so the values assigned here appear to be unused.
en = snlp.download('en')
en = stanza.download('en')



# Read the raw corpus: one message per line, formatted as "<label>\t<text>".
# The context manager closes the file handle deterministically, and the
# encoding is pinned so parsing does not depend on the platform default.
with io.open('data/SMSSpamCollection', encoding='utf-8') as corpus_file:
  lines = corpus_file.read().strip().split('\n')
lines[0]  # notebook preview of the first raw record

# Convert every record into an (is_spam, message) tuple: 1 for spam, 0 otherwise.
spam_dataset = []
for line in lines:
  # Split on the first tab only, so a tab inside the message body cannot
  # break the two-way unpacking.
  label, text = line.split('\t', 1)
  is_spam = 1 if label.lower().strip() == 'spam' else 0
  spam_dataset.append((is_spam, text.strip()))
print(spam_dataset[0])

df = pd.DataFrame(spam_dataset, columns=['Spam', 'Message'])

def message_lenth(x):
  """Return the length of ``x`` as reported by ``len``."""
  n_chars = len(x)
  return n_chars

def num_capitals(x):
  """Return how many ASCII uppercase letters (A-Z) occur in ``x``."""
  uppercase_hits = re.findall(r'[A-Z]', x)
  return len(uppercase_hits)

def num_puntuation(x):
  """Return how many non-word characters occur in ``x``.

  The pattern is the regex non-word class, so whitespace is counted along
  with punctuation marks, while underscores are not.
  """
  non_word_hits = re.findall(r'\W', x)
  return len(non_word_hits)

# Derive the three surface features from the raw message text. The columns are
# added in the order Capitals, Punctuation, Length.
for column, extractor in (('Capitals', num_capitals),
                          ('Punctuation', num_puntuation),
                          ('Length', message_lenth)):
  df[column] = df['Message'].apply(extractor)
df.describe()

# Reproducible 80/20 split: sample the training rows, keep the rest for testing.
train = df.sample(frac=0.8, random_state=42)
test = df.drop(train.index)

# Inputs are the surface features; the target stays a one-column DataFrame.
base_features = ['Length', 'Capitals', 'Punctuation']
x_train, y_train = train[base_features], train[['Spam']]
x_test, y_test = test[base_features], test[['Spam']]


def make_model(input_dims=3, num_units=12):
  """Build and compile a two-layer dense binary classifier.

  Args:
    input_dims: Number of input features fed to the hidden layer.
    num_units: Width of the ReLU hidden layer.

  Returns:
    A compiled ``tf.keras.Sequential`` model with a single sigmoid output,
    binary cross-entropy loss, the Adam optimizer and an accuracy metric.
  """
  hidden = tf.keras.layers.Dense(num_units,
                                 input_dim=input_dims,
                                 activation='relu')
  output = tf.keras.layers.Dense(1, activation='sigmoid')

  model = tf.keras.Sequential()
  for layer in (hidden, output):
    model.add(layer)

  model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])
  return model

# Baseline classifier over the three surface features (make_model defaults to
# input_dims=3).
model = make_model()
# Training and evaluation of the baseline are currently disabled:
# model.fit(x_train, y_train, epochs=10, batch_size=10)

# model.evaluate(x_test, y_test)

# y_train_pred = model.predict(x_train)
# x = tf.math.confusion_matrix(tf.constant(y_train.Spam), y_train_pred)
# print(x)

# Sample message, split on whitespace as a naive tokenization example.
sentence = 'Go until jurong point, crazy.. Available only in bugis n great world'
sentence.split()



# Disabled demo: tokenize the sample sentence with a stanfordnlp pipeline and
# print each token, sentence by sentence.
# en = snlp.Pipeline(lang='en', processors='tokenize')

# tokenized = en(sentence)
# len(tokenized.sentences)

# for snt in tokenized.sentences:
#   for word in snt.tokens:
#     print(word.text)
#   print("<end of Sentence>")



# Disabled demo: the same tokenization flow applied to a Japanese string.
# jp = snlp.download('ja')
# jp = snlp.Pipeline(lang='ja', processors='tokenize')
# jp_line = jp("選挙管理委員会")

# for snt in jp_line.sentences:
#   for word in snt.tokens:
#     print(word.text)



# pipeline = snlp.Pipeline(lang='en', processors='tokenize')
# English stop-word collection from stopwordsiso; word_counts_v3 checks
# lower-cased token text for membership in it.
en_sw = stopwords.stopwords('en')

# English stanza pipeline; it is captured as the default `pipeline` argument of
# word_counts_v3 when that function is defined below.
en = stanza.Pipeline(lang='en')


def word_counts_v3(x, pipeline=en):
  """Compute token-level features for a single message.

  The message is run through ``pipeline``. Tokens whose lower-cased text is in
  the module-level ``en_sw`` stop-word collection are ignored by both counters
  (they still contribute to the total token count).

  Args:
    x: The message text.
    pipeline: Callable returning a document with ``.sentences``, each having
      ``.tokens``; each token exposes ``.text`` and ``.words[0].upos``.

  Returns:
    A ``pd.Series`` indexed by ``['Words_NoPunct', 'Punct']``:
    ``Words_NoPunct`` is the number of non-stop-word tokens whose UPOS tag is
    neither ``PUNCT`` nor ``SYM``; ``Punct`` is the number of non-stop-word
    ``PUNCT``/``SYM`` tokens divided by the total token count (0 when the
    document has no tokens). If the pipeline fails, the message is printed and
    whatever was accumulated so far is returned un-normalized.
  """
  totals = 0.
  count = 0.
  non_word = 0.
  try:
    doc = pipeline(x)
    for sentence in doc.sentences:
      totals += len(sentence.tokens)
      for token in sentence.tokens:
        if token.text.lower() in en_sw:
          continue
        if token.words[0].upos in ('PUNCT', 'SYM'):
          non_word += 1.
        else:
          count += 1.
  except Exception:
    # Best-effort: report the offending message and keep going. Catching
    # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
    print(x)
    return pd.Series([count, non_word], index=['Words_NoPunct', 'Punct'])

  # Guard the ratio explicitly instead of relying on a ZeroDivisionError for
  # documents that produced no tokens.
  if totals:
    non_word = non_word / totals
  return pd.Series([count, non_word], index=['Words_NoPunct', 'Punct'])

# Compute the token-level features for each split and append them as the new
# columns 'Words_NoPunct' and 'Punct'. pd.concat must receive the DataFrame
# objects themselves (not a string naming them).
train_tmp = train['Message'].apply(word_counts_v3)
train = pd.concat([train, train_tmp], axis=1)

test_tmp = test['Message'].apply(word_counts_v3)
test = pd.concat([test, test_tmp], axis=1)


# Compare feature statistics of spam vs. non-spam training messages.
print(train.loc[train.Spam == 1].describe())
print(train.loc[train.Spam == 0].describe())


# Column names must match the Series index produced by word_counts_v3
# ('Punct', not 'Punt').
x_train=train[['Length', 'Capitals', 'Punctuation', 'Words_NoPunct', 'Punct']]
y_train = train[['Spam']]
x_test = test[['Length', 'Capitals', 'Punctuation', 'Words_NoPunct', 'Punct']]
y_test = test[['Spam']]




model = make_model(input_dims=5)

model.fit(x_train, y_train, epochs=10, batch_size=10)

model.evaluate(x_test, y_test)

# The model ends in a sigmoid, so predict() yields probabilities; threshold
# them at 0.5 to get the integer class ids the confusion matrix is built from.
y_train_pred = model.predict(x_train)
y_train_classes = (y_train_pred > 0.5).astype('int32').ravel()
x = tf.math.confusion_matrix(tf.constant(y_train.Spam), y_train_classes)
print(x)

# Part-of-speech tagging demo on a single sample message.
# NOTE(review): `en` already holds a stanza Pipeline from earlier in the file;
# this rebuilds it — presumably a notebook-cell artifact, confirm before removing.
en = stanza.Pipeline(lang='en')
txt = "Yo you around? Afriend of mine's looking"
pos = en(txt)

def print_pos(doc):
  """Render a parsed document as ``word/UPOS`` pairs, one sentence per line.

  For every token the text and UPOS tag of its first word are emitted as
  ``text/upos`` followed by a space; each sentence is terminated by a newline.

  Args:
    doc: Object with ``.sentences``; each sentence has ``.tokens`` and each
      token has a non-empty ``.words`` list whose items expose ``.text`` and
      ``.upos``.

  Returns:
    The rendered string (empty when the document has no sentences).
  """
  rendered = []
  for sent in doc.sentences:
    first_words = (tok.words[0] for tok in sent.tokens)
    pairs = [w.text + '/' + w.upos + ' ' for w in first_words]
    rendered.append(''.join(pairs) + '\n')
  return ''.join(rendered)

# Show the word/UPOS rendering of the sample message parsed above.
print(print_pos(pos))



In [None]:
# Lemmatization demo: run a short passage through the English stanza pipeline
# and print every token as lemma/UPOS, one sentence per line.
text = "Stemming is aimed at reducing vocabulary and aid un-derstanding of morphological processes. This helps people un-derstand the morphology of words and reduce size of corpus."
en = stanza.Pipeline(lang='en')
lemma = en(text)

lemma_lines = []
for sentence in lemma.sentences:
  tagged = []
  for token in sentence.tokens:
    word = token.words[0]  # only the first word of each token is reported
    tagged.append(word.lemma + '/' + word.upos + ' ')
  lemma_lines.append(''.join(tagged) + '\n')
lemmas = ''.join(lemma_lines)
print(lemmas)