In [2]:
# Libraries

import re 
import requests

import spacy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline

In [3]:
pd.set_option('display.max_colwidth', None)

nlp = spacy.load('en_core_web_md')

In [4]:
file_location = 'Data_NLP/spamdetection.xlsx'

df_train = pd.read_excel(file_location, sheet_name='train', dtype='str')
df_test  = pd.read_excel(file_location, sheet_name='test', dtype='str')

# <!> 
# {index: 2109, text: 645}
# I encounter  the issue TypeError: expected string or bytes-like object
# While performing pre-processing functions
# dtype='str', fixes the issue, by converting each text in string type

## Exploring our data train

In [5]:
## DF_TRAIN
print(f"Some data:\n{df_train.head(3)}\n")

print(f"Number of examples per class:\n{df_train.groupby('type').count()}\n")
# Remark: The train dataset is not balanced 

print(f"Size of df_train:\n{df_train.shape}\n")

print(f"Number of NULL values:\n{df_train.isnull().sum()}")

Some data:
  type  \
0  ham   
1  ham   
2  ham   

                                                                              text  
0  Hope you are having a good week. Just checking in to know and knowing and known  
1                                                          K..give back my thanks.  
2                                      Am also doing in cbe only. But have to pay.  

Number of examples per class:
      text
type      
ham   3460
spam   540

Size of df_train:
(4000, 2)

Number of NULL values:
type    0
text    0
dtype: int64


In [6]:
def display(ham: list, spam: list) -> None:
    """
    Nice way to display the maximum, minimum and the average of spam and ham message.
    """
    print(f"{'HAM ':>23} | {' SPAM'}")
    print(f"Max length    : {np.max(ham[0]):<7.2f} vs {np.max(spam[0]):<7.2f}\n" + \
          f"Min length    : {np.min(ham[1]):<7.2f} vs {np.min(spam[1]):<7.2f}\n" + \
          f"Average length: {np.mean(ham[2]):<7.2f} vs {np.mean(spam[2]):<7.2f}\n")


In [7]:
# Maximum / Minimum / Average size of a text ham 
ham_text_lenght = [len(text) for text in df_train[df_train['type'] == 'ham']['text']]
spam_text_lenght = [len(text) for text in df_train[df_train['type'] == 'spam']['text']]

display(ham=[np.max(ham_text_lenght), np.min(ham_text_lenght), np.mean(ham_text_lenght)],
        spam=[np.max(spam_text_lenght), np.min(spam_text_lenght), np.mean(spam_text_lenght)]) 

# Observation: Spam messages have more words than ham messages

                   HAM  |  SPAM
Max length    : 910.00  vs 197.00 
Min length    : 2.00    vs 13.00  
Average length: 70.52   vs 139.26 



In [8]:
# Maximum / Minimum / Average size of non-alphanumaric characters in a text 

ham_non_alphanumeric = [len([character for character in re.findall(r'[\W]', text) \
                             if character != ' '])
                        for text in df_train[df_train['type'] == 'ham']['text']]

spam_non_alphanumeric = [len([character for character in re.findall(r'[\W]', text) \
                              if character != ' '])
                         for text in df_train[df_train['type'] == 'spam']['text']]

display(ham=[np.max(ham_non_alphanumeric), np.min(ham_non_alphanumeric), np.mean(ham_non_alphanumeric)],
        spam=[np.max(spam_non_alphanumeric), np.min(spam_non_alphanumeric), np.mean(spam_non_alphanumeric)]) 

# Observation: Spam messages have more non-alphanumeric characters than ham messages

                   HAM  |  SPAM
Max length    : 61.00   vs 26.00  
Min length    : 0.00    vs 0.00   
Average length: 3.59    vs 6.15   



In [9]:
# Maximum / Minimum / Average size of uppercase words in a text 

ham_upper_case = [len(re.findall(r'\b[A-Z0-9]{2}\b', text)) \
                  for text in df_train[df_train['type'] == 'ham']['text']]

spam_upper_case = [len(re.findall(r'\b[A-Z0-9]{2}\b', text)) \
                   for text in df_train[df_train['type'] == 'spam']['text']]

display(ham=[np.max(ham_upper_case), np.min(ham_upper_case), np.mean(ham_upper_case)],
        spam=[np.max(spam_upper_case), np.min(spam_upper_case), np.mean(spam_upper_case)]) 

# Observation: 
# My initial intention was that spam messages would contain more capitalized words.
# According to our data this intuition is wrong

                   HAM  |  SPAM
Max length    : 9.00    vs 5.00   
Min length    : 0.00    vs 0.00   
Average length: 0.11    vs 0.79   



In [10]:
## DF_TEST
print(f"Some data:\n{df_test.head(3)}\n")
print(f"Size of df_test:\n{df_test.shape}\n") 
print(f"Number of NULL values:\n{df_test.isnull().sum()}")

Some data:
  id  \
0  1   
1  2   
2  3   

                                                                                                                                     text  
0  I wnt to buy a BMW car urgently..its vry urgent.but hv a shortage of  # Lacs.there is no source to arng dis amt. # lacs..thats my prob  
1                                                                                      Dunno lei shd b driving lor cos i go sch 1 hr oni.  
2                                                                                   Dun need to use dial up juz open da browser n surf...  

Size of df_test:
(1559, 2)

Number of NULL values:
id      0
text    0
dtype: int64


 ## Data Proccessing

In [11]:
# Saving a copy of df_train and df_train, because in what follows we will modify the dataframes

old_df_train = df_train.copy()
old_df_test  = df_test.copy()

In [12]:
def preprocessing(train, df, nb=10, verbose=True):
      
    if verbose:
        print(f"Initial DF:\n{df['text'].head(nb)}\n")
    
    if train:
        # 1. Convert the label to a categorical variable
        df['label'] = df['type'].apply(lambda label: 1 if label == "ham" else 0)

    # 2. Remove the Non-alphanumeric
    #df['preprocessed_text'] = df['text'].apply(lambda text: re.sub(r'\W', ' ', text))
    
    #if verbose: print(f"2. Remove the Non-alphanumeric:\n{df['preprocessed_text'].head(nb)}\n")

    # 3. Remove the digits
    df['preprocessed_text'] = df['text'].apply(lambda text: re.sub(r'\d+', 'digit ', text))

    if verbose:
        print(f"3. Remove the digits:\n{df['preprocessed_text'].head(nb)}\n")
        
    # 4. Lemmatiation
    df['preprocessed_text'] = df['preprocessed_text'].apply(lambda text: ' '.join([word.lemma_  
                                                                     for word in nlp(text)]))
    if verbose:
        print(f"4. Lemmatiation:\n{df['preprocessed_text'].head(nb)}\n")
    
    # 5. Lower case
    #df['preprocessed_text'] = df['preprocessed_text'].apply(lambda text: text.lower())

    #if verbose: print(f"5. Lower case:\n{df['preprocessed_text'].head(nb)}\n")
        
    # 6. Remove stopwords
    df['preprocessed_text'] = df['preprocessed_text'].apply(lambda text: re.sub(r'\b(' + \
                                                            r'|'.join(nlp.Defaults.stop_words) + \
                                                            r')\b', ' ', text))
    if verbose:
        print(f"6. Remove stopwords:\n{df['preprocessed_text'].head(nb)}\n")
    
    # 7. Remove words with lenght 1 
    df['preprocessed_text'] = df['preprocessed_text'].apply(lambda text:
                                                            re.sub(r'(^\w{0,1}\s|\s\w{0,1}\s|\s\w{0,1}$)', 
                                                            ' ',text))
    if verbose:
        print(f"7. Remove words with lenght 1 or 2:\n{df['preprocessed_text'].head(nb)}\n")

    # 8. Remove the extra spaces at the middle, the beginning and the end of the text    
    df['preprocessed_text'] = df['preprocessed_text'].apply(lambda text: re.sub(r'\s+', ' ', text))        
    df['preprocessed_text'] = df['preprocessed_text'].apply(lambda text: re.sub(r'(^\s+|\s+$)', ' ', text)) 
    
    if verbose:
        print(f"8. Remove spaces:\n{df['preprocessed_text'].head(nb)}\n")
    
    # 9. Remove duplicates 
    df['preprocessed_text'] = df['preprocessed_text'].apply(lambda text: re.sub(r'\b(\w+)( \1\b)+', r'\1', text))
    
    if verbose:
        print(f"9. Remove duplicates:\n{df['preprocessed_text'].head(nb)}\n")
    
    return df

In [13]:
df_test  = preprocessing(train=False, df=old_df_test, nb=1, verbose=True)

Initial DF:
0    I wnt to buy a BMW car urgently..its vry urgent.but hv a shortage of  # Lacs.there is no source to arng dis amt. # lacs..thats my prob
Name: text, dtype: object

3. Remove the digits:
0    I wnt to buy a BMW car urgently..its vry urgent.but hv a shortage of  # Lacs.there is no source to arng dis amt. # lacs..thats my prob
Name: preprocessed_text, dtype: object

4. Lemmatiation:
0    I wnt to buy a BMW car urgently .. its vry urgent.but hv a shortage of   # lacs.there be no source to arng dis amt . # lac .. that my prob
Name: preprocessed_text, dtype: object

6. Remove stopwords:
0    I wnt   buy   BMW car urgently ..   vry urgent.  hv   shortage     # lacs.      source   arng dis amt . # lac ..     prob
Name: preprocessed_text, dtype: object

7. Remove words with lenght 1 or 2:
0     wnt  buy  BMW car urgently ..  vry urgent. hv  shortage   # lacs.   source  arng dis amt . # lac ..   prob
Name: preprocessed_text, dtype: object

8. Remove spaces:
0     wnt buy BMW car u

In [14]:
df_train = preprocessing(train=True, df=old_df_train, nb=1, verbose=True)


Initial DF:
0    Hope you are having a good week. Just checking in to know and knowing and known
Name: text, dtype: object

3. Remove the digits:
0    Hope you are having a good week. Just checking in to know and knowing and known
Name: preprocessed_text, dtype: object

4. Lemmatiation:
0    hope you be have a good week . just check in to know and know and know
Name: preprocessed_text, dtype: object

6. Remove stopwords:
0    hope         good week .   check     know   know   know
Name: preprocessed_text, dtype: object

7. Remove words with lenght 1 or 2:
0    hope     good week .  check   know  know  know
Name: preprocessed_text, dtype: object

8. Remove spaces:
0    hope good week . check know know know
Name: preprocessed_text, dtype: object

9. Remove duplicates:
0    hope good week . check know
Name: preprocessed_text, dtype: object



In [15]:
df_train.head(3)

Unnamed: 0,type,text,label,preprocessed_text
0,ham,Hope you are having a good week. Just checking in to know and knowing and known,1,hope good week . check know
1,ham,K..give back my thanks.,1,.. thank .
2,ham,Am also doing in cbe only. But have to pay.,1,cbe . pay .


In [16]:
df_test.head(3)

Unnamed: 0,id,text,preprocessed_text
0,1,I wnt to buy a BMW car urgently..its vry urgent.but hv a shortage of # Lacs.there is no source to arng dis amt. # lacs..thats my prob,wnt buy BMW car urgently .. vry urgent. hv shortage # lacs. source arng dis amt . # lac .. prob
1,2,Dunno lei shd b driving lor cos i go sch 1 hr oni.,Dunno lei shd drive lor cos sch digit hr oni .
2,3,Dun need to use dial up juz open da browser n surf...,Dun need use dial juz open da browser surf ...


# Word representation and classification

In the following, i will use 2 models:

In [17]:
X_train, X_test, y_train, y_test = train_test_split(df_train['preprocessed_text'], 
                                                    df_train['label'], 
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=42)

In [18]:
# Model 1: CountVectorizer + LogisticRegression

pipe = Pipeline([("countvectorizer", CountVectorizer(max_features=1500, min_df=1, max_df=.3)),
         ("logisticregression", LogisticRegression())
        ])

pipe.fit(X_train, y_train)

y_train_pred, y_test_pred = pipe.predict(X_train), pipe.predict(X_test) 

print(f"Train:\n{metrics.classification_report(y_train, y_train)}")

print(f"Test:\n{metrics.classification_report(y_test, y_test_pred)}")

# Testing model performance on the real TestSet
y_true_test_pred = pipe.predict(df_test.preprocessed_text) 
df_test["y_pred_model1"] = y_true_test_pred

Train:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       437
           1       1.00      1.00      1.00      2763

    accuracy                           1.00      3200
   macro avg       1.00      1.00      1.00      3200
weighted avg       1.00      1.00      1.00      3200

Test:
              precision    recall  f1-score   support

           0       0.95      0.84      0.89       103
           1       0.98      0.99      0.99       697

    accuracy                           0.97       800
   macro avg       0.96      0.92      0.94       800
weighted avg       0.97      0.97      0.97       800



In [19]:
# Model 2: tfidfvectorizer + clf-svm

pipe = Pipeline([("tfidfvectorizer", TfidfVectorizer(max_df=.99, min_df=2)),
         ('clf-svm', SGDClassifier(loss='hinge', penalty='l1', alpha=1e-4, 
                                   max_iter=5000, random_state=42))
        ])

pipe.fit(X_train, y_train)

y_train_pred, y_test_pred = pipe.predict(X_train), pipe.predict(X_test) 


print(f"Train:\n{metrics.classification_report(y_train, y_train)}")

print(f"Test:\n{metrics.classification_report(y_test, y_test_pred)}")

# Testing model performance on the real TestSet
y_true_test_pred = pipe.predict(df_test.preprocessed_text) 
df_test["y_pred_model2"] = y_true_test_pred


Train:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       437
           1       1.00      1.00      1.00      2763

    accuracy                           1.00      3200
   macro avg       1.00      1.00      1.00      3200
weighted avg       1.00      1.00      1.00      3200

Test:
              precision    recall  f1-score   support

           0       0.93      0.86      0.89       103
           1       0.98      0.99      0.99       697

    accuracy                           0.97       800
   macro avg       0.95      0.93      0.94       800
weighted avg       0.97      0.97      0.97       800



In [20]:
# Model 3: tfidfvectorizer + clf-svm

pipe = Pipeline([("countvectorizer", CountVectorizer(max_features=1500, min_df=1, max_df=.3)),
         ('clf-svm', SGDClassifier(loss='hinge', penalty='l1', alpha=1e-4, 
                                   max_iter=5000, random_state=42))
        ])

pipe.fit(X_train, y_train)

y_train_pred, y_test_pred = pipe.predict(X_train), pipe.predict(X_test) 


print(f"Train:\n{metrics.classification_report(y_train, y_train)}")

print(f"Test:\n{metrics.classification_report(y_test, y_test_pred)}")

# Testing model performance on the real TestSet
y_true_test_pred = pipe.predict(df_test.preprocessed_text) 
df_test["y_pred_model3"] = y_true_test_pred


Train:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       437
           1       1.00      1.00      1.00      2763

    accuracy                           1.00      3200
   macro avg       1.00      1.00      1.00      3200
weighted avg       1.00      1.00      1.00      3200

Test:
              precision    recall  f1-score   support

           0       0.88      0.95      0.91       103
           1       0.99      0.98      0.99       697

    accuracy                           0.98       800
   macro avg       0.93      0.97      0.95       800
weighted avg       0.98      0.98      0.98       800



In [21]:
# Model 4: tfidfvectorizer + clf-svm

pipe = Pipeline([("tfidfvectorizer", TfidfVectorizer()),
         ('clf-svm', SGDClassifier(loss='hinge', penalty='l1', alpha=1e-4, 
                                   max_iter=5000, random_state=42))
        ])

pipe.fit(X_train, y_train)

y_train_pred, y_test_pred = pipe.predict(X_train), pipe.predict(X_test) 


print(f"Train:\n{metrics.classification_report(y_train, y_train)}")

print(f"Test:\n{metrics.classification_report(y_test, y_test_pred)}")

# Testing model performance on the real TestSet
y_true_test_pred = pipe.predict(df_test.preprocessed_text) 
df_test["y_pred_model4"] = y_true_test_pred


Train:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       437
           1       1.00      1.00      1.00      2763

    accuracy                           1.00      3200
   macro avg       1.00      1.00      1.00      3200
weighted avg       1.00      1.00      1.00      3200

Test:
              precision    recall  f1-score   support

           0       0.92      0.87      0.90       103
           1       0.98      0.99      0.98       697

    accuracy                           0.97       800
   macro avg       0.95      0.93      0.94       800
weighted avg       0.97      0.97      0.97       800



In [22]:
# Model 5: CountVectorizer, TfidfTransformer, clf-forest 

pipe = Pipeline([("CountVectorizer", CountVectorizer(max_features=1500, min_df=1, max_df=.3)),
                 ("TfidfTransformer", TfidfTransformer()), 
                 ('clf-forest', RandomForestClassifier(300))])

pipe.fit(X_train, y_train)

y_train_pred, y_test_pred = pipe.predict(X_train), pipe.predict(X_test) 

print(f"Train:\n{metrics.classification_report(y_train, y_train)}")

print(f"Test:\n{metrics.classification_report(y_test, y_test_pred)}")

# Testing model performance on the real TestSet
y_true_test_pred = pipe.predict(df_test.preprocessed_text) 
df_test["y_pred_model5"] = y_true_test_pred


Train:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       437
           1       1.00      1.00      1.00      2763

    accuracy                           1.00      3200
   macro avg       1.00      1.00      1.00      3200
weighted avg       1.00      1.00      1.00      3200

Test:
              precision    recall  f1-score   support

           0       0.99      0.85      0.92       103
           1       0.98      1.00      0.99       697

    accuracy                           0.98       800
   macro avg       0.98      0.93      0.95       800
weighted avg       0.98      0.98      0.98       800



In [23]:
for column in sorted(df_test.columns[3:]):
    print(f"{column}:")
    print(f"1: {np.sum([df_test[column] == 1])}")
    print(f"0: {np.sum([df_test[column] == 0])}\n")

y_pred_model1:
1: 1382
0: 177

y_pred_model2:
1: 1359
0: 200

y_pred_model3:
1: 1353
0: 206

y_pred_model4:
1: 1357
0: 202

y_pred_model5:
1: 1381
0: 178



In [24]:
df_test[(df_test["y_pred_model1"] == 1) 
        & ((df_test["y_pred_model3"] == 0) 
        | (df_test["y_pred_model3"] == 0)
        | (df_test["y_pred_model5"] == 0)
        | (df_test["y_pred_model4"] == 0))]


Unnamed: 0,id,text,preprocessed_text,y_pred_model1,y_pred_model2,y_pred_model3,y_pred_model4,y_pred_model5
24,25,.Please charge my mobile when you get up in morning.,.Please charge mobile morning .,1,1,0,1,1
42,43,You will be receiving this week's Triple Echo ringtone shortly. Enjoy it!,receive week 's Triple Echo ringtone shortly . enjoy !,1,0,0,0,1
85,86,Your credits have been topped up for http://www.bubbletext.com Your renewal Pin is tgxxrz,credit http://www.bubbletext.com renewal pin tgxxrz,1,0,0,0,0
95,96,Please reserve ticket on saturday eve from chennai to thirunelvali and again from tirunelvali to chennai on sunday eve...i already see in net..no ticket available..i want to book ticket through tackle ..,reserve ticket saturday eve chennai thirunelvali tirunelvali chennai sunday eve ... net .. ticket available .. want book ticket tackle ..,1,1,0,1,1
96,97,Or just do that 6times,digit time,1,0,1,0,1
105,106,Wewa is 130. Iriver 255. All 128 mb.,Wewa digit . Iriver digit . digit mb .,1,0,0,0,1
119,120,Are you free now?can i call now?,free ? I ?,1,0,1,0,1
135,136,You have 1 new message. Please call 08712400200.,digit new message . digit .,1,0,0,0,0
221,222,http//tms. widelive.com/index. wml?id=820554ad0a1705572711&first=true¡C C Ringtone¡,http//tms . widelive.com/index . wml?id = digit addigit adigit & = true¡C Ringtone ¡,1,0,0,0,0
240,241,Oh yes I can speak txt 2 u no! Hmm. Did u get email?,oh yes speak txt digit ! hmm . email ?,1,0,1,0,1


In [25]:
import requests

dataString = ""



for loop, row in enumerate(df_test[["id", "y_pred_model2"]].iterrows()):
    
    if loop == 0:
        dataString = str(row[1][0]) + ',' + \
                     str(row[1][1])
    else:
        dataString = dataString + "\n" + str(row[1][0]) + ',' + \
                     str(row[1][1])


In [26]:
dataString

'1,1\n2,1\n3,1\n4,1\n5,1\n6,1\n7,1\n8,1\n9,1\n10,1\n11,1\n12,1\n13,1\n14,1\n15,0\n16,1\n17,0\n18,1\n19,1\n20,0\n21,1\n22,1\n23,1\n24,1\n25,1\n26,0\n27,1\n28,0\n29,1\n30,0\n31,1\n32,1\n33,1\n34,1\n35,1\n36,1\n37,1\n38,1\n39,1\n40,1\n41,1\n42,1\n43,0\n44,1\n45,1\n46,1\n47,0\n48,1\n49,0\n50,1\n51,1\n52,1\n53,0\n54,1\n55,1\n56,0\n57,1\n58,1\n59,1\n60,1\n61,1\n62,1\n63,1\n64,1\n65,1\n66,1\n67,1\n68,1\n69,1\n70,1\n71,0\n72,1\n73,1\n74,0\n75,1\n76,1\n77,1\n78,0\n79,1\n80,1\n81,1\n82,1\n83,1\n84,1\n85,0\n86,0\n87,1\n88,1\n89,1\n90,1\n91,1\n92,1\n93,0\n94,0\n95,1\n96,1\n97,0\n98,1\n99,1\n100,1\n101,1\n102,1\n103,1\n104,1\n105,1\n106,0\n107,1\n108,1\n109,1\n110,1\n111,1\n112,1\n113,0\n114,0\n115,1\n116,1\n117,1\n118,1\n119,1\n120,0\n121,1\n122,1\n123,1\n124,1\n125,1\n126,1\n127,1\n128,1\n129,1\n130,1\n131,0\n132,1\n133,1\n134,1\n135,1\n136,0\n137,1\n138,1\n139,1\n140,1\n141,1\n142,1\n143,1\n144,1\n145,0\n146,1\n147,1\n148,1\n149,1\n150,1\n151,1\n152,1\n153,1\n154,1\n155,1\n156,1\n157,1\n158,1\n1

In [27]:
postData = {}
postData['challengeName'] = 'spamdetection'
postData['userID'] = '845698'
postData['challengeType'] = 'binaryclassification'
postData['submissionsData'] = dataString

url = 'https://8n46gxwibi.execute-api.us-east-2.amazonaws.com/default/computeModelScore'
x = requests.post(url,json=postData)

print(x.text)


{"message":"Internal Server Error"}


In [28]:
with open("./Data_NLP/spamdetection_results.txt", "w") as text_file:
    text_file.write(dataString)