In \[1\]:

    from os import walk
    from os.path import join
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split
    import sys

In \[2\]:

    sys.getfilesystemencoding()

Out\[2\]:

    'utf-8'

## Extract all the mails<a href="#Extract-all-the-mails" class="anchor-link">¶</a>

In \[3\]:

    # Corpus directories, given relative to the notebook's working directory.
    SPAM_1_PATH = 'SpamData/01_Processing/spam_assassin_corpus/spam_1'
    SPAM_2_PATH = 'SpamData/01_Processing/spam_assassin_corpus/spam_2'
    EASY_NONSPAM_1_PATH = 'SpamData/01_Processing/spam_assassin_corpus/easy_ham_1'
    EASY_NONSPAM_2_PATH = 'SpamData/01_Processing/spam_assassin_corpus/easy_ham_2'
    # Values stored in the CATEGORY column: 0 for ham (non-spam), 1 for spam.
    HAM_CAT = 0
    SPAM_CAT = 1

# FUNCTION TO EXTRACT THE BODY AND CREATE A DF<a href="#FUNCTION-TO-EXTRACT-THE-BODY-AND-CREATE-A-DF" class="anchor-link">¶</a>

In \[4\]:

    def email_body_generator(path):
        """Yield ``(file_name, email_body)`` for every file found under ``path``.

        The directory tree is walked recursively. For each file, everything up
        to and including the first blank line is treated as the header and
        skipped; the remaining lines form the body.

        Parameters
        ----------
        path : str
            Root directory to walk.

        Yields
        ------
        tuple of (str, str)
            The bare file name (no directory part) and the body text. The body
            is an empty string when the file contains no blank line or nothing
            follows it.
        """
        for root, _dirnames, filenames in walk(path):
            for file_name in filenames:
                filepath = join(root, file_name)

                body_lines = []
                is_body = False

                # The context manager closes the file even if reading raises.
                # latin-1 maps every byte to a character, so decoding cannot fail.
                with open(filepath, encoding='latin-1') as stream:
                    for line in stream:
                        if is_body:
                            body_lines.append(line)
                        elif line == '\n':
                            # First blank line: the header ends here.
                            is_body = True

                # Each collected line still ends with its own '\n', so joining
                # with '\n' doubles the line breaks. This is kept as-is because
                # the stored messages downstream were produced this way.
                email_body = '\n'.join(body_lines)

                yield file_name, email_body
                

    def df_from_directory(path, classification):
        """Load every email under ``path`` into a DataFrame with a fixed label.

        Parameters
        ----------
        path : str
            Directory handed to ``email_body_generator``.
        classification : int
            Value written to the CATEGORY column of every row.

        Returns
        -------
        pandas.DataFrame
            Columns MESSAGE (body text) and CATEGORY, indexed by file name.
        """
        file_names = []
        records = []

        for name, body in email_body_generator(path):
            records.append({'MESSAGE': body, 'CATEGORY': classification})
            file_names.append(name)

        return pd.DataFrame(records, index=file_names)

In \[5\]:

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
    # to stack frames. SPAM_CAT replaces the bare literal 1 so both classes use
    # the named category constants.
    spam_emails = pd.concat([
        df_from_directory(SPAM_1_PATH, SPAM_CAT),
        df_from_directory(SPAM_2_PATH, SPAM_CAT),
    ])
    ham_emails = pd.concat([
        df_from_directory(EASY_NONSPAM_1_PATH, HAM_CAT),
        df_from_directory(EASY_NONSPAM_2_PATH, HAM_CAT),
    ])
    data = pd.concat([spam_emails, ham_emails])
    data.shape

Out\[5\]:

    (5800, 2)

In \[6\]:

    data[data.MESSAGE.str.len() == 0].index

Out\[6\]:

    Index(['cmds', 'cmds', 'cmds', '.yestee_antony.txt'], dtype='object')

In \[7\]:

    # Drop the rows whose body is empty. The labels are derived from the data
    # rather than hard-coded, so the cell keeps working if the corpus changes.
    # As before, rows are removed by index label (file name).
    empty_message_labels = data.index[data.MESSAGE.str.len() == 0].unique()
    data.drop(empty_message_labels, inplace = True)

In \[8\]:

    data

Out\[8\]:

|                                        | MESSAGE                                                                                         | CATEGORY |
|----------------------------------------|-------------------------------------------------------------------------------------------------|----------|
| 00001.7848dde101aa985090474a91ec93fcf0 | \<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Tr...                                              | 1        |
| 00002.d94f1b97e48ed3b553b3508d116e6a09 | 1\) Fight The Risk of Cancer!\\n\\nhttp://www.adc...                                            | 1        |
| 00003.2ee33bc6eacdb11f38d052c44819ba6c | 1\) Fight The Risk of Cancer!\\n\\nhttp://www.adc...                                            | 1        |
| 00004.eac8de8d759b7e74154f142194282724 | \#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#... | 1        |
| 00005.57696a39d7d84318ce497886896bf90d | I thought you might like these:\\n\\n1) Slim Dow...                                             | 1        |
| ...                                    | ...                                                                                             | ...      |
| 01396.61983fbe6ec43f55fd44e30fce24ffa6 | http://news.bbc.co.uk/1/hi/england/2515127.stm...                                               | 0        |
| 01397.9f9ef4c2a8dc012d80f2ce2d3473d3b7 | \> \>-- be careful when using this one.) Also, t...                                             | 0        |
| 01398.169b51731fe569f42169ae8f948ec676 | \>\>\>\>\> "SM" == Skip Montanaro \<skip@pobox.com\> ...                                        | 0        |
| 01399.ca6b00b7b341bbde9a9ea3dd6a7bf896 | So then, "Mark Hammond" \<mhammond@skippinet.co...                                              | 0        |
| 01400.f897f0931e461e7b2e964d28e927c35e | Hi there,\\n\\n\\n\\nNow this is probably of no us...                                           | 0        |

5796 rows × 2 columns

In \[9\]:

    # Number the remaining emails 0..N-1; DOC_ID is used as the index in a later cell.
    document_ids = range(0, len(data.index))
    data['DOC_ID'] =document_ids
    data

Out\[9\]:

|                                        | MESSAGE                                                                                         | CATEGORY | DOC_ID |
|----------------------------------------|-------------------------------------------------------------------------------------------------|----------|--------|
| 00001.7848dde101aa985090474a91ec93fcf0 | \<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Tr...                                              | 1        | 0      |
| 00002.d94f1b97e48ed3b553b3508d116e6a09 | 1\) Fight The Risk of Cancer!\\n\\nhttp://www.adc...                                            | 1        | 1      |
| 00003.2ee33bc6eacdb11f38d052c44819ba6c | 1\) Fight The Risk of Cancer!\\n\\nhttp://www.adc...                                            | 1        | 2      |
| 00004.eac8de8d759b7e74154f142194282724 | \#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#... | 1        | 3      |
| 00005.57696a39d7d84318ce497886896bf90d | I thought you might like these:\\n\\n1) Slim Dow...                                             | 1        | 4      |
| ...                                    | ...                                                                                             | ...      | ...    |
| 01396.61983fbe6ec43f55fd44e30fce24ffa6 | http://news.bbc.co.uk/1/hi/england/2515127.stm...                                               | 0        | 5791   |
| 01397.9f9ef4c2a8dc012d80f2ce2d3473d3b7 | \> \>-- be careful when using this one.) Also, t...                                             | 0        | 5792   |
| 01398.169b51731fe569f42169ae8f948ec676 | \>\>\>\>\> "SM" == Skip Montanaro \<skip@pobox.com\> ...                                        | 0        | 5793   |
| 01399.ca6b00b7b341bbde9a9ea3dd6a7bf896 | So then, "Mark Hammond" \<mhammond@skippinet.co...                                              | 0        | 5794   |
| 01400.f897f0931e461e7b2e964d28e927c35e | Hi there,\\n\\n\\n\\nNow this is probably of no us...                                           | 0        | 5795   |

5796 rows × 3 columns

In \[10\]:

    data['FILE_NAME'] = data.index
    data

Out\[10\]:

|                                        | MESSAGE                                                                                         | CATEGORY | DOC_ID | FILE_NAME                              |
|----------------------------------------|-------------------------------------------------------------------------------------------------|----------|--------|----------------------------------------|
| 00001.7848dde101aa985090474a91ec93fcf0 | \<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Tr...                                              | 1        | 0      | 00001.7848dde101aa985090474a91ec93fcf0 |
| 00002.d94f1b97e48ed3b553b3508d116e6a09 | 1\) Fight The Risk of Cancer!\\n\\nhttp://www.adc...                                            | 1        | 1      | 00002.d94f1b97e48ed3b553b3508d116e6a09 |
| 00003.2ee33bc6eacdb11f38d052c44819ba6c | 1\) Fight The Risk of Cancer!\\n\\nhttp://www.adc...                                            | 1        | 2      | 00003.2ee33bc6eacdb11f38d052c44819ba6c |
| 00004.eac8de8d759b7e74154f142194282724 | \#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#... | 1        | 3      | 00004.eac8de8d759b7e74154f142194282724 |
| 00005.57696a39d7d84318ce497886896bf90d | I thought you might like these:\\n\\n1) Slim Dow...                                             | 1        | 4      | 00005.57696a39d7d84318ce497886896bf90d |
| ...                                    | ...                                                                                             | ...      | ...    | ...                                    |
| 01396.61983fbe6ec43f55fd44e30fce24ffa6 | http://news.bbc.co.uk/1/hi/england/2515127.stm...                                               | 0        | 5791   | 01396.61983fbe6ec43f55fd44e30fce24ffa6 |
| 01397.9f9ef4c2a8dc012d80f2ce2d3473d3b7 | \> \>-- be careful when using this one.) Also, t...                                             | 0        | 5792   | 01397.9f9ef4c2a8dc012d80f2ce2d3473d3b7 |
| 01398.169b51731fe569f42169ae8f948ec676 | \>\>\>\>\> "SM" == Skip Montanaro \<skip@pobox.com\> ...                                        | 0        | 5793   | 01398.169b51731fe569f42169ae8f948ec676 |
| 01399.ca6b00b7b341bbde9a9ea3dd6a7bf896 | So then, "Mark Hammond" \<mhammond@skippinet.co...                                              | 0        | 5794   | 01399.ca6b00b7b341bbde9a9ea3dd6a7bf896 |
| 01400.f897f0931e461e7b2e964d28e927c35e | Hi there,\\n\\n\\n\\nNow this is probably of no us...                                           | 0        | 5795   | 01400.f897f0931e461e7b2e964d28e927c35e |

5796 rows × 4 columns

In \[11\]:

    data.set_index('DOC_ID', inplace = True)
    data

Out\[11\]:

|        | MESSAGE                                                                                         | CATEGORY | FILE_NAME                              |
|--------|-------------------------------------------------------------------------------------------------|----------|----------------------------------------|
| DOC_ID |                                                                                                 |          |                                        |
| 0      | \<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Tr...                                              | 1        | 00001.7848dde101aa985090474a91ec93fcf0 |
| 1      | 1\) Fight The Risk of Cancer!\\n\\nhttp://www.adc...                                            | 1        | 00002.d94f1b97e48ed3b553b3508d116e6a09 |
| 2      | 1\) Fight The Risk of Cancer!\\n\\nhttp://www.adc...                                            | 1        | 00003.2ee33bc6eacdb11f38d052c44819ba6c |
| 3      | \#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#... | 1        | 00004.eac8de8d759b7e74154f142194282724 |
| 4      | I thought you might like these:\\n\\n1) Slim Dow...                                             | 1        | 00005.57696a39d7d84318ce497886896bf90d |
| ...    | ...                                                                                             | ...      | ...                                    |
| 5791   | http://news.bbc.co.uk/1/hi/england/2515127.stm...                                               | 0        | 01396.61983fbe6ec43f55fd44e30fce24ffa6 |
| 5792   | \> \>-- be careful when using this one.) Also, t...                                             | 0        | 01397.9f9ef4c2a8dc012d80f2ce2d3473d3b7 |
| 5793   | \>\>\>\>\> "SM" == Skip Montanaro \<skip@pobox.com\> ...                                        | 0        | 01398.169b51731fe569f42169ae8f948ec676 |
| 5794   | So then, "Mark Hammond" \<mhammond@skippinet.co...                                              | 0        | 01399.ca6b00b7b341bbde9a9ea3dd6a7bf896 |
| 5795   | Hi there,\\n\\n\\n\\nNow this is probably of no us...                                           | 0        | 01400.f897f0931e461e7b2e964d28e927c35e |

5796 rows × 3 columns

In \[12\]:

    DATA_JSON_FILE = 'SpamData/01_Processing/email-text-data.json'

In \[13\]:

    data.to_json(DATA_JSON_FILE)

# Natural Language Processing: Text Preprocessing<a href="#Natural-Language-Processing:-Text-Preprocessing" class="anchor-link">¶</a>

In \[14\]:

    import nltk
    from nltk.stem import PorterStemmer
    from nltk.stem import SnowballStemmer
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from bs4 import BeautifulSoup

In \[15\]:

    nltk.download('punkt')
    nltk.download('stopwords')

    [nltk_data] Downloading package punkt to
    [nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
    [nltk_data]   Package punkt is already up-to-date!
    [nltk_data] Downloading package stopwords to
    [nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
    [nltk_data]   Package stopwords is already up-to-date!

Out\[15\]:

    True

In \[16\]:

    stop_words = set(stopwords.words('english'))

In \[17\]:

    def clean_msg_no_html(message, stemmer=PorterStemmer(),
                          stop_words=set(stopwords.words('english'))):
        """Turn a raw email body into a list of stemmed word tokens.

        The message is parsed with BeautifulSoup to strip HTML markup,
        lower-cased and tokenized. Tokens that are stop words or are not purely
        alphabetic are discarded; the rest are stemmed.

        Parameters
        ----------
        message : str
            Email body, possibly containing HTML.
        stemmer : object with a ``stem(word)`` method
            Defaults to a PorterStemmer created once, at definition time.
        stop_words : set of str
            Words to drop; defaults to NLTK's English stop words, also built
            once at definition time.

        Returns
        -------
        list of str
            Stemmed tokens in their original order.
        """
        plain_text = BeautifulSoup(message, 'html.parser').get_text()
        tokens = word_tokenize(plain_text.lower())

        return [
            stemmer.stem(token)
            for token in tokens
            if token not in stop_words and token.isalpha()
        ]
        
        

In \[18\]:

    # One list of stemmed tokens per email (a Series indexed by DOC_ID).
    stemmed_nested_list = data.MESSAGE.apply(clean_msg_no_html)

    # Concatenate all per-email token lists into a single flat list.
    flat_stemmed_list = []
    for stems in stemmed_nested_list:
        flat_stemmed_list.extend(stems)

    C:\Users\hp\anaconda3\lib\site-packages\bs4\__init__.py:389: UserWarning: "http://www.post-gazette.com/columnists/20020905brian5
    " looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
      ' that document to Beautiful Soup.' % decoded_markup

In \[19\]:

    len(flat_stemmed_list)

Out\[19\]:

    761974

In \[20\]:

    unique_words =pd.Series(flat_stemmed_list).value_counts()

In \[21\]:

    unique_words

Out\[21\]:

    http        10662
    use          5017
    list         4852
    email        4370
    get          4187
                ...  
    dmx             1
    postng          1
    gizli           1
    godammit        1
    kaçamak         1
    Length: 27305, dtype: int64

In \[22\]:

    # Vocabulary size: how many of the most frequent stems to keep.
    VOCAB_SIZE = 2500
    # value_counts() sorts by descending count, so the first VOCAB_SIZE entries
    # are the most frequent stems. .iloc makes the positional slice explicit.
    frequent_words = unique_words.iloc[:VOCAB_SIZE]

In \[23\]:

    type(frequent_words)

Out\[23\]:

    pandas.core.series.Series

In \[24\]:

    frequent_words.iloc[:10]

Out\[24\]:

    http     10662
    use       5017
    list      4852
    email     4370
    get       4187
    mail      3985
    one       3905
    free      3171
    time      3090
    work      2880
    dtype: int64

In \[25\]:

    frequent_words.index

Out\[25\]:

    Index(['http', 'use', 'list', 'email', 'get', 'mail', 'one', 'free', 'time',
           'work',
           ...
           'render', 'tobacco', 'decreas', 'freez', 'milter', 'flight', 'fashion',
           'mutual', 'ximian', 'pleasur'],
          dtype='object', length=2500)

In \[26\]:

    vocab = pd.DataFrame({'VOCAB_WORD': frequent_words.index.values})

In \[27\]:

    vocab

Out\[27\]:

|      | VOCAB_WORD |
|------|------------|
| 0    | http       |
| 1    | use        |
| 2    | list       |
| 3    | email      |
| 4    | get        |
| ...  | ...        |
| 2495 | flight     |
| 2496 | fashion    |
| 2497 | mutual     |
| 2498 | ximian     |
| 2499 | pleasur    |

2500 rows × 1 columns

In \[28\]:

    vocab.index.name = 'WORD_ID'

In \[29\]:

    vocab

Out\[29\]:

|         | VOCAB_WORD |
|---------|------------|
| WORD_ID |            |
| 0       | http       |
| 1       | use        |
| 2       | list       |
| 3       | email      |
| 4       | get        |
| ...     | ...        |
| 2495    | flight     |
| 2496    | fashion    |
| 2497    | mutual     |
| 2498    | ximian     |
| 2499    | pleasur    |

2500 rows × 1 columns

In \[30\]:

    WORD_ID_FILE = 'SpamData/01_Processing/word-by-id.csv'

In \[31\]:

    # `header` is documented as a bool or a list of column aliases; a bare
    # string only works because it is truthy. Pass the name as a one-element list.
    vocab.to_csv(WORD_ID_FILE, index_label=vocab.index.name, header=[vocab.VOCAB_WORD.name])

In \[32\]:

    stemmed_nested_list

Out\[32\]:

    DOC_ID
    0       [save, life, insur, spend, life, quot, save, g...
    1       [fight, risk, cancer, http, slim, guarante, lo...
    2       [fight, risk, cancer, http, slim, guarante, lo...
    3       [adult, club, offer, free, membership, instant...
    4       [thought, might, like, slim, guarante, lose, l...
                                  ...                        
    5791    [http, bizarr, collect, stuf, anim, could, fet...
    5792    [care, use, one, also, realli, cute, thing, ja...
    5793    [sm, skip, montanaro, write, jeremi, put, anot...
    5794    [mark, hammond, like, given, zodb, sound, attr...
    5795    [hi, probabl, use, whatsoev, also, problem, re...
    Name: MESSAGE, Length: 5796, dtype: object

In \[33\]:

    # Token count of every cleaned email, in DOC_ID order.
    clean_email_lengths = list(map(len, stemmed_nested_list))
    longest_email_length = max(clean_email_lengths)
    print(' Number of words in the largest mail is : ', longest_email_length)

     Number of words in the largest mail is :  7671

In \[34\]:

    type(stemmed_nested_list)

Out\[34\]:

    pandas.core.series.Series

In \[35\]:

    stemmed_nested_list_after_tolist=stemmed_nested_list.tolist()

In \[36\]:

    type(stemmed_nested_list_after_tolist)

Out\[36\]:

    list

In \[37\]:

    word_columns_df = pd.DataFrame.from_records(stemmed_nested_list_after_tolist)

In \[38\]:

    word_columns_df

Out\[38\]:

|      | 0       | 1       | 2         | 3        | 4          | 5        | 6       | 7       | 8        | 9       | ... | 7661 | 7662 | 7663 | 7664 | 7665 | 7666 | 7667 | 7668 | 7669 | 7670 |
|------|---------|---------|-----------|----------|------------|----------|---------|---------|----------|---------|-----|------|------|------|------|------|------|------|------|------|------|
| 0    | save    | life    | insur     | spend    | life       | quot     | save    | g       | famili   | financi | ... | None | None | None | None | None | None | None | None | None | None |
| 1    | fight   | risk    | cancer    | http     | slim       | guarante | lose    | lb      | day      | http    | ... | None | None | None | None | None | None | None | None | None | None |
| 2    | fight   | risk    | cancer    | http     | slim       | guarante | lose    | lb      | day      | http    | ... | None | None | None | None | None | None | None | None | None | None |
| 3    | adult   | club    | offer     | free     | membership | instant  | access  | site    | user     | name    | ... | None | None | None | None | None | None | None | None | None | None |
| 4    | thought | might   | like      | slim     | guarante   | lose     | lb      | day     | http     | fight   | ... | None | None | None | None | None | None | None | None | None | None |
| ...  | ...     | ...     | ...       | ...      | ...        | ...      | ...     | ...     | ...      | ...     | ... | ...  | ...  | ...  | ...  | ...  | ...  | ...  | ...  | ...  | ...  |
| 5791 | http    | bizarr  | collect   | stuf     | anim       | could    | fetch   | sold    | cornwal  | museum  | ... | None | None | None | None | None | None | None | None | None | None |
| 5792 | care    | use     | one       | also     | realli     | cute     | thing   | japanes | av       | girl    | ... | None | None | None | None | None | None | None | None | None | None |
| 5793 | sm      | skip    | montanaro | write    | jeremi     | put      | anoth   | way     | interest | hear    | ... | None | None | None | None | None | None | None | None | None | None |
| 5794 | mark    | hammond | like      | given    | zodb       | sound    | attract | would   | packag   | hundr   | ... | None | None | None | None | None | None | None | None | None | None |
| 5795 | hi      | probabl | use       | whatsoev | also       | problem  | regard  | nvidia  | two      | machin  | ... | None | None | None | None | None | None | None | None | None | None |

5796 rows × 7671 columns

In \[39\]:

    word_columns_df.shape

Out\[39\]:

    (5796, 7671)

In \[40\]:

    # Hold out 30% of the emails for testing; the fixed seed makes the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        word_columns_df,
        data.CATEGORY,
        test_size=0.3,
        random_state=42,
    )

In \[41\]:

    X_train.head()

Out\[41\]:

|      | 0       | 1      | 2     | 3         | 4           | 5     | 6       | 7            | 8     | 9       | ... | 7661 | 7662 | 7663 | 7664 | 7665 | 7666 | 7667 | 7668 | 7669 | 7670 |
|------|---------|--------|-------|-----------|-------------|-------|---------|--------------|-------|---------|-----|------|------|------|------|------|------|------|------|------|------|
| 4844 | ye      | inde   | agent | directori | verita      | cd    | unix    | subdirectori | file  | call    | ... | None | None | None | None | None | None | None | None | None | None |
| 4727 | problem | come   | tri   | instal    | harddissssk | like  | alreadi | mount        | http  | yahoo   | ... | None | None | None | None | None | None | None | None | None | None |
| 5022 | origin  | messag | date  | mon       | aug         | chad  | norwood | sven         | cc    | subject | ... | None | None | None | None | None | None | None | None | None | None |
| 3504 | inlin   | folk   | sever | major     | internet    | outag | morn    | across       | major | provid  | ... | None | None | None | None | None | None | None | None | None | None |
| 3921 | url     | http   | date  | bath      | chronicl    | None  | None    | None         | None  | None    | ... | None | None | None | None | None | None | None | None | None | None |

5 rows × 7671 columns

In \[42\]:

    y_train.head()

Out\[42\]:

    DOC_ID
    4844    0
    4727    0
    5022    0
    3504    0
    3921    0
    Name: CATEGORY, dtype: int64

In \[43\]:

    X_test.head()

Out\[43\]:

|      | 0        | 1       | 2      | 3        | 4         | 5        | 6         | 7         | 8        | 9            | ... | 7661 | 7662 | 7663 | 7664 | 7665 | 7666 | 7667 | 7668 | 7669 | 7670 |
|------|----------|---------|--------|----------|-----------|----------|-----------|-----------|----------|--------------|-----|------|------|------|------|------|------|------|------|------|------|
| 4675 | interest | alway   | wonder | thing    | bad       | exampl   | goto      | languag   | support  | goto         | ... | None | None | None | None | None | None | None | None | None | None |
| 4220 | url      | http    | date   | final    | gdc       | europ    | review    | confernec | session  | ect          | ... | None | None | None | None | None | None | None | None | None | None |
| 2484 | stephen  | william | mailto | swilliam | weaken    | food     | transact  | argument  | note     | neighborhood | ... | None | None | None | None | None | None | None | None | None | None |
| 2418 | el       | mon     | sep    | bitbitch | wrote     | eugen    | mani      | homo      | friend   | lover        | ... | None | None | None | None | None | None | None | None | None | None |
| 5110 | music    | school  | joke   | american | conductor | european | conductor | talk      | european | conductor    | ... | None | None | None | None | None | None | None | None | None | None |

5 rows × 7671 columns

In \[44\]:

    y_test.head()

Out\[44\]:

    DOC_ID
    4675    0
    4220    0
    2484    0
    2418    0
    5110    0
    Name: CATEGORY, dtype: int64

In \[45\]:

    X_train.index.name = y_train.index.name = 'DOC_ID'

In \[46\]:

    X_train.head()

Out\[46\]:

|        | 0       | 1      | 2     | 3         | 4           | 5     | 6       | 7            | 8     | 9       | ... | 7661 | 7662 | 7663 | 7664 | 7665 | 7666 | 7667 | 7668 | 7669 | 7670 |
|--------|---------|--------|-------|-----------|-------------|-------|---------|--------------|-------|---------|-----|------|------|------|------|------|------|------|------|------|------|
| DOC_ID |         |        |       |           |             |       |         |              |       |         |     |      |      |      |      |      |      |      |      |      |      |
| 4844   | ye      | inde   | agent | directori | verita      | cd    | unix    | subdirectori | file  | call    | ... | None | None | None | None | None | None | None | None | None | None |
| 4727   | problem | come   | tri   | instal    | harddissssk | like  | alreadi | mount        | http  | yahoo   | ... | None | None | None | None | None | None | None | None | None | None |
| 5022   | origin  | messag | date  | mon       | aug         | chad  | norwood | sven         | cc    | subject | ... | None | None | None | None | None | None | None | None | None | None |
| 3504   | inlin   | folk   | sever | major     | internet    | outag | morn    | across       | major | provid  | ... | None | None | None | None | None | None | None | None | None | None |
| 3921   | url     | http   | date  | bath      | chronicl    | None  | None    | None         | None  | None    | ... | None | None | None | None | None | None | None | None | None | None |

5 rows × 7671 columns

In \[47\]:

    y_train.head()

Out\[47\]:

    DOC_ID
    4844    0
    4727    0
    5022    0
    3504    0
    3921    0
    Name: CATEGORY, dtype: int64

In \[48\]:

    X_test.index.name=y_test.index.name='DOC_ID'

In \[49\]:

    X_test.head()

Out\[49\]:

|        | 0        | 1       | 2      | 3        | 4         | 5        | 6         | 7         | 8        | 9            | ... | 7661 | 7662 | 7663 | 7664 | 7665 | 7666 | 7667 | 7668 | 7669 | 7670 |
|--------|----------|---------|--------|----------|-----------|----------|-----------|-----------|----------|--------------|-----|------|------|------|------|------|------|------|------|------|------|
| DOC_ID |          |         |        |          |           |          |           |           |          |              |     |      |      |      |      |      |      |      |      |      |      |
| 4675   | interest | alway   | wonder | thing    | bad       | exampl   | goto      | languag   | support  | goto         | ... | None | None | None | None | None | None | None | None | None | None |
| 4220   | url      | http    | date   | final    | gdc       | europ    | review    | confernec | session  | ect          | ... | None | None | None | None | None | None | None | None | None | None |
| 2484   | stephen  | william | mailto | swilliam | weaken    | food     | transact  | argument  | note     | neighborhood | ... | None | None | None | None | None | None | None | None | None | None |
| 2418   | el       | mon     | sep    | bitbitch | wrote     | eugen    | mani      | homo      | friend   | lover        | ... | None | None | None | None | None | None | None | None | None | None |
| 5110   | music    | school  | joke   | american | conductor | european | conductor | talk      | european | conductor    | ... | None | None | None | None | None | None | None | None | None | None |

5 rows × 7671 columns

In \[50\]:

    y_test.head()

Out\[50\]:

    DOC_ID
    4675    0
    4220    0
    2484    0
    2418    0
    5110    0
    Name: CATEGORY, dtype: int64

In \[51\]:

    vocab

Out\[51\]:

|         | VOCAB_WORD |
|---------|------------|
| WORD_ID |            |
| 0       | http       |
| 1       | use        |
| 2       | list       |
| 3       | email      |
| 4       | get        |
| ...     | ...        |
| 2495    | flight     |
| 2496    | fashion    |
| 2497    | mutual     |
| 2498    | ximian     |
| 2499    | pleasur    |

2500 rows × 1 columns

In \[52\]:

    vocab.at[0, 'VOCAB_WORD']

Out\[52\]:

    'http'

In \[53\]:

    vocab.at[2499, 'VOCAB_WORD']

Out\[53\]:

    'pleasur'

In \[54\]:

    word_index = pd.Index(vocab.VOCAB_WORD)

In \[55\]:

    word_index

Out\[55\]:

    Index(['http', 'use', 'list', 'email', 'get', 'mail', 'one', 'free', 'time',
           'work',
           ...
           'render', 'tobacco', 'decreas', 'freez', 'milter', 'flight', 'fashion',
           'mutual', 'ximian', 'pleasur'],
          dtype='object', name='VOCAB_WORD', length=2500)

In \[56\]:

    # Create a sparse (long-format) representation of the word occurrences.

In \[57\]:

    def make_sparse_matrix(df, indexed_words, labels):
        """Return one row per occurrence of a vocabulary word in ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            One row per document (the index holds the document id) and one
            token per cell. Cells whose value is not in the vocabulary, such as
            the ``None`` padding, are ignored.
        indexed_words : pandas.Index
            The vocabulary. A word's location in this index, as returned by
            ``get_loc``, becomes its WORD_ID.
        labels : pandas.Series
            Category of each document, looked up with the ids in ``df.index``.

        Returns
        -------
        pandas.DataFrame
            Columns LABEL, DOC_ID, OCCURRENCE (always 1) and WORD_ID, with rows
            in document order and then token order. The columns are present
            even when no vocabulary word was found, so callers can group or
            save the result without special-casing an empty frame.
        """
        columns = ['LABEL', 'DOC_ID', 'OCCURRENCE', 'WORD_ID']

        # Resolve each vocabulary word's id once, instead of calling get_loc
        # for every matching cell of the (documents x tokens) grid.
        word_ids = {word: indexed_words.get_loc(word) for word in set(indexed_words)}

        dict_list = []

        for doc_id, tokens in zip(df.index, df.itertuples(index=False, name=None)):
            category = None
            category_known = False

            for word in tokens:
                if word not in word_ids:
                    continue

                # Look the label up once per document, and only for documents
                # that contain at least one vocabulary word (as before).
                if not category_known:
                    category = labels.at[doc_id]
                    category_known = True

                dict_list.append({'LABEL': category, 'DOC_ID': doc_id,
                                  'OCCURRENCE': 1, 'WORD_ID': word_ids[word]})

        return pd.DataFrame(dict_list, columns=columns)
        
        

In \[58\]:

    sparse_train_df = make_sparse_matrix(X_train, word_index, y_train)
    sparse_train_df

Out\[58\]:

|        | LABEL | DOC_ID | OCCURRENCE | WORD_ID |
|--------|-------|--------|------------|---------|
| 0      | 0     | 4844   | 1          | 266     |
| 1      | 0     | 4844   | 1          | 1265    |
| 2      | 0     | 4844   | 1          | 507     |
| 3      | 0     | 4844   | 1          | 310     |
| 4      | 0     | 4844   | 1          | 254     |
| ...    | ...   | ...    | ...        | ...     |
| 430980 | 1     | 860    | 1          | 47      |
| 430981 | 1     | 860    | 1          | 1438    |
| 430982 | 1     | 860    | 1          | 26      |
| 430983 | 1     | 860    | 1          | 19      |
| 430984 | 1     | 860    | 1          | 126     |

430985 rows × 4 columns

In \[59\]:

    sparse_train_df.shape

Out\[59\]:

    (430985, 4)

In \[60\]:

    # Collapse repeated (document, word) rows into one row whose OCCURRENCE is the count.
    train_grouped = (
        sparse_train_df
        .groupby(['DOC_ID', 'WORD_ID', 'LABEL'])
        .sum()
        .reset_index()
    )
    train_grouped.head()

Out\[60\]:

|     | DOC_ID | WORD_ID | LABEL | OCCURRENCE |
|-----|--------|---------|-------|------------|
| 0   | 0      | 2       | 1     | 1          |
| 1   | 0      | 3       | 1     | 2          |
| 2   | 0      | 4       | 1     | 1          |
| 3   | 0      | 7       | 1     | 3          |
| 4   | 0      | 11      | 1     | 1          |

In \[61\]:

    train_grouped.shape

Out\[61\]:

    (258367, 4)

In \[ \]:

    sparse_test_df = make_sparse_matrix(X_test, word_index, y_test)

In \[73\]:

    # Collapse repeated (document, word) rows into one row whose OCCURRENCE is the count.
    test_grouped = (
        sparse_test_df
        .groupby(['DOC_ID', 'WORD_ID', 'LABEL'])
        .sum()
        .reset_index()
    )
    test_grouped.head()

Out\[73\]:

|     | DOC_ID | WORD_ID | LABEL | OCCURRENCE |
|-----|--------|---------|-------|------------|
| 0   | 8      | 2       | 1     | 1          |
| 1   | 8      | 3       | 1     | 4          |
| 2   | 8      | 4       | 1     | 2          |
| 3   | 8      | 5       | 1     | 1          |
| 4   | 8      | 6       | 1     | 2          |

In \[74\]:

    test_grouped.shape

Out\[74\]:

    (117635, 4)

In \[75\]:

    # Save the grouped training rows as integers, one row per line.
    # Column order follows train_grouped: DOC_ID, WORD_ID, LABEL, OCCURRENCE.
    TRAINING_DATA_FILE = 'SpamData/02_Training/train-data.txt'
    np.savetxt(TRAINING_DATA_FILE, train_grouped, fmt='%d')

In \[76\]:

    # Save the grouped test rows as integers, one row per line.
    # Column order follows test_grouped: DOC_ID, WORD_ID, LABEL, OCCURRENCE.
    TEST_DATA_FILE = 'SpamData/02_Training/test-data.txt'
    np.savetxt(TEST_DATA_FILE, test_grouped, fmt='%d')

In \[78\]:

    len(set(train_grouped.DOC_ID))

Out\[78\]:

    4015

In \[79\]:

    len(X_train)

Out\[79\]:

    4057

In \[80\]:

    len(X_train)-len(set(train_grouped.DOC_ID))

Out\[80\]:

    42

In \[81\]:

    len(X_test)-len(set(test_grouped.DOC_ID))

Out\[81\]:

    15

In \[ \]:

    # Which emails did not make it into the grouped data? Only the test set is checked here.

In \[83\]:

    # Index labels present in the test split but absent from the grouped DOC_IDs.
    set(X_test.index.values).difference(test_grouped.DOC_ID)

Out\[83\]:

    {134, 179, 240, 274, 298, 339, 439, 471, 670, 734, 765, 945, 1544, 1670, 1700}

In \[ \]:

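    # Why did the e-mails above not make it into the grouped data?
    # Example below: message 179 has a base64-encoded body, and cleaning it returns an empty list.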

In \[86\]:

    # Show the raw body stored under index label 179.
    data.loc[179, 'MESSAGE']

Out\[86\]:

    '------=_NextPart_000_00A5_78C83A6B.A1543A16\n\nContent-Type: text/html; charset="iso-8859-1"\n\nContent-Transfer-Encoding: base64\n\n\n\n\n\nMzU0NHRqVFIwLTM5NUZLa20zNjkyYUN6QjUtNDQ5cGVsMzANCjxodG1sPjxi\n\nb2R5IGxpbms9I0ZGRkYwMCB2bGluaz0jRkZGRjAwIGFsaW5rPSNGRkZGMDAg\n\ndGV4dD0jRkZGRjAwIGJnY29sb3I9IzAwMDAwMD4gPHRhYmxlIGJvcmRlckNv\n\nbG9yPSMwMDAwMDAgd2lkdGg9NjAwIGFsaWduPWNlbnRlciBiZ0NvbG9yPSM2\n\nNjk5MzMgYm9yZGVyPTM+PHRyPjx0ZCBhbGlnbj1taWRkbGU+PGZvbnQgZmFj\n\nZT1BcmlhbCxIZWx2ZXRpY2Esc2Fucy1zZXJpZj48YnI+IDxmb250IGNvbG9y\n\nPXdoaXRlIHNpemU9KzM+PGEgaHJlZj1odHRwOi8vd3d3Lnh4eG1hdGNoLm5l\n\ndC93ZXRiaXRzL2luZGV4Lmh0bT5XZXRiaXRzPC9hPjwvZm9udD48L2ZvbnQ+\n\nPGEgaHJlZj1odHRwOi8vd3d3Lnh4eG1hdGNoLm5ldC93ZXRiaXRzL2luZGV4\n\nLmh0bT4gPC9hPjxwPjxmb250IGZhY2U9QXJpYWwsSGVsdmV0aWNhLHNhbnMt\n\nc2VyaWYgY29sb3I9I2ZmZmYwMCBzaXplPTM+PGI+IEdvbGRlbiBTaG93ZXIg\n\nRXh0cmF2YWdhbnphPC9iPjwvZm9udD48Zm9udCBmYWNlPUFyaWFsLEhlbHZl\n\ndGljYSxzYW5zLXNlcmlmPjxicj4gPGEgaHJlZj1odHRwOi8vd3d3Lnh4eG1h\n\ndGNoLm5ldC93ZXRiaXRzL2luZGV4Lmh0bT4gPGZvbnQgY29sb3I9eWVsbG93\n\nIHNpemU9KzI+RU5URVIhPC9mb250PjwvYT48L3A+IDx0YWJsZSB3aWR0aD01\n\nMDAgYWxpZ249Y2VudGVyPjx0cj48dGQgYWxpZ249bWlkZGxlPjxmb250IGNv\n\nbG9yPXdoaXRlIHNpemU9KzM+VW5iZWxpZXZhYmxlIFNleCBBY3RzITwvZm9u\n\ndD48YnI+IDxhIGhyZWY9aHR0cDovL3d3dy54eHhtYXRjaC5uZXQvd2V0Yml0\n\ncy9pbmRleC5odG0+IDxmb250IGNvbG9yPXllbGxvdyBzaXplPSs0PlNoZSBD\n\nYW4gUGVlIEFsbCBEYXk8L2ZvbnQ+PC9hPjxicj4gPGZvbnQgY29sb3I9Ymxh\n\nY2sgc2l6ZT0rMj5Db2NrIGNyYXZpbmcgdGVlbnMgc3F1aXJtIGFyb3VuZCBj\n\nb3ZlcmVkIGluIHBlZSB3aXRoIGEgdGhyb2JiaW5nIGNvY2sgYnVyaWVkIHRv\n\nIHRoZSBoaWx0LiBTbGlwcGVyeSBhbmFsIHdob3JlcyBnZXR0aW5nIHdldCE8\n\nYnI+IDxmb250IHNpemU9KzMgZWxsb3c+U3ByZWFkICdlbSA8L2ZvbnQ+PC9m\n\nb250PiA8Zm9udCBjb2xvcj15ZWxsb3cgc2l6ZT0rMz5QZWU8L2ZvbnQ+PGZv\n\nbnQgY29sb3I9YmxhY2sgc2l6ZT0rMj48Zm9udCBjb2xvcj15ZWxsb3cgc2l6\n\nZT0rMz5naXJsITwvZm9udD48YnI+IDxhIGhyZWY9aHR0cDovL3d3dy54eHht\n\nYXRjaC5uZXQvd2V0Yml0cy9pbmRleC5odG0+IDxmb250IGNvbG9yPXllbGxv\n\ndyBzaXplPSs0PkVudGVyITwvZm9udD48L2E+PC9mb250PjwvdGQ+PC90cj4g\n
\nPC90YWJsZT4gPHRhYmxlIHdpZHRoPTUwMCBhbGlnbj1jZW50ZXI+PHRyPjx0\n\nZCBhbGlnbj1taWRkbGU+PGZvbnQgY29sb3I9d2hpdGUgc2l6ZT0rMT5TZXgg\n\nY3JhemVkIHBlZWdpcmxzIHJpZGUgY29jayBhbGwgZGF5IHdoaWxlIHBpc3Np\n\nbmcgZm91bnRhaW5zIG9uIHRoZWlyIHBhcnRuZXJzLiBQZWVnaXJscyB0aGF0\n\nIGNhbid0IGdldCBlbm91Z2ggY3VtLCBzdWNraW5nIG9mZiBkaWNrIGFmdGVy\n\nIGRpY2suIEhvcm55IHNsdXRzIHNwcmVhZGluZyB0aGVtIGZvciBldmVyeSBU\n\nb20sIERpY2sgYW5kIENvY2sgdG8gY29tZSBpbiEgPC9mb250PjwvdGQ+PC90\n\ncj4gPC90YWJsZT4gPHRhYmxlIGJvcmRlckNvbG9yPWJsYWNrIGJvcmRlckNv\n\nbG9yRGFyaz1ibGFjayBhbGlnbj1jZW50ZXIgYm9yZGVyQ29sb3JMaWdodD1y\n\nZWQgYm9yZGVyPTI+PHRyPjx0ZCBhbGlnbj1taWRkbGUgd2lkdGg9MjAwPiA8\n\nYSBocmVmPWh0dHA6Ly93d3cueHh4bWF0Y2gubmV0L3dldGJpdHMvaW5kZXgu\n\naHRtPiA8Zm9udCBjb2xvcj15ZWxsb3cgc2l6ZT0rMT5VbHRyYS1IYXJkY29y\n\nZSBBY3Rpb24gPC9mb250PjwvYT48L3RkPjx0ZCBhbGlnbj1taWRkbGUgd2lk\n\ndGg9MjAwPiA8YSBocmVmPWh0dHA6Ly93d3cueHh4bWF0Y2gubmV0L3dldGJp\n\ndHMvaW5kZXguaHRtPiA8Zm9udCBjb2xvcj15ZWxsb3cgc2l6ZT0rMT5UZWVu\n\nYWdlIFBlZWdpcmxzIDwvZm9udD48L2E+PC90ZD48dGQgYWxpZ249bWlkZGxl\n\nIHdpZHRoPTIwMD4gPGEgaHJlZj1odHRwOi8vd3d3Lnh4eG1hdGNoLm5ldC93\n\nZXRiaXRzL2luZGV4Lmh0bT4gPGZvbnQgY29sb3I9eWVsbG93IHNpemU9KzE+\n\nV2lsZCBQZWUgUGFydGllcyA8L2ZvbnQ+PC9hPjwvdGQ+PC90cj48dHI+PHRk\n\nIGFsaWduPW1pZGRsZSB3aWR0aD0yMDA+IDxhIGhyZWY9aHR0cDovL3d3dy54\n\neHhtYXRjaC5uZXQvd2V0Yml0cy9pbmRleC5odG0+IDxmb250IGNvbG9yPXll\n\nbGxvdyBzaXplPSsxPkJpemFycmUgVmlkZW9zIDwvZm9udD48L2E+PC90ZD48\n\ndGQgYWxpZ249bWlkZGxlIHdpZHRoPTIwMD4gPGEgaHJlZj1odHRwOi8vd3d3\n\nLnh4eG1hdGNoLm5ldC93ZXRiaXRzL2luZGV4Lmh0bT4gPGZvbnQgY29sb3I9\n\neWVsbG93IHNpemU9KzE+SWxsZWdhbCBBY3Rpb24gPC9mb250PjwvYT48L3Rk\n\nPjx0ZCBhbGlnbj1taWRkbGUgd2lkdGg9MjAwPiA8YSBocmVmPWh0dHA6Ly93\n\nd3cueHh4bWF0Y2gubmV0L3dldGJpdHMvaW5kZXguaHRtPiA8Zm9udCBjb2xv\n\ncj15ZWxsb3cgc2l6ZT0rMT5IaWRkZW4gUGVlIENhbXMgPC9mb250PjwvYT48\n\nL3RkPjwvdHI+IDwvdGFibGU+PHA+PGJyPiA8YSBocmVmPWh0dHA6Ly93d3cu\n\neHh4bWF0Y2gubmV0L3dldGJpdHMvaW5kZXguaHRtPiA8Zm9udCBjb2xvcj15\n\nZWxsb3cgc2l6ZT0rMz5FbnRlciBXZXRiaXRzPC9mb250PjwvYT48YnI+PGJy\n\nPiAmbmJzcDs8L2
ZvbnQ+PC90ZD48L3RyPiA8L3RhYmxlPjwvYm9keT48L2h0\n\nbWw+DQo1NDE4WlRRSTAtNjAyb0pUbzk3MDR0TXZJMC05MTZ1TktINjQyNEF0\n\nWUM0LTQ4NFVVY3UxMzU4T0VlQjItbDU3\n\n\n'

In \[87\]:

    # Pass the same raw body through the cleaner to see what it returns for it.
    clean_msg_no_html(data.at[179, 'MESSAGE'])

Out\[87\]:

    []

In \[90\]:

    # Index labels present in the training split but absent from the grouped DOC_IDs.
    set(X_train.index.values).difference(train_grouped.DOC_ID)

Out\[90\]:

    {22,
     38,
     73,
     77,
     86,
     91,
     94,
     114,
     127,
     138,
     186,
     193,
     197,
     205,
     206,
     302,
     328,
     335,
     338,
     369,
     402,
     766,
     875,
     878,
     924,
     929,
     939,
     940,
     965,
     983,
     988,
     1035,
     1100,
     1164,
     1170,
     1234,
     1248,
     1313,
     1360,
     1686,
     1846,
     1878}

In \[91\]:

    # Show the raw body stored under index label 22.
    data.loc[22, 'MESSAGE']

Out\[91\]:

    '------=_NextPart_000_00B2_83B03D1E.C6530E24\n\nContent-Type: text/html; charset="iso-8859-1"\n\nContent-Transfer-Encoding: base64\n\n\n\n\n\nPGh0bWw+PGJvZHk+PGRpdiBpZD0ibWVzc2FnZUJvZHkiPjxkaXY+PGZvbnQg\n\nZmFjZT0iQXJpYWwiIHNpemU9IjIiPlRoaXMgbWVzc2FnZSBpcyBzZW50IHRv\n\nIG91ciBzdWJzY3JpYmVycyBvbmx5LiBGdXJ0aGVyIGVtYWlscyB0byB5b3Ug\n\nYnkgdGhlIHNlbmRlciB0aGlzIG9uZSB3aWxsIGJlIHN1c3BlbmRlZCBhdCBu\n\nbyBjb3N0IHRvIHlvdS4gU2NyZWVuaW5nIG9mIGFkZHJlc3NlcyBoYXMgYmVl\n\nbiBkb25lIHRvIHRoZSBiZXN0IG9mIG91ciBhYmlsaXR5LCB1bmZvcnR1bmF0\n\nZWx5IGl0IGlzIGltcG9zc2libGUgdG8gYmUgMTAwJSBhY2N1cmF0ZSwgc28g\n\naWYgeW91IGRpZCBub3QgYXNrIGZvciB0aGlzLCBvciB3aXNoIHRvIGJlIGV4\n\nY2x1ZGVkIG9mIHRoaXMgbGlzdCwgcGxlYXNlIGNsaWNrIDxhIGhyZWY9Im1h\n\naWx0bzpoZWFsdGgxMDVAbWFpbC5ydT9zdWJqZWN0PXJlbW92ZSIgdGFyZ2V0\n\nPSJuZXdfd2luIj5oZXJlPC9hPjwvZm9udD48L2Rpdj4gIDxwPjxiPjxmb250\n\nIGZhY2U9IkFyaWFsIj48Zm9udCBjb2xvcj0iI2ZmMDAwMCI+VEhJUyBJUyBG\n\nT1IgQURVTFQgTUVOIE9OTFkgISBJRiBZT1UgQVJFIE5PVCBBTiBBRFVMVCwg\n\nREVMRVRFIE5PVyAhDQo8cD4NCjxwIGFsaWduPSJjZW50ZXIiPjxpbWcgc3Jj\n\nPSJodHRwOi8vYTIyMDAudHJpcG9kLmNvbS5jby9waG90by5qcGciIHdpZHRo\n\nPSIzNTEiIGhlaWdodD0iMTc5Ij48L3A+DQo8L2ZvbnQ+PC9wPjxkaXY+V2Ug\n\nYXJlIGEgc2VyaW91cyBjb21wYW55LCBvZmZlcmluZyBhIHByb2dyYW0gdGhh\n\ndCB3aWxsIGVuaGFuY2UgeW91ciBzZXggbGlmZSwgYW5kIGVubGFyZ2UgeW91\n\nciBwZW5pcyBpbiBhIHRvdGFsbHkgbmF0dXJhbCB3YXkuIDxwPldlIHJlYWxp\n\nemUgbWFueSBtZW4gLWFuZCB0aGVpciBwYXJ0bmVycy0gYXJlIHVuaGFwcHkg\n\nd2l0aCB0aGVpciBwZW5pcyBzaXplLiBUaGUgdHJ1dGggaXMgdGhhdCBzaXpl\n\nIG1hdHRlcnM7IG5vdCBvbmx5IGl0IGFmZmVjdHMgbWFueSBtZW4ncyBwZXJm\n\nb3JtYW5jZSwgYnV0IHRoZWlyIHNlbGYtZXN0ZWVtIGFzIHdlbGwuPC9wPjxw\n\nPiZuYnNwOzwvZGl2PjxkaXY+UGVuaXMgZW5sYXJnZW1lbnQgSVMgUE9TU0lC\n\nTEU7IGp1c3QgYXMgeW91IGNhbiBleGVyY2lzZSBhbG1vc3QgYW55IHBhcnQg\n\nb2YgDQp5b3VyIGJvZHksIHlvdSBDQU4gZXhlcmNpc2UgeW91ciBwZW5pcy48\n\nL3A+DQo8L2ZvbnQ+PC9kaXY+PGZvbnQgY29sb3I9IiNmZjAwMDAiPjxkaXY+\n\nPGZvbnQgZmFjZT0iQXJpYWwiIGNvbG9yPSIjZmYwMDAwIiBzaXplPSIzIj5P\n\ndXIgcHJvZ3JhbSBpcyB0b3RhbGx5IFBST1ZFTiBhbmQgMTAwJSBHVUFSQU5U\n
\nRUVEICE8L3A+DQo8L2Rpdj48ZGl2Pk91ciBjb21wYW55IGhhcyB0aGUgdGVj\n\naG5pcXVlcyEgVG90YWxseSBOQVRVUkFMIHRlY2huaXF1ZXM7IG5vIGdhZGdl\n\ndHMsIG5vIHB1bXBzLCBubyBzdXJnZXJ5ICE8L2Rpdj48cD5JZiB5b3Ugd2Fu\n\ndCBtb3JlIGluZm9ybWF0aW9uLCBwbGVhc2UgY2xpY2sgPGEgaHJlZj0iaHR0\n\ncDovL2xhcmdlMS50cmlwb2QuY29tLmFyIj5oZXJlPC9hPiwgb3Igc2VuZCB1\n\ncyBhbiBlbWFpbCA8YSBocmVmPSJtYWlsdG86aW5mbzMwMTdAZXhjaXRlLmNv\n\nbSAgICAgICAgP3N1YmplY3Q9bW9yZWluZm8iPmhlcmU8L2E+PC9wPg0KPC9k\n\naXY+PGRpdj5UaGlzIElTIE5PVCBVTlNPTElDSVRFRDsgeW91IGFwcGVhciBp\n\nbiBhbiBzdWJzY3JpcHRpb24gbGlzdCwgaWYgaW4gZXJyb3IsIHBsZWFzZSBs\n\nZXQgdXMga25vdy4gUGxlYXNlIGxldCB0aG9zZSB3aG8gc3VmZmVyIGZyb20g\n\nZXJlY3RpbGUgZHlzZnVuY3Rpb24sIHNtYWxsIHBlbmlzIHNpemUsIGFuZCBv\n\ndGhlciBtYWxlIGFpbG1lbnRzIHJlYWQgdGhpcyBtZXNzYWdlITwvZGl2Pjxw\n\nPkRJU1BPTklCTEUgVEFNQklFTiBFTiBFU1BBTk9MPGZvbnQgY29sb3I9IiNm\n\nZmZmZmYiPjAyMjBqbkhwOS00NzhSTFJkNzczOFNZbVQyLTEzNXd6Z3IyNjc3\n\nUlFKbTQtMTU3cWFFRjk3NzRER2FsNTU=\n\n\n'

In \[92\]:

    # Pass the same raw body through the cleaner to see what it returns for it.
    clean_msg_no_html(data.at[22, 'MESSAGE'])

Out\[92\]:

    []

In \[ \]: