## Import

In [161]:
import pandas as pd
import tensorflow as tf
from fileMaster import *
import plotly.express as px
import plotly.graph_objects as go

## Preliminary cleaning | Clean ill-formated data

In [162]:
with open(raw_data_txt, 'r', encoding='utf-8') as f:
    raw_lines = f.readlines()

df = pd.DataFrame(columns=['Type', 'Msg'])

count = 0

for line in raw_lines:
    # Encode Sensitive Info
    line = line.replace("&lt;#&gt;", "[NUM]").replace(" [NUM] ", "[NUM]")
    line = line.replace("&lt;DECIMAL&gt;", "[DEC]").replace(" [DEC] ", "[DEC]")
    line = line.replace("&lt;URL&gt;", "[URL]").replace(" [URL] ", "[URL]")
    line = line.replace("&lt;TIME&gt;", "[TIME]").replace(" [URL] ", "[URL]")
    line = line.replace("&lt;EMAIL&gt;", "[EMAIL]").replace(" [EMAIL] ", "[EMAIL]")

    # Replace Known Strings
    line = line.replace("&lt;3", "[HEART]").replace(" [HEART] ", "[HEART]")
    line = line.replace("&lt;", "<")
    line = line.replace("&gt;", ">")
    line = line.replace("&amp;", "&")

    # Replace Special Characters
    line = line.replace("\x92", "'")
    line = line.replace("Bill said u or ur rents", 'Bill said u or ur parents')
    line = line.replace("MORROW", "TOMORROW")
    line = line.replace(" ", "")
    line = line.replace("ü", "u")
    line = line.replace("Ü", "U")
    line = line.replace("", "")
    line = line.replace("", " ")

    # Replace Unknown Characters
    line = line.replace("鈥┾??〨ud", "[UNK]")

    _lst = line.split('\t')
    _type = _lst[0]
    _msg = _lst[1].strip()

    d = {'Type': _type, 'Msg': _msg}
    df2 = pd.DataFrame(d, [count])

    df = pd.concat([df, df2], axis=0)

    count += 1

df.to_csv(data_folder / 'cleaned.csv', sep=',', encoding='utf-8', index=False)

In [163]:
df.head()

Unnamed: 0,Type,Msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Verify Tensorflow GPU is Identified

In [164]:
if tf.test.gpu_device_name():

    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

else:

   print("Please install GPU version of TF")


Default GPU Device: /device:GPU:0


## Exploratory Data Analysis
This shows the basic stats for each type

In [165]:
df.groupby("Type").describe()

Unnamed: 0_level_0,Msg,Msg,Msg,Msg
Unnamed: 0_level_1,count,unique,top,freq
Type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4827,4518,"Sorry, I'll call later",30
spam,747,642,Please call our customer service representativ...,4


In [166]:
eda_type_count_df = pd.DataFrame({'Count': [df['Type'].value_counts()['ham'], df['Type'].value_counts()['spam']], 'Type': ['Ham', 'Spam']}, ['Ham', 'Spam'])

In [167]:
eda_type_count_fig = px.bar(eda_type_count_df, x='Type', y='Count', orientation='v', color_discrete_sequence=["#A0e7A0", "#FF8986"], color='Type', title='Count of Msg Types', text_auto='', template='plotly')

In [168]:
eda_type_count_fig.show()

The number of datapoints is skewed heavily in favor of Ham messages. This is unsurprising, given that on a daily basis most people receive more ham than spam. However, this may lead to potential implicit biasness in our models, and hence we will be doing a comparison between a balanced dataset and the original.

In [169]:
df['CharCount'] = df['Msg'].apply(lambda x: len(x))

In [170]:
def tokenization(msg, punc_remove=False):
    punct_lst = ['...', '..', ",", '.', ';', '<', '>', '/', '"', '&', '!', '@', '#', '?', ]

    strng = msg

    for p in punct_lst:

        if ('..'  in strng or '...' in strng) and p == '.':
            pass
        else:
            temp_lst = strng.split(p)
            temp_lst = [x.rstrip() for x in temp_lst]
            temp_lst = [x.lstrip() for x in temp_lst]

            if punc_remove:
                strng = f' '.join(temp_lst)
            else:
                strng = f' {p} '.join(temp_lst)

    word_lst = strng.split(' ')

    if not punc_remove:
        count = 0

        punct_lst.remove('..')
        punct_lst.remove('...')

        for w in word_lst:
            try:
                for p in punct_lst:
                    if  w == 2*p and word_lst[count+1] == p:
                        word_lst[count] = 3*p
                        word_lst.pop(count + 1)
            except IndexError:  # index error occurs when we are at the last element of list and try to access the next element, this means that the ".." is at the last element so we do not to do anything abt it
                pass

            count += 1

    return word_lst

In [171]:
df['ListWords'] = df['Msg'].apply(lambda x: tokenization(x))
df['NumWords'] = df['Msg'].apply(lambda x: len(tokenization(x, punc_remove=True)))

In [172]:
df.head()

Unnamed: 0,Type,Msg,CharCount,ListWords,NumWords
0,ham,"Go until jurong point, crazy.. Available only ...",111,"[Go, until, jurong, point, ,, crazy, .., Avail...",20
1,ham,Ok lar... Joking wif u oni...,29,"[Ok, lar, ..., Joking, wif, u, oni, ...]",6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,"[Free, entry, in, 2, a, wkly, comp, to, win, F...",29
3,ham,U dun say so early hor... U c already then say...,49,"[U, dun, say, so, early, hor, ..., U, c, alrea...",11
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,"[Nah, I, don't, think, he, goes, to, usf, ,, h...",13


In [173]:
def split_sentence(msg):
    punct_lst = ['...', '..', '.', '?', '!', ';']

    strng = msg

    for p in punct_lst:

        if ('..'  in strng or '...' in strng) and p == '.':
            pass
        else:
            temp_lst = strng.split(p)
            temp_lst = [x.rstrip() for x in temp_lst]
            temp_lst = [x.lstrip() for x in temp_lst]
            strng = f'|'.join(temp_lst)

    sentence_lst = strng.split('|')

    sym_lst = ['...', '..', ",", '.', ';', '<', '>', '/', '"', '&', '!', '@', '#', '?', ' ', '']
    for s in sentence_lst:
        if s in sym_lst:
            sentence_lst.remove(s)

    return sentence_lst

In [174]:
df['ListSentence'] = df['Msg'].apply(lambda x: split_sentence(x))
df['NumSentences'] = df['Msg'].apply(lambda x: len(split_sentence(x)))

In [175]:
df.head()

Unnamed: 0,Type,Msg,CharCount,ListWords,NumWords,ListSentence,NumSentences
0,ham,"Go until jurong point, crazy.. Available only ...",111,"[Go, until, jurong, point, ,, crazy, .., Avail...",20,"[Go until jurong point, crazy, Available only ...",3
1,ham,Ok lar... Joking wif u oni...,29,"[Ok, lar, ..., Joking, wif, u, oni, ...]",6,"[Ok lar, Joking wif u oni]",2
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,"[Free, entry, in, 2, a, wkly, comp, to, win, F...",29,[Free entry in 2 a wkly comp to win FA Cup fin...,2
3,ham,U dun say so early hor... U c already then say...,49,"[U, dun, say, so, early, hor, ..., U, c, alrea...",11,"[U dun say so early hor, U c already then say]",2
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,"[Nah, I, don't, think, he, goes, to, usf, ,, h...",13,"[Nah I don't think he goes to usf, he lives ar...",1


In [176]:
df.tail()

Unnamed: 0,Type,Msg,CharCount,ListWords,NumWords,ListSentence,NumSentences
5569,spam,This is the 2nd time we have tried 2 contact u...,160,"[This, is, the, 2nd, time, we, have, tried, 2,...",30,[This is the 2nd time we have tried 2 contact ...,5
5570,ham,Will u b going to esplanade fr home?,36,"[Will, u, b, going, to, esplanade, fr, home, ?, ]",9,[Will u b going to esplanade fr home],1
5571,ham,"Pity, * was in mood for that. So...any other s...",57,"[Pity, ,, *, was, in, mood, for, that., So, .....",12,"[Pity, * was in mood for that, So, any other s...",3
5572,ham,The guy did some bitching but I acted like i'd...,125,"[The, guy, did, some, bitching, but, I, acted,...",26,[The guy did some bitching but I acted like i'...,1
5573,ham,Rofl. Its true to its name,26,"[Rofl, ., Its, true, to, its, name]",6,"[Rofl, Its true to its name]",2


### Box Plot and Histograms of *CharCount*, *NumWords*, *NumSentence*

##### Histograms


In [177]:
eda_charc_hist = px.histogram(df[['CharCount', 'Type']], x='CharCount', color='Type', template='plotly', title='Histogram on Number of Characters', color_discrete_sequence=["#A0e7A0", "#FF8986"])
eda_charc_hist.update_layout(barmode='overlay')

It is interesting note that most **spam** messages have a **higher** character count (i.e. the mode of spam messages lies between 155-159 characters) than ham messages (i.e. mode of ham messages lies between 25~29 characters). Thus, the distribution of ham messages is **positively** skewed, whereas that of spam messages is **negatively** skewed.

In [178]:
eda_charc_hist = px.histogram(df[['NumWords', 'Type']], x='NumWords', color='Type', template='plotly', title='Histogram on Number of Words', color_discrete_sequence=["#A0e7A0", "#FF8986"])
eda_charc_hist.update_layout(barmode='overlay')

It is interesting note that most **spam** messages have a **higher** character (i.e. the mode of spam messages lies at 28 words) than ham messages (i.e. mode of ham messages lies at 6 words). Thus, the distribution of ham messages is **positively** skewed, whereas that of spam messages is **negatively** skewed.

In [179]:
eda_charc_hist = px.histogram(
    df[['NumSentences', 'Type']],
    x='NumSentences',
    color='Type',
    template='plotly',
    title='Histogram on Number of Sentences',
    color_discrete_sequence=["#A0e7A0", "#FF8986"]
)
eda_charc_hist.update_layout(barmode='overlay')

It is interesting to note that the distribution of spam messages visually seem to follow a normal distribution, with the mode of spam messages being at 3, which is significantly higher than that of ham messages (whose mode lies at 1 sentence). It is also unsurprising that ham messages have a positive skew, considering that most people will limit their text to around 1 sentence. Hence, our group postulates that ceteris paribus, the likelihood of a message being spam message is incredibly high should there be three sentences.

##### Box Plot


In [256]:
def genBoxPlot(var):
    box_fig = go.Figure()
    box_fig.layout.title = f'BoxPlots of Spam and Ham for {var}'
    ham_df = df[df['Type']=='ham']
    temp_df = df

    temp_df['Ham'] = ham_df[var]
    spam_df = df[df['Type']=='spam']
    temp_df['Spam'] = spam_df[var]


    for col in temp_df[['Ham', 'Spam']]:
        box_fig.add_trace(
            go.Box(x=temp_df[col].values,
                   name=temp_df[col].name,
                   fillcolor=["#CEF3CE", "#FFEBEB"][['Ham', 'Spam'].index(col)],
                   line={'color': ["#6BDB6B", "#FF8986"][['Ham', 'Spam'].index(col)]}
                   )
        )
    box_fig.update_traces(orientation='h')
    box_fig.show()
    return temp_df


In [258]:
genBoxPlot('CharCount')
genBoxPlot('NumWords')
genBoxPlot('NumSentences')

Unnamed: 0,Type,Msg,CharCount,ListWords,NumWords,ListSentence,NumSentences,Ham,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",111,"[Go, until, jurong, point, ,, crazy, .., Avail...",20,"[Go until jurong point, crazy, Available only ...",3,3.0,
1,ham,Ok lar... Joking wif u oni...,29,"[Ok, lar, ..., Joking, wif, u, oni, ...]",6,"[Ok lar, Joking wif u oni]",2,2.0,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,"[Free, entry, in, 2, a, wkly, comp, to, win, F...",29,[Free entry in 2 a wkly comp to win FA Cup fin...,2,,2.0
3,ham,U dun say so early hor... U c already then say...,49,"[U, dun, say, so, early, hor, ..., U, c, alrea...",11,"[U dun say so early hor, U c already then say]",2,2.0,
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,"[Nah, I, don't, think, he, goes, to, usf, ,, h...",13,"[Nah I don't think he goes to usf, he lives ar...",1,1.0,
...,...,...,...,...,...,...,...,...,...
5569,spam,This is the 2nd time we have tried 2 contact u...,160,"[This, is, the, 2nd, time, we, have, tried, 2,...",30,[This is the 2nd time we have tried 2 contact ...,5,,5.0
5570,ham,Will u b going to esplanade fr home?,36,"[Will, u, b, going, to, esplanade, fr, home, ?, ]",9,[Will u b going to esplanade fr home],1,1.0,
5571,ham,"Pity, * was in mood for that. So...any other s...",57,"[Pity, ,, *, was, in, mood, for, that., So, .....",12,"[Pity, * was in mood for that, So, any other s...",3,3.0,
5572,ham,The guy did some bitching but I acted like i'd...,125,"[The, guy, did, some, bitching, but, I, acted,...",26,[The guy did some bitching but I acted like i'...,1,1.0,


In [265]:
scatter_plot = px.scatter_matrix(df,
                                 dimensions=["CharCount", "NumWords", "NumSentences"],
                                 color = "Type",
                                 template = 'plotly',
                                 color_discrete_map = {'ham': '#6BDB6B', 'spam': '#FF8986'},
                                 title = "Pair Plot of all 3 Features",
                                 )
scatter_plot.update_traces(diagonal_visible=False)
scatter_plot.show()

##### LINK BACK TO R/S with Histogram. Similarity between words and char shape but not sentences

##### Correlation Matrix

In [287]:
ham_corr_df = df[df['Type']=='ham']
ham_corr_df = ham_corr_df[['CharCount', 'NumWords', 'NumSentences']]
fig = px.imshow(ham_corr_df.corr(),
                text_auto=True,
                color_continuous_scale = 'reds',
                template = 'plotly_white',
                title = "Correlation Matrix")
fig.show()
