# Entry ? - N-grams

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly.graph_objects as go
import plotly.express as px

import string
import re
import nltk
import emoji
import itertools

In [2]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, cross_val_predict
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

# Functions

## Filter data, create features

In [3]:
def count_ratios(df, text_col):
    df['char_count'] = df[text_col].str.len()
    df['word_count'] = df[text_col].str.count('\\w+')
    df['ltr_count'] = df[text_col].str.count('[A-Za-z]')
    df['ltr_ratio'] = (df['ltr_count'] / df['char_count']).fillna(0)
    df['spec_char_count'] = df[text_col].apply(lambda x: sum(map(x.count, string.punctuation)))
    df['spec_char_ratio'] = (df['spec_char_count'] / df['char_count']).fillna(0)
    df['num_count'] = df[text_col].str.count('[0-9]+')
    df['num_ratio'] = (df['num_count'] / df['char_count']).fillna(0)
    df['vowel_count'] = df[text_col].str.count('[aeiouyAEIOUY]')
    df['vowel_ratio'] = (df['vowel_count'] / df['char_count']).fillna(0)
    df['caps_count'] = df[text_col].str.count('[A-Z]')
    df['caps_ratio'] = (df['caps_count'] / df['char_count']).fillna(0)
    df['newline_tab_count'] = df[text_col].str.count(r'[\t\r\n]')
    df['newline_tab_ratio'] = (df['newline_tab_count'] / df['char_count']).fillna(0)
    df['qwerty_count'] = df[text_col].str.count('[asdfghjkl]')
    df['qwerty_ratio'] = (df['qwerty_count'] / df['char_count']).fillna(0)
    return df
    
def punct_tokens(df, text_col):
    newline_list = '\t\r\n'
    remove_newline = str.maketrans(' ', ' ', newline_list)
    emoji_string = ''.join(emoji.UNICODE_EMOJI.keys())
    punct_list = string.punctuation + '-‘_”' + emoji_string
    nopunct = str.maketrans('', '', punct_list)
    df['no_punct_tokens'] = df[text_col].fillna("").str.lower().str.translate(remove_newline).str.translate(nopunct).str.split()
    df['distinct_word_count'] = df['no_punct_tokens'].apply(lambda x: len(set(x)))
    df['max_word_len'] = df['no_punct_tokens'].apply(lambda x: max([len(word) for word in x], default=0))
    df['min_word_len'] = df['no_punct_tokens'].apply(lambda x: min([len(word) for word in x], default=0))
    df['word_len_range'] = df['max_word_len'] - df['min_word_len']
    df['word_diversity'] = (df['distinct_word_count'] / df['word_count']).fillna(0)
    df['avg_word_len'] = (df['char_count'] / df['word_count']).fillna(0)
    df['repeat_ltrs'] = df['no_punct_tokens'].apply(lambda x: [word for word in x if re.search(r'([a-zA-Z])\1{2,}', word.lower())])
    df['repeat_ltr_count'] = df[text_col].str.count(r'([a-zA-Z])\1{2,}')
    df['repeat_ltr_ratio'] = (df['repeat_ltr_count'] / df['word_count']).fillna(0)
    return df

def tribi_grams(df):
    stop = nltk.corpus.stopwords.words('english')
    df['unigrams'] = df['no_punct_tokens'].apply(lambda x: [item for item in x if item not in stop])
    df['bigrams'] = df['unigrams'].apply(lambda x:(list(nltk.bigrams(x))))
    df['trigrams'] = df['unigrams'].apply(lambda x:(list(nltk.trigrams(x))))
    return df

def clean_features(df, text_col):
    df = count_ratios(df, text_col)
    df = punct_tokens(df, text_col)
    df.loc[df['avg_word_len'] == np.inf, 'avg_word_len'] = 0
    df = tribi_grams(df)
    return df

In [4]:
sms_df = pd.read_csv('data/smsspamcollection/SMSSpamCollection', sep='\t', names =['spam', 'text'])
sms_df = clean_features(sms_df, 'text')

In [5]:
sms_df['target'] = sms_df['spam'].map({'ham': 0, 'spam': 1})

In [6]:
sms_df.head(3)

Unnamed: 0,spam,text,char_count,word_count,ltr_count,ltr_ratio,spec_char_count,spec_char_ratio,num_count,num_ratio,...,word_len_range,word_diversity,avg_word_len,repeat_ltrs,repeat_ltr_count,repeat_ltr_ratio,unigrams,bigrams,trigrams,target
0,ham,"Go until jurong point, crazy.. Available only ...",111,20,83,0.747748,9,0.081081,0,0.0,...,8,1.0,5.55,[],0,0.0,"[go, jurong, point, crazy, available, bugis, n...","[(go, jurong), (jurong, point), (point, crazy)...","[(go, jurong, point), (jurong, point, crazy), ...",0
1,ham,Ok lar... Joking wif u oni...,29,6,18,0.62069,6,0.206897,0,0.0,...,5,1.0,4.833333,[],0,0.0,"[ok, lar, joking, wif, u, oni]","[(ok, lar), (lar, joking), (joking, wif), (wif...","[(ok, lar, joking), (lar, joking, wif), (jokin...",0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,33,97,0.625806,6,0.03871,6,0.03871,...,10,0.636364,4.69697,[],0,0.0,"[free, entry, wkly, comp, win, fa, cup, final,...","[(free, entry), (entry, wkly), (wkly, comp), (...","[(free, entry, wkly), (entry, wkly, comp), (wk...",1


In [7]:
X = sms_df.drop(['spam', 'target'], axis=1)
y = sms_df['target']
X_ngrams, X, y_ngrams, y = train_test_split(X, y, train_size=0.2, random_state=12)

In [8]:
ngram_df = pd.concat([X_ngrams, y_ngrams], axis=1)[['text', 'unigrams', 'bigrams', 'trigrams', 'target']]

In [9]:
ngram_df.head()

Unnamed: 0,text,unigrams,bigrams,trigrams,target
3442,Get a FREE mobile video player FREE movie. To ...,"[get, free, mobile, video, player, free, movie...","[(get, free), (free, mobile), (mobile, video),...","[(get, free, mobile), (free, mobile, video), (...",1
4172,Pls what's the full name of joke's school cos ...,"[pls, whats, full, name, jokes, school, cos, f...","[(pls, whats), (whats, full), (full, name), (n...","[(pls, whats, full), (whats, full, name), (ful...",0
1943,K...k:)why cant you come here and search job:),"[kkwhy, cant, come, search, job]","[(kkwhy, cant), (cant, come), (come, search), ...","[(kkwhy, cant, come), (cant, come, search), (c...",0
2089,Well done ENGLAND! Get the official poly ringt...,"[well, done, england, get, official, poly, rin...","[(well, done), (done, england), (england, get)...","[(well, done, england), (done, england, get), ...",1
2359,I'll talk to the others and probably just come...,"[ill, talk, others, probably, come, early, tom...","[(ill, talk), (talk, others), (others, probabl...","[(ill, talk, others), (talk, others, probably)...",0


In [10]:
def create_ngram_dist(df, ngram_col, target_col):
    total_ngram_list = list(itertools.chain(*df[ngram_col].ravel()))
    total_freq = nltk.FreqDist(total_ngram_list)
    total_freq_df = pd.DataFrame.from_dict(total_freq, orient='index', columns=['total_ct'])
    tar_gram_list = list(itertools.chain(*df[ngram_df[target_col] == 1][ngram_col].ravel()))
    tar_freq = nltk.FreqDist(tar_gram_list)
    tar_freq_df = pd.DataFrame.from_dict(tar_freq, orient='index', columns=['tar_ct'])
    freq_df = pd.merge(total_freq_df, tar_freq_df, how='left', left_index=True, right_index=True).fillna(0)
    freq_df['tar_rate'] = freq_df['tar_ct'] / freq_df['total_ct']
    freq_df['log2_total_ct'] = np.log2(freq_df['total_ct'])
    freq_df['log_total_ct'] = np.log(freq_df['total_ct'])
    freq_df['log10_total_ct'] = np.log10(freq_df['total_ct'])
    freq_df['log2_tar_score'] = freq_df['log2_total_ct'] * freq_df['tar_rate']
    freq_df['log_tar_score'] = freq_df['log_total_ct'] * freq_df['tar_rate']
    freq_df['log10_tar_score'] = freq_df['log10_total_ct'] * freq_df['tar_rate']
    return freq_df

In [11]:
unigram_dist_df = create_ngram_dist(ngram_df, 'unigrams', 'target')
bigram_dist_df = create_ngram_dist(ngram_df, 'bigrams', 'target')
trigram_dist_df = create_ngram_dist(ngram_df, 'trigrams', 'target')

In [12]:
unigram_dist_df.sort_values('tar_ct').head()

Unnamed: 0,total_ct,tar_ct,tar_rate,log2_total_ct,log_total_ct,log10_total_ct,log2_tar_score,log_tar_score,log10_tar_score
nap,2,0.0,0.0,1.0,0.693147,0.30103,0.0,0.0,0.0
hand,5,0.0,0.0,2.321928,1.609438,0.69897,0.0,0.0,0.0
wit,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
clear,2,0.0,0.0,1.0,0.693147,0.30103,0.0,0.0,0.0
copies,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=12)

In [14]:
X_train.head(3)

Unnamed: 0,text,char_count,word_count,ltr_count,ltr_ratio,spec_char_count,spec_char_ratio,num_count,num_ratio,vowel_count,...,min_word_len,word_len_range,word_diversity,avg_word_len,repeat_ltrs,repeat_ltr_count,repeat_ltr_ratio,unigrams,bigrams,trigrams
1938,The fact that you're cleaning shows you know w...,133,28,99,0.744361,9,0.067669,0,0.0,42,...,1,9,0.785714,4.75,[],0,0.0,"[fact, youre, cleaning, shows, know, im, upset...","[(fact, youre), (youre, cleaning), (cleaning, ...","[(fact, youre, cleaning), (youre, cleaning, sh..."
495,Are you free now?can i call now?,32,8,24,0.75,2,0.0625,0,0.0,12,...,1,5,0.875,4.0,[],0,0.0,"[free, nowcan, call]","[(free, nowcan), (nowcan, call)]","[(free, nowcan, call)]"
3798,For The First Time In The History 'Need' 'Comf...,180,33,115,0.638889,36,0.2,0,0.0,39,...,2,6,0.636364,5.454545,[],0,0.0,"[first, time, history, need, comfort, luxury, ...","[(first, time), (time, history), (history, nee...","[(first, time, history), (time, history, need)..."


In [219]:
%%timeit
temp = pd.DataFrame(X_train['unigrams'].values.tolist())
for col in temp.columns:
    temp[col] = temp[col].map(unigram_dist_df['total_ct'])
combined_df = pd.DataFrame()
combined_df['uni_total_ct'] = temp2.values.tolist()
combined_df['uni_total_ct'] = combined_df['uni_total_ct'].apply(lambda r: [x for x in r if ~np.isnan(x)])

559 ms ± 2.34 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
unigram_dist_df.columns

Index(['total_ct', 'tar_ct', 'tar_rate', 'log2_total_ct', 'log_total_ct',
       'log10_total_ct', 'log2_tar_score', 'log_tar_score', 'log10_tar_score'],
      dtype='object')

In [None]:
re.search()

In [41]:
def create_val_map_df(df, dist_df):
    combined_df = pd.DataFrame()
    val_map_cols = dist_df.columns
    word_df = pd.DataFrame(df['unigrams'].values.tolist())
    for val_col in val_map_cols:
        temp=pd.DataFrame()
        for col in word_df.columns:            
            temp[col] = word_df[col].map(dist_df[val_col])
        col_name = 'uni_' + val_col
        combined_df[col_name] = temp.values.tolist()
        combined_df[col_name] = combined_df[col_name].apply(lambda r: [x for x in r if ~np.isnan(x)])
    for math_col in [col for col in combined_df.columns if re.search(r'.tar_rate|.tar_score',col)]:
        col_name = math_col + '_min'
        combined_df[col_name] = combined_df[math_col].apply(lambda x: min(x) if x else 0)
        col_name = math_col + '_max'
        combined_df[col_name] = combined_df[math_col].apply(lambda x: max(x) if x else 0)
        col_name = math_col + '_mean'
        combined_df[col_name] = combined_df[math_col].apply(lambda x: np.mean(x) if x else 0)
        col_name = math_col + '_sum'
        combined_df[col_name] = combined_df[math_col].apply(lambda x: sum(x) if x else 0)
    return combined_df

In [47]:
unigram_vals_df = create_val_map_df(X_train, unigram_dist_df)
bigram_vals_df = create_val_map_df(X_train, bigram_dist_df)
trigram_vals_df = create_val_map_df(X_train, trigram_dist_df)

In [43]:
unigram_vals_df.head(3)

Unnamed: 0,uni_total_ct,uni_tar_ct,uni_tar_rate,uni_log2_total_ct,uni_log_total_ct,uni_log10_total_ct,uni_log2_tar_score,uni_log_tar_score,uni_log10_tar_score,uni_tar_rate_min,...,uni_log2_tar_score_mean,uni_log2_tar_score_sum,uni_log_tar_score_min,uni_log_tar_score_max,uni_log_tar_score_mean,uni_log_tar_score_sum,uni_log10_tar_score_min,uni_log10_tar_score_max,uni_log10_tar_score_mean,uni_log10_tar_score_sum
0,"[1.0, 14.0, 8.0, 48.0, 105.0, 44.0, 42.0]","[0.0, 0.0, 8.0, 6.0, 3.0, 8.0, 2.0]","[0.0, 0.0, 1.0, 0.125, 0.02857142857142857, 0....","[0.0, 3.807354922057604, 3.0, 5.58496250072115...","[0.0, 2.6390573296152584, 2.0794415416798357, ...","[0.0, 1.146128035678238, 0.9030899869919435, 1...","[0.0, 0.0, 3.0, 0.6981203125901445, 0.19183558...","[0.0, 0.0, 2.0794415416798357, 0.4839001263634...","[0.0, 0.0, 0.9030899869919435, 0.2101551546719...",0.0,...,0.734194,5.139357,0.0,2.079442,0.508904,3.562331,0.0,0.90309,0.221014,1.547101
1,"[64.0, 103.0]","[53.0, 59.0]","[0.828125, 0.5728155339805825]","[6.0, 6.6865005271832185]","[4.1588830833596715, 4.634728988229636]","[1.806179973983887, 2.012837224705172]","[4.96875, 3.8301313699399016]","[3.4440750534072277, 2.654844760248044]","[1.4957427909554064, 1.1529844296854868]",0.572816,...,4.399441,8.798881,2.654845,3.444075,3.04946,6.09892,1.152984,1.495743,1.324364,2.648727
2,"[13.0, 39.0, 42.0, 5.0, 41.0, 41.0, 41.0, 41.0]","[1.0, 3.0, 2.0, 5.0, 0.0, 0.0, 0.0, 0.0]","[0.07692307692307693, 0.07692307692307693, 0.0...","[3.700439718141092, 5.285402218862249, 5.39231...","[2.5649493574615367, 3.6635616461296463, 3.737...","[1.1139433523068367, 1.591064607026499, 1.6232...","[0.28464920908777636, 0.4065694014509422, 0.25...","[0.19730379672781054, 0.2818124343176651, 0.17...","[0.08568795017744898, 0.12238958515588455, 0.0...",0.0,...,0.40874,3.269924,0.0,1.609438,0.283317,2.266538,0.0,0.69897,0.123043,0.984345


In [48]:
bigram_vals_df

Unnamed: 0,uni_total_ct,uni_tar_ct,uni_tar_rate,uni_log2_total_ct,uni_log_total_ct,uni_log10_total_ct,uni_log2_tar_score,uni_log_tar_score,uni_log10_tar_score,uni_tar_rate_min,...,uni_log2_tar_score_mean,uni_log2_tar_score_sum,uni_log_tar_score_min,uni_log_tar_score_max,uni_log_tar_score_mean,uni_log_tar_score_sum,uni_log10_tar_score_min,uni_log10_tar_score_max,uni_log10_tar_score_mean,uni_log10_tar_score_sum
0,[],[],[],[],[],[],[],[],[],0,...,0,0,0,0,0,0,0,0,0,0
1,[],[],[],[],[],[],[],[],[],0,...,0,0,0,0,0,0,0,0,0,0
2,[],[],[],[],[],[],[],[],[],0,...,0,0,0,0,0,0,0,0,0,0
3,[],[],[],[],[],[],[],[],[],0,...,0,0,0,0,0,0,0,0,0,0
4,[],[],[],[],[],[],[],[],[],0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3561,[],[],[],[],[],[],[],[],[],0,...,0,0,0,0,0,0,0,0,0,0
3562,[],[],[],[],[],[],[],[],[],0,...,0,0,0,0,0,0,0,0,0,0
3563,[],[],[],[],[],[],[],[],[],0,...,0,0,0,0,0,0,0,0,0,0
3564,[],[],[],[],[],[],[],[],[],0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
bigram_vals_df[bigram_vals_df['uni_log2_tar_score_mean'] > 0]

Unnamed: 0,uni_total_ct,uni_tar_ct,uni_tar_rate,uni_log2_total_ct,uni_log_total_ct,uni_log10_total_ct,uni_log2_tar_score,uni_log_tar_score,uni_log10_tar_score,uni_tar_rate_min,...,uni_log2_tar_score_mean,uni_log2_tar_score_sum,uni_log_tar_score_min,uni_log_tar_score_max,uni_log_tar_score_mean,uni_log_tar_score_sum,uni_log10_tar_score_min,uni_log10_tar_score_max,uni_log10_tar_score_mean,uni_log10_tar_score_sum


In [49]:
trigram_vals_df

Unnamed: 0,uni_total_ct,uni_tar_ct,uni_tar_rate,uni_log2_total_ct,uni_log_total_ct,uni_log10_total_ct,uni_log2_tar_score,uni_log_tar_score,uni_log10_tar_score,uni_tar_rate_min,...,uni_log2_tar_score_mean,uni_log2_tar_score_sum,uni_log_tar_score_min,uni_log_tar_score_max,uni_log_tar_score_mean,uni_log_tar_score_sum,uni_log10_tar_score_min,uni_log10_tar_score_max,uni_log10_tar_score_mean,uni_log10_tar_score_sum
0,[],[],[],[],[],[],[],[],[],0,...,0,0,0,0,0,0,0,0,0,0
1,[],[],[],[],[],[],[],[],[],0,...,0,0,0,0,0,0,0,0,0,0
2,[],[],[],[],[],[],[],[],[],0,...,0,0,0,0,0,0,0,0,0,0
3,[],[],[],[],[],[],[],[],[],0,...,0,0,0,0,0,0,0,0,0,0
4,[],[],[],[],[],[],[],[],[],0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3561,[],[],[],[],[],[],[],[],[],0,...,0,0,0,0,0,0,0,0,0,0
3562,[],[],[],[],[],[],[],[],[],0,...,0,0,0,0,0,0,0,0,0,0
3563,[],[],[],[],[],[],[],[],[],0,...,0,0,0,0,0,0,0,0,0,0
3564,[],[],[],[],[],[],[],[],[],0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
trigram_vals_df[trigram_vals_df['uni_log2_tar_score_mean'] > 0]

Unnamed: 0,uni_total_ct,uni_tar_ct,uni_tar_rate,uni_log2_total_ct,uni_log_total_ct,uni_log10_total_ct,uni_log2_tar_score,uni_log_tar_score,uni_log10_tar_score,uni_tar_rate_min,...,uni_log2_tar_score_mean,uni_log2_tar_score_sum,uni_log_tar_score_min,uni_log_tar_score_max,uni_log_tar_score_mean,uni_log_tar_score_sum,uni_log10_tar_score_min,uni_log10_tar_score_max,uni_log10_tar_score_mean,uni_log10_tar_score_sum
