# Entry ? - Word distributions

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly.graph_objects as go
import plotly.express as px

import string
import re
import nltk
import emoji
import itertools

# Functions

## Filter data, create features

In [54]:
def count_ratios(df, text_col):
    """Add character-level count and ratio features for a text column.

    The frame is modified in place and also returned. Missing text is treated
    as an empty string so that every feature is defined for every row (the
    per-character punctuation count would otherwise fail on a NaN value).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to profile.
    text_col : str
        Name of the column in ``df`` that contains the text.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``char_count``, ``word_count`` and, for each of ``ltr``,
        ``spec_char``, ``num``, ``vowel``, ``caps``, ``newline_tab`` and
        ``qwerty``, a ``<name>_count`` column and a ``<name>_ratio`` column
        (count divided by ``char_count``; 0 where that is undefined).
    """
    text = df[text_col].fillna("")
    char_count = text.str.len()

    def add_feature(prefix, counts):
        # 0 / 0 on empty text yields NaN, which is reported as a ratio of 0.
        df[prefix + '_count'] = counts
        df[prefix + '_ratio'] = (counts / char_count).fillna(0)

    df['char_count'] = char_count
    df['word_count'] = text.str.count('\\w+')
    add_feature('ltr', text.str.count('[A-Za-z]'))
    # Occurrences of any ASCII punctuation character.
    add_feature('spec_char', text.apply(lambda s: sum(map(s.count, string.punctuation))))
    # '[0-9]+' counts runs of digits ("2005" is one match), not individual digits.
    add_feature('num', text.str.count('[0-9]+'))
    # 'y' is counted as a vowel.
    add_feature('vowel', text.str.count('[aeiouyAEIOUY]'))
    add_feature('caps', text.str.count('[A-Z]'))
    add_feature('newline_tab', text.str.count(r'[\t\r\n]'))
    # Lowercase letters a, s, d, f, g, h, j, k, l only.
    add_feature('qwerty', text.str.count('[asdfghjkl]'))
    return df
    
def _emoji_chars():
    """Return one string containing every code point of every emoji known to ``emoji``."""
    # Newer releases of the package expose EMOJI_DATA (emoji string -> metadata);
    # UNICODE_EMOJI no longer exists there.
    emoji_data = getattr(emoji, 'EMOJI_DATA', None)
    if emoji_data is not None:
        return ''.join(emoji_data)
    legacy = emoji.UNICODE_EMOJI
    # emoji 1.x nests the table by language code ({'en': {...}, 'es': {...}});
    # joining those top-level keys would delete the letters e, n, s, ... from the text.
    if legacy and all(isinstance(value, dict) for value in legacy.values()):
        return ''.join(char for per_language in legacy.values() for char in per_language)
    return ''.join(legacy)


def punct_tokens(df, text_col):
    """Tokenise a text column and add word-level features.

    The text is lower-cased, stripped of ASCII punctuation, a few extra
    punctuation marks and emoji characters, then split on whitespace.
    ``df`` is modified in place and returned. It must already contain the
    ``word_count`` and ``char_count`` columns, which are used as denominators.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text plus ``word_count`` and ``char_count``.
    text_col : str
        Name of the text column; missing values are treated as empty text.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``no_punct_tokens``, ``distinct_word_count``,
        ``max_word_len``, ``min_word_len``, ``word_len_range``,
        ``word_diversity``, ``avg_word_len``, ``repeat_ltrs``,
        ``repeat_ltr_count`` and ``repeat_ltr_ratio`` added.
    """
    # Deletion is per code point, so every code point that appears inside any
    # emoji sequence is removed.
    # NOTE(review): keycap emoji are built from '0'-'9', '#' and '*', so digits
    # are stripped from the tokens as well — confirm that this is wanted.
    strip_chars = string.punctuation + '-‘_”' + _emoji_chars()
    drop_chars = str.maketrans('', '', strip_chars)

    text = df[text_col].fillna("")
    # Tabs and newlines are left in place: split() already treats them as
    # separators, whereas deleting them would glue neighbouring words together.
    tokens = text.str.lower().str.translate(drop_chars).str.split()
    df['no_punct_tokens'] = tokens

    df['distinct_word_count'] = tokens.apply(lambda words: len(set(words)))
    df['max_word_len'] = tokens.apply(lambda words: max(map(len, words), default=0))
    df['min_word_len'] = tokens.apply(lambda words: min(map(len, words), default=0))
    df['word_len_range'] = df['max_word_len'] - df['min_word_len']
    # 0 / 0 gives NaN and is reported as 0; x / 0 gives inf and is left as is.
    df['word_diversity'] = (df['distinct_word_count'] / df['word_count']).fillna(0)
    df['avg_word_len'] = (df['char_count'] / df['word_count']).fillna(0)

    # A letter followed by at least two more copies of itself, e.g. "sooo".
    repeated = re.compile(r'([a-zA-Z])\1{2,}')
    df['repeat_ltrs'] = tokens.apply(lambda words: [word for word in words if repeated.search(word)])
    # Counted on the raw (not lower-cased) text, so "Ooo" is not a match.
    df['repeat_ltr_count'] = text.str.count(r'([a-zA-Z])\1{2,}')
    df['repeat_ltr_ratio'] = (df['repeat_ltr_count'] / df['word_count']).fillna(0)
    return df

def tribi_grams(df):
    """Add stopword-filtered unigram, bigram and trigram columns.

    Reads the ``no_punct_tokens`` column (a list of tokens per row), drops
    NLTK's English stopwords and stores the remaining tokens as ``unigrams``.
    ``bigrams`` and ``trigrams`` are built from the filtered tokens, so an
    n-gram can span a position where a stopword was removed. ``df`` is
    modified in place and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``no_punct_tokens`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``unigrams`` (list of str) and ``bigrams`` / ``trigrams``
        (lists of tuples) added.
    """
    # A set gives constant-time membership tests; with the plain list the whole
    # stopword list was scanned for every token.
    stop = set(nltk.corpus.stopwords.words('english'))
    df['unigrams'] = df['no_punct_tokens'].apply(
        lambda tokens: [token for token in tokens if token not in stop]
    )
    df['bigrams'] = df['unigrams'].apply(lambda tokens: list(nltk.bigrams(tokens)))
    df['trigrams'] = df['unigrams'].apply(lambda tokens: list(nltk.trigrams(tokens)))
    return df

def clean_features(df, text_col):
    """Run the whole feature pipeline on ``df`` for the text in ``text_col``.

    Applies ``count_ratios``, then ``punct_tokens``, resets infinite
    ``avg_word_len`` values to 0, and finally applies ``tribi_grams``.
    Returns the frame produced by the last step.
    """
    featured = punct_tokens(count_ratios(df, text_col), text_col)
    # avg_word_len is a quotient with word_count as denominator, so a row with
    # characters but no counted words comes out as +inf; zero those rows.
    is_infinite = featured['avg_word_len'] == np.inf
    featured.loc[is_infinite, 'avg_word_len'] = 0
    return tribi_grams(featured)

## Word frequency distributions

In [46]:
# stop = nltk.corpus.stopwords.words('english')
# # To make a custom list, just do pd.Series(['words', 'to', 'remove'])

# def create_word_freq(df, text_col):
#     newline_list = '\t\r\n'
#     remove_newline = str.maketrans(' ', ' ', newline_list)
#     emoji_string = ''.join(emoji.UNICODE_EMOJI.keys())
#     punct_list = string.punctuation + '-‘_”' + emoji_string
#     nopunct = str.maketrans('', '', punct_list)
#     freq = df[text_col].str.lower().str.translate(
#         remove_newline).str.translate(nopunct).str.split().apply(
#         lambda x: [item for item in x if item not in stop])
#     return freq

## N-gram frequency distributions

In [4]:
# def create_unigram_cfd(df):
#     gram_words = list(itertools.chain(*df.fraud_unigram.ravel()))
#     gram_cfd = nltk.ConditionalFreqDist(gram_words)
#     gram_cfd_df = pd.DataFrame(gram_cfd)
#     gram_cfd_df = gram_cfd_df[~gramcfd_df.index_isin(stop)]
#     gram_cfd_df['email_count'] = gram_cfd_df[0].fillna(0) + gram_cfd_df[1].fillna(0)
#     gram_cfd_df['unigram_spam_ratio'] = gram_cfd_df[1].fillna(0) / gram_cfd_df['email_count'].fillna(0)
#     return gram_cfd_df

# def create_bigram_cfd(df):
#     gram_words = list(itertools.chain(*df.fraud_bigram.ravel()))
#     gram_cfd = nltk.ConditionalFreqDist(gram_words)
#     gram_cfd_df = pd.DataFrame(gram_cfd)
#     gram_cfd_df['email_count'] = gram_cfd_df[0].fillna(0) + gram_cfd_df[1].fillna(0)
#     gram_cfd_df['unigram_spam_ratio'] = gram_cfd_df[1].fillna(0) / gram_cfd_df['email_count'].fillna(0)
#     return gram_cfd_df

# def create_trigram_cfd(df):
#     gram_words = list(itertools.chain(*df.fraud_trigram.ravel()))
#     gram_cfd = nltk.ConditionalFreqDist(gram_words)
#     gram_cfd_df = pd.DataFrame(gram_cfd)
#     gram_cfd_df['email_count'] = gram_cfd_df[0].fillna(0) + gram_cfd_df[1].fillna(0)
#     gram_cfd_df['unigram_spam_ratio'] = gram_cfd_df[1].fillna(0) / gram_cfd_df['email_count'].fillna(0)
#     return gram_cfd_df

# def create_gram_cfd(df):
#     uni_cfd = create_unigram_cfd(df)
#     bi_cfd = create_bigram_cfd(df)
#     tri_cfd = create_trigram_cfd(df)
#     return uni_cfd, bi_cfd, tri_cfd

In [55]:
def _label_gram_table(df, gram_col, label_col, exclude=()):
    """Count how often each gram in ``gram_col`` occurs under each class label."""
    excluded = set(exclude)
    # ConditionalFreqDist consumes (condition, sample) pairs, so every gram is
    # paired with the label of the row it came from. Passing a flat list of
    # words instead raises "too many values to unpack (expected 2)".
    pairs = [
        (label, gram)
        for label, grams in zip(df[label_col], df[gram_col])
        for gram in grams
        if gram not in excluded
    ]
    table = pd.DataFrame(nltk.ConditionalFreqDist(pairs))
    # Guarantee both class columns exist even if one class has no grams at all.
    for label in (0, 1):
        if label not in table.columns:
            table[label] = np.nan
    table['instance_count'] = table[0].fillna(0) + table[1].fillna(0)
    table['instance_ratio'] = table[1].fillna(0) / table['instance_count']
    return table


def create_unigram_cfd(df, label_col='spam'):
    """Per-word class counts built from ``df['no_punct_tokens']``.

    NLTK's English stopwords are left out of the table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``no_punct_tokens`` column (list of tokens per row) and
        a label column.
    label_col : str, default 'spam'
        Column holding the class label of each row.
        NOTE(review): assumes labels are 0/1 with 1 the class whose share is
        reported, and that the column is called 'spam' — confirm against the
        loaded data.

    Returns
    -------
    pandas.DataFrame
        Indexed by word, with one column per label (0 and 1),
        ``instance_count`` (sum of both) and ``instance_ratio``
        (label-1 count divided by ``instance_count``).
    """
    stop = nltk.corpus.stopwords.words('english')
    return _label_gram_table(df, 'no_punct_tokens', label_col, exclude=stop)


def create_bigram_cfd(df, label_col='spam'):
    """Per-bigram class counts built from ``df['bigrams']``.

    Same layout as ``create_unigram_cfd``, indexed by bigram.
    NOTE(review): this used to read ``df.fraud_bigram``, which no live code in
    the visible part of the notebook creates; ``bigrams`` is the column
    written by ``tribi_grams`` — confirm this is the intended source.
    """
    return _label_gram_table(df, 'bigrams', label_col)


def create_trigram_cfd(df, label_col='spam'):
    """Per-trigram class counts built from ``df['trigrams']``.

    Same layout as ``create_unigram_cfd``, indexed by trigram.
    NOTE(review): this used to read ``df.fraud_trigram``, which no live code
    in the visible part of the notebook creates; ``trigrams`` is the column
    written by ``tribi_grams`` — confirm this is the intended source.
    """
    return _label_gram_table(df, 'trigrams', label_col)


def create_gram_cfd(df, label_col='spam'):
    """Return the ``(unigram, bigram, trigram)`` class-count tables for ``df``."""
    uni_cfd = create_unigram_cfd(df, label_col=label_col)
    bi_cfd = create_bigram_cfd(df, label_col=label_col)
    tri_cfd = create_trigram_cfd(df, label_col=label_col)
    return uni_cfd, bi_cfd, tri_cfd

In [None]:
# NOTE(review): `split_data`, `sms_df` and a live definition of `create_word_freq` do not
# appear in the part of the notebook visible to this review (`create_word_freq` only exists
# inside a commented-out cell) — confirm they are defined in the kernel before this cell runs.
df_all, df_counts, df_ratios = split_data(sms_df)

# Build the word-frequency object from the 'text' column and the three n-gram tables.
word_freq = create_word_freq(sms_df, 'text')
unigram_cfd, bigram_cfd, trigram_cfd = create_gram_cfd(sms_df)

In [63]:
# Display word_freq; the recorded output is an object array holding one token list per message.
word_freq

array([list(['go', 'jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amore', 'wat']),
       list(['ok', 'lar', 'joking', 'wif', 'u', 'oni']),
       list(['free', 'entry', 'wkly', 'comp', 'win', 'fa', 'cup', 'final', 'tkts', 'st', 'may', 'text', 'fa', 'receive', 'entry', 'questionstd', 'txt', 'ratetcs', 'apply', 'overs']),
       ..., list(['pity', 'mood', 'soany', 'suggestions']),
       list(['guy', 'bitching', 'acted', 'like', 'id', 'interested', 'buying', 'something', 'else', 'next', 'week', 'gave', 'us', 'free']),
       list(['rofl', 'true', 'name'])], dtype=object)

In [None]:
def create_unigram_cfd(df, label_col='spam'):
    """Per-word class counts built from ``df['no_punct_tokens']``.

    NOTE(review): this cell redefines ``create_unigram_cfd`` from an earlier
    cell; once it has been run, this definition is the one in effect.

    Each token is paired with the label of its row before being handed to
    ``nltk.ConditionalFreqDist``, which consumes ``(condition, sample)``
    pairs (a flat list of words raises "too many values to unpack").
    NLTK's English stopwords are left out of the table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``no_punct_tokens`` column (list of tokens per row) and
        a label column.
    label_col : str, default 'spam'
        Column holding the class label of each row.
        NOTE(review): assumes labels are 0/1 with 1 the class whose share is
        reported, and that the column is called 'spam' — confirm against the
        loaded data.

    Returns
    -------
    pandas.DataFrame
        Indexed by word, with one column per label (0 and 1),
        ``instance_count`` (sum of both) and ``instance_ratio``
        (label-1 count divided by ``instance_count``).
    """
    stop = set(nltk.corpus.stopwords.words('english'))
    labelled_words = [
        (label, word)
        for label, tokens in zip(df[label_col], df['no_punct_tokens'])
        for word in tokens
        if word not in stop
    ]
    gram_cfd_df = pd.DataFrame(nltk.ConditionalFreqDist(labelled_words))
    # Guarantee both class columns exist even if one class contributed no words.
    for label in (0, 1):
        if label not in gram_cfd_df.columns:
            gram_cfd_df[label] = np.nan
    gram_cfd_df['instance_count'] = gram_cfd_df[0].fillna(0) + gram_cfd_df[1].fillna(0)
    gram_cfd_df['instance_ratio'] = gram_cfd_df[1].fillna(0) / gram_cfd_df['instance_count']
    return gram_cfd_df

In [59]:
# Flatten the per-row token lists of sms_df.no_punct_tokens into a single flat list of tokens.
test = list(itertools.chain(*sms_df.no_punct_tokens.ravel()))

In [61]:
# Peek at the first five flattened tokens.
test[:5]

['go', 'until', 'jurong', 'point', 'crazy']

In [60]:
# Fails by design of the input: ConditionalFreqDist unpacks every element as a
# (condition, sample) pair, but `test` is a flat list of words. 'go' happens to unpack
# into ('g', 'o'); the next word, 'until', has five characters and raises the ValueError.
nltk.ConditionalFreqDist(test)

ValueError: too many values to unpack (expected 2)

In [None]:
# Reference example: fill a ConditionalFreqDist incrementally instead of passing pairs
# to the constructor.
from nltk.probability import ConditionalFreqDist
from nltk.tokenize import word_tokenize
sent = "the the the dog dog some other words that we do not care about"
cfdist = ConditionalFreqDist()
for word in word_tokenize(sent):
    # The condition is the word's length, so cfdist[n] ends up counting the words of length n.
    condition = len(word)
    cfdist[condition][word] += 1

In [65]:
# Plain (unconditional) frequency count of the whitespace-separated words in `sent`.
sent = "the the the dog dog some other words that we do not care about"
nltk.FreqDist(sent.split())

FreqDist({'the': 3, 'dog': 2, 'some': 1, 'other': 1, 'words': 1, 'that': 1, 'we': 1, 'do': 1, 'not': 1, 'care': 1, ...})

In [67]:
# Same failure as with the flat token list: iterating a FreqDist yields its keys (the words),
# not (condition, sample) pairs, so unpacking 'the' into two values raises the ValueError.
nltk.ConditionalFreqDist(nltk.FreqDist(sent.split()))

ValueError: too many values to unpack (expected 2)