In [23]:
# reads a corpus
# compartmentalized toxic avenger
# removes top n stop words - 1500? 500*10*.3
# applies lds with 1 word, many many iterations
# and 0 - 20 topics

import pandas as pd
import unicodedata
import re
import string

import nltk

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer

In [54]:
def strip_accents(STR):
    return ''.join(c for c in unicodedata.normalize('NFD', STR)
                   if unicodedata.category(c) != 'Mn')

# Example filter for noun-type structures bigrams

def bigram_filter(bigram):
    tag = nltk.pos_tag(bigram)
    if tag[0][1] not in ['JJ', 'NN'] and tag[1][1] not in ['NN']:
        return False
    if 'n' in bigram or 't' in bigram:
        return False
    if 'PRON' in bigram:
        return False
    return True


def let_bigrams_be_bigrams(string_text, finder, bigram_measures):
    """
    """
    # Example for detecting bigrams 
    

    # may want to filter only those that occur at least 5 times; may not
    #finder.apply_freq_filter(5)
    bigram_scores = finder.score_ngrams(bigram_measures.pmi)
    bigram_pmi = pd.DataFrame(bigram_scores)
    bigram_pmi.columns = ['bigram', 'pmi']
    bigram_pmi.sort_values(by='pmi', axis = 0, ascending = False, inplace = True)
    filtered_bigram = bigram_pmi[bigram_pmi.
                                 apply(lambda bigram: 
                                       bigram_filter(bigram['bigram']) 
                                       and bigram.pmi > 5, axis = 1)][:500]
    bigrams = [' '.join(x) for x in filtered_bigram.bigram.values if len(x[0]) > 2 or len(x[1]) > 2]
    return bigrams


def replace_ngram(doc_string_text, bigrams):
    for gram in bigrams:
        doc_string_text = doc_string_text.replace(gram, '_'.join(gram.split()))
    return doc_string_text


def clean_doc(doc, finder, bigram_measures, more_stop_words=[]): 
    """
    Given a string doc,
    this function cleans the doc
    of . . .
    non-ascii chars
    punctuation
    anything other than lower case words
    non-alphabetic tokens
    stop words
    tokens shorter than length 3
    stem, lemma are also options
    returns the list of tokens.
    """
    
    bigrams = let_bigrams_be_bigrams(doc, finder, bigram_measures)
    doc = replace_ngram(doc, bigrams)
    
    tokens = doc.split() # set the tone
    
    # punctuation situation
    sub_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [sub_punc.sub('', tkn) for tkn in tokens]
    
    # alpha
    tokens = [tkn for tkn in tokens if tkn.isalpha()]
    
    # shorties
    tokens = [tkn for tkn in tokens if len(tkn) > 3]
    
    # capitalization
    tokens = [tkn.lower() for tkn in tokens]
    
    # stop words
    stop_words = set(stopwords.words('english') + more_stop_words)
    tokens = [tkn for tkn in tokens if not tkn in stop_words]
    
    # word stemming    
#    ps = PorterStemmer()
#    tokens = [ps.stem(tkn) for tkn in tokens]
    
    # lemmatizing
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(wrd) for wrd in tokens]
    
    # bleh??
    # tag = nltk.pos_tag(bigram)
    
    return tokens # tokens is a list

In [56]:
#read in class corpus csv into python
data = pd.read_csv(r'~/Projects/msds453/MSDS_453_Public/MSDS453_ClassCorpus/ClassCorpus_Final_v5_20220717.csv')
buffer_data = data.copy()

data['stripped_text'] = data.Text.apply(lambda xyz: strip_accents(xyz))

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_documents(data.stripped_text)

#adding two columns to the dataframe to store the processed text and tokenized text
buffer_data['processed_text'] = buffer_data['Text'].apply(lambda x: clean_doc(x, finder, bigram_measures))

# getting the top freq qords so I can add them to stop_words
buffer_data['word_count'] = buffer_data['processed_text'].apply(lambda xyz : [idx[0] for idx in nltk.FreqDist(xyz).most_common(int(.3*500))])
more_stop_words = [idx[0] for idx in nltk.FreqDist([abc for xyz in buffer_data.word_count.to_list() for abc in xyz]).most_common(100)]

#adding two columns to the dataframe to store the processed text and tokenized text
data['processed_text'] = data['Text'].apply(lambda x: clean_doc(x, finder, bigram_measures, more_stop_words))

In [58]:
buffer_data[buffer_data.processed_text.apply(lambda xyz: "_" in ' '.join(xyz))==True]

Unnamed: 0,Doc_ID,DSI_Title,Text,Submission File Name,Student Name,Genre of Movie,Review Type (pos or neg),Movie Title,processed_text,word_count


In [59]:
data = pd.read_csv(r'~/Projects/msds453/MSDS_453_Public/MSDS453_ClassCorpus/ClassCorpus_Final_v5_20220717.csv')
buffer_data = data.copy()


In [None]:
data['stripped_text'] = data.Text.apply(lambda xyz: strip_accents(xyz))

In [99]:
s = "the quick.Brown fox"
re.sub(r"[a-z]\b(?!')", lambda m: m.group().lower(), s)

'the quick.Brown fox'

In [100]:
re.sub(r"^[A-Z]\b(.?!)\s", lambda m: m.group().upper(), s)

'the quick.Brown fox'

In [101]:
re.sub(r"^[A-Z][a-z\s]+[\.\?!]$", lambda m: m.group().upper(), s)

'the quick.Brown fox'

In [108]:
re.sub(r".[A-Z]", lambda m: m.group().lower(), s)

'the quick.brown fox'

In [109]:
t = "the quick. Brown fox"

In [114]:
re.sub(r". [A-Z]", lambda m: m.group().lower(), t)

'the quick. brown fox'

In [115]:
re.sub(r"[?!.] [A-Z]", lambda m: m.group().lower(), t)

'the quick. brown fox'

In [116]:
q = "the quick! Brown fox"

In [119]:
re.sub(r"[?!.] [A-Z]", lambda m: m.group().lower(), q)

'the quick? brown fox'

In [118]:
q = "the quick? Brown fox"