## Preprocessing Pipeline Assignment


# Initial Setup via Dr. Tang

In [1]:
import pandas as pd
from collections import Counter

# Optional: mount Google Drive so data files can be saved to and read from it.
# NOTE: the google.colab module is only importable inside a Google Colab runtime.
# The mount point used here is /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
# Location of the example data set on the mounted Google Drive
tweets_csv_path = '/content/drive/MyDrive/2024 Spring/Text Mining/Week 2/LucasHoskinAssignment/tweeter_training.csv'

# The CSV is read with header=None, so the column labels are supplied by hand
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']

# Read the raw file, then attach the labels
tw_df = pd.read_csv(tweets_csv_path, encoding='ISO-8859-1', header=None)
tw_df.columns = column_names

In [3]:
# Merge every entry of the 'text' column into one space-separated string
tweet_texts = tw_df['text']
raw_text = ' '.join(tweet_texts)

# Preview the first 50 characters as a quick sanity check
print(raw_text[:50])

@switchfoot http://twitpic.com/2y1zl - Awww, that'


# Creating and Testing the Tokenizer (Extracting Numbers and Symbols)

### Creating the tokenizer

In [7]:
def ExtractNumsAndSymbols(all_text):
    """
    Splits all_text into runs of consecutive digits and individual symbols

    A "symbol" is any single character that is neither alphanumeric
    (str.isalnum) nor whitespace (str.isspace), so '_' counts as a symbol.
    Letters and whitespace are dropped, but they do end the current digit run.

    Example:
        input: a12b_3!
        output: (['12', '3'], ['_', '!'])

    Parameters:
        all_text (str): One string containing all text

    Returns:
        digit_runs (list of str): each maximal run of consecutive digits, in order
        symbols (list of str): each symbol character, in order of appearance
    """

    digit_runs = []
    symbols = []

    # Digits of the run currently being read (empty when not inside a run)
    pending = []

    for ch in all_text:

        # Digits only ever extend the current run
        if ch.isdigit():
            pending.append(ch)
            continue

        # Any non-digit character closes the run that was in progress
        if pending:
            digit_runs.append(''.join(pending))
            pending = []

        # Keep the character only if it is a symbol
        if not (ch.isalnum() or ch.isspace()):
            symbols.append(ch)

    # The text may end in the middle of a digit run
    if pending:
        digit_runs.append(''.join(pending))

    return digit_runs, symbols

### Testing a small set of text

In [8]:
# Try the tokenizer on a short, hand-written string
text_test = '/hello//  siudfh lau89482u uansk48ngfadn4p8ajp*JAPRIONF54661PISUFHPsih 9f84PW&thp8'
numbers_test, symbols_test = ExtractNumsAndSymbols(text_test)

# Numbers on the first line, symbols on the second
print(numbers_test, symbols_test, sep='\n')

['89482', '48', '4', '8', '54661', '9', '84', '8']
['/', '/', '/', '*', '&']


### Testing the tweeter_training.csv file

In [9]:
# Tokenize the combined tweet text
numbers_tweet_text, symbols_tweet_text = ExtractNumsAndSymbols(raw_text)

# Rank the 20 most frequent numbers and the 20 most frequent symbols
top_nums = Counter(numbers_tweet_text).most_common(20)
top_syms = Counter(symbols_tweet_text).most_common(20)

# Print both rankings in the same "token : count" layout
for heading, ranking in (
    ("\ntop 20 Numbers and Frequencies:", top_nums),
    ("\ntop 20 Symbols and Frequencies:", top_syms),
):
    print(heading)
    for token, tally in ranking:
        print(token, ":", tally)


top 20 Numbers and Frequencies:
2 : 69177
3 : 53939
4 : 36776
1 : 36362
5 : 22683
6 : 22541
7 : 18742
8 : 15548
0 : 12534
10 : 10740
9 : 10665
30 : 8991
12 : 6158
11 : 5679
20 : 5267
100 : 4897
13 : 4741
15 : 4570
09 : 3435
24 : 3160

top 20 Symbols and Frequencies:
. : 2087151
! : 917950
@ : 798682
' : 646745
, : 486760
/ : 261643
? : 247562
; : 166822
- : 156793
& : 146413
: : 139057
_ : 90166
) : 48458
# : 45380
( : 42818
* : 38339
= : 11445
~ : 9052
$ : 7707
¿ : 7639


# Creating and Testing Another Tokenizer (Extracting All Three-Character Substrings)

### Creating the tokenizer

In [4]:
def ExtractSetsOfThree(all_text):
    """
    Collects every run of three consecutive characters in all_text

    The three-character window starts at the first character and moves
    forward one position at a time, so neighbouring results overlap by two
    characters. Text shorter than three characters produces an empty list.

    Example:
        input: testing123
        output: ['tes', 'est', 'sti', 'tin', 'ing', 'ng1', 'g12', '123']

    Parameters:
        all_text (str): One string containing all text

    Returns:
        (list of str): the three-character substrings, in order of position
    """

    window = 3

    # Last index at which a full window still fits inside the text
    last_start = len(all_text) - window

    return [all_text[start:start + window] for start in range(last_start + 1)]

### Testing a small set of text

In [5]:
# Try the tokenizer on a short, hand-written string
text_test = '/hello//  siudfh lau89482u uansk48ngfadn4p8ajp*JAPRIONF54661PISUFHPsih 9f84PW&thp8'
subsets_test = ExtractSetsOfThree(text_test)

# Only the first eight results are shown to keep the output short
first_eight = subsets_test[:8]
print(first_eight)

['/he', 'hel', 'ell', 'llo', 'lo/', 'o//', '// ', '/  ']


### Testing the tweeter_training.csv file

In [6]:
# Running the Tokenizer
subsets_tweet_text = ExtractSetsOfThree(raw_text)

# Show top 20 subsets from the tweeter_training.csv file
top_subsets = Counter(subsets_tweet_text).most_common(20)
print("\nTop Sets and Frequencies:")
# The loop variable is called `subset`, not `set`: a top-level loop variable
# stays in the notebook namespace, so naming it `set` would hide the built-in
# set type from every cell run afterwards.
for subset, count in top_subsets:
    print(subset, ":", count)


Top Sets and Frequencies:
 th : 1083248
ing : 849115
 to : 796095
ng  : 756348
the : 745486
to  : 578320
he  : 576971
  @ : 522450
 I  : 496608
nd  : 418627
 an : 383456
... : 379842
 ha : 376316
er  : 369208
 a  : 366212
ed  : 357901
.   : 355115
and : 354033
you : 348543
 yo : 343839
