<a href="https://colab.research.google.com/github/joywang233/TwitterPreprocessing/blob/main/Text_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import pandas as pd
import numpy as np
import re
import string
import warnings
from gensim.utils import deaccent
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords as stop_words

  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
def clean_text(text):
  #remove text in square brackets
  text = re.sub(r'\[.*?\]', '', text)
  #remove url
  text = re.sub(r"http\S+", "", text)
  text = re.sub(r"https\S+", "", text)
  #remove mentioned user
  text = re.sub('@[^\s]+','',text)
  #remove punctuation
  text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
  #remove rt
  text = text.replace('RT', '').replace('\n', ' ').strip()
  text = text.replace('rt', '').replace('\n', ' ').strip()
  text = text.replace('gt', '').replace('\n', ' ').strip()
  return text



# Remove emojis
def remove_emojis(text):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', text)



class WhiteSpacePreprocessing():
    """
    Ref: https://github.com/MilaNLProc/contextualized-topic-models/blob/master/contextualized_topic_models/utils/preprocessing.py
    Provides a very simple preprocessing script that filters infrequent tokens from text
    """

    def __init__(self, documents, stopwords_language="english", vocabulary_size=2000):
        """
        :param documents: list of strings
        :param stopwords_language: string of the language of the stopwords (see nltk stopwords)
        :param vocabulary_size: the number of most frequent words to include in the documents. Infrequent words will be discarded from the list of preprocessed documents
        """
        self.documents = documents
        self.stopwords = set(stop_words.words(stopwords_language)) #you may include your customized stopwords list
        self.vocabulary_size = vocabulary_size

        warnings.simplefilter('always', DeprecationWarning)
        warnings.warn("WhiteSpacePreprocessing is deprecated and will be removed in future versions."
                      "Use WhiteSpacePreprocessingStopwords.")

    def preprocess(self):
        """
        Note that if after filtering some documents do not contain words we remove them. That is why we return also the
        list of unpreprocessed documents.
        :return: preprocessed documents, unpreprocessed documents and the vocabulary list
        """
        preprocessed_docs_tmp = self.documents
        preprocessed_docs_tmp = [deaccent(doc.lower()) for doc in preprocessed_docs_tmp]
        preprocessed_docs_tmp = [doc.translate(
            str.maketrans(string.punctuation, ' ' * len(string.punctuation))) for doc in preprocessed_docs_tmp]
        print('check stop words:', self.stopwords)
        preprocessed_docs_tmp = [' '.join([w for w in doc.split() if len(w) > 0 and w not in self.stopwords])
                                 for doc in preprocessed_docs_tmp]

        vectorizer = CountVectorizer(max_features=self.vocabulary_size)
        vectorizer.fit_transform(preprocessed_docs_tmp)
        temp_vocabulary = set(vectorizer.get_feature_names_out())
        #print(len(temp_vocabulary))

        preprocessed_docs_tmp = [' '.join([w for w in doc.split() if w in temp_vocabulary])
                                 for doc in preprocessed_docs_tmp]

        # the size of the preprocessed or unpreprocessed_docs might be less than given docs
        # for that reason, we need to return retained indices to change the shape of given custom embeddings.
        preprocessed_docs, unpreprocessed_docs, retained_indices = [], [], []
        for i, doc in enumerate(preprocessed_docs_tmp):
            if len(doc) > 0:
                preprocessed_docs.append(doc)
                unpreprocessed_docs.append(self.documents[i])
                retained_indices.append(i)

        vocabulary = list(set([item for doc in preprocessed_docs for item in doc.split()]))
        return preprocessed_docs, unpreprocessed_docs, vocabulary, retained_indices

  and should_run_async(code)
  text = re.sub('@[^\s]+','',text)


In [15]:
#Read your text file(can be csv or any other format) here
raw_text = [
    'Interested in a career in #cybersecurity? üíª\n\nJoin us on Wednesday, October 19 from 12:00 pm ‚Äì 2:00 pm ET for a career panel with three esteemed leaders in the industry.\n\nSpots are limited so reserve yours today ‚¨áÔ∏è\nhttps://t.co/UTpyoj4Mpr https://t.co/G3ocJNAuU0'
    '‚û°Ô∏è‚û°Ô∏èCheck out the 5 main benefits of #Web3!\n\n#data #security #cybersecurity #bigdata #privacy #python #javascript #cloud #technology #devcommunity #coding #developers #software #aws #serverless #webdevelopment #opensource #iot https://t.co/UCtna6eme1'
    'RT @LetsDefendIO: Cybersecurity Wheel https://t.co/4qDF118WRf'
    'Grand raffle prize at @laasersladybugs Bugs, Bags &amp; Brews event!! üêû \n\nWin the bags &amp; boards on Saturday at Schram Brewery in Chaska!! üç∫ \n\nPlus, become an ally in the fight to #EndtheStigma. All proceeds go to mental health initiatives in public schools.\n\n1-6 pm. See you there! https://t.co/vgO4cRco11'
    '@AltCryptoGems @beyondprotocol1 has secured a $12.5M in this bear season to roll out it mainnet and building the next big thing: smart contracts for IoT. \n#DataSecurity #Infosec #100DaysOfCode #Hacking #Cybersecurity #AI #Crypto #CryptoNews #smartcities #smarthome https://t.co/xDkLKyFpsB'
    'RT @NandanLohitaksh: IDOR Checklist by @hunter0x7 \n\n#bugbounty #bugbountytips #cybersecurity https://t.co/MInHMRrCQL'
    ]

cleaned_text = [clean_text(txt) for txt in raw_text]
cleaned_text = [remove_emojis(txt) for txt in cleaned_text]
processor = WhiteSpacePreprocessing(cleaned_text)
preprocessed_docs, unpreprocessed_docs, vocab, retained_indices = processor.preprocess()
preprocessed_docs #here is the cleaned text in a list of string


check stop words: {'just', 'before', 'wouldn', 'into', 'does', 'between', 'where', "shouldn't", 'most', 'having', "weren't", 'below', 'we', 's', 'if', 't', 'was', 'had', 'that', "hasn't", 'here', 'my', 'now', 'him', 'and', 'how', 'be', 'mustn', 'yourselves', 'doesn', "needn't", 'off', 'more', 'the', 'each', "that'll", 'they', 'aren', "won't", 'will', 'any', 'its', 'theirs', "mustn't", 'so', 'shouldn', "don't", 'your', 'further', 'herself', 'myself', 'when', 'himself', "you'll", 'a', "didn't", 'at', "it's", 'all', "wasn't", 'other', 'wasn', 'd', 'no', 'o', "doesn't", 'don', 'of', 'did', 'them', "hadn't", 'is', 'been', 'hadn', "you're", "shan't", 'again', 'can', 'during', 'has', 'yourself', 'about', 'very', "you'd", 'above', 'it', 'nor', 'ain', 'ours', 'with', 're', 'she', 'i', 'have', 'from', 've', 'out', 'you', 'in', 'me', "haven't", 'her', 'doing', 'm', "you've", 'under', 'those', 'are', "should've", 'over', 'haven', 'll', 'yours', 'his', "mightn't", "couldn't", 'ma', 'hasn', 'should'

  and should_run_async(code)


['interested career cybersecurity join us wednesday october 19 1200 pm 200 pm et career panel three esteemed leaders industry spots limited reserve today main benefits web3 data security cybersecurity bigdata privacy python javascript cloud technology devcommunity coding developers software aws serverless webdevelopment opensource iot cybersecurity wheel raffle prize bugs bags amp brews event win bags amp boards saturday schram brewery chaska plus become ally fight endthestigma proceeds go mental health initiatives public schools 16 pm see secured 125m bear season roll mainnet building next big thing sma contracts iot datasecurity infosec 100daysofcode hacking cybersecurity ai crypto cryptonews smacities smahome idor checklist bugbounty bugbountytips cybersecurity']