In [1]:
import pandas as pd
import regex as re
import string

In [2]:
# Patterns are compiled once when the cell runs rather than on every call.
# Order matters: URLs, hashtags and mentions must go before punctuation is
# stripped (otherwise '#', '@' and '://' are gone and the tokens survive), and
# punctuation goes before digit-bearing tokens so 'abc-123' is dropped whole.
_REMOVAL_PATTERNS = (
    re.compile(r'\[.*?\]'),                                # [bracketed] spans
    re.compile(r'http\S+'),                                # URLs
    re.compile(r'#\S+'),                                   # hashtags
    re.compile(r'@\S+'),                                   # user mentions
    re.compile(r'[%s]' % re.escape(string.punctuation)),   # ASCII punctuation
    re.compile(r'\w*\d\w*'),                               # tokens containing a digit
    # Anything that is not a-z or whitespace. This also removes emoji and every
    # other non-ASCII symbol, so no separate emoji pass is needed.
    re.compile(r'[^a-z\s]'),
)
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalise a raw text snippet to lowercase ASCII letters and single spaces.

    The text is lowercased, then square-bracketed spans, URLs, hashtags,
    user mentions, ASCII punctuation, tokens containing a digit and finally
    every remaining character outside ``a-z``/whitespace are removed. Each run
    of whitespace (including a single newline or tab) is then replaced by one
    space. Leading/trailing whitespace is collapsed to a single space, not
    stripped.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example ``None`` or the float NaN
        pandas uses for missing cells) is treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``''`` for non-string input.
    """
    # Missing values from pandas arrive as None/NaN and have no .lower().
    if not isinstance(text, str):
        return ''

    text = text.lower()
    for pattern in _REMOVAL_PATTERNS:
        text = pattern.sub('', text)

    # r'\s+' (not r'\s\s+') so that a lone newline or tab is normalised too.
    return _WHITESPACE_RE.sub(' ', text)


In [3]:
def sample_and_drop(df, k, random_state=666):
    """Sample ``k`` rows, clean their ``'text'`` values and drop duplicates.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; must contain a ``'text'`` column.
    k : int
        Number of rows to draw (without replacement).
    random_state : int, default 666
        Seed passed to ``DataFrame.sample`` so the draw is reproducible.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (``'text'``) holding the distinct cleaned texts,
        indexed 0..n-1. It can have fewer than ``k`` rows because missing
        texts are skipped and duplicates are removed after cleaning.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` when ``k`` exceeds ``len(df)``.
    KeyError
        If ``df`` has no ``'text'`` column.
    """
    sampled = df.sample(k, random_state=random_state, ignore_index=True)
    # Missing texts (None/NaN) cannot be cleaned, so they are dropped up front.
    cleaned = sampled['text'].dropna().apply(clean_text)
    return pd.DataFrame(cleaned, dtype=str).drop_duplicates(ignore_index=True)

In [4]:
# Smoke test: one string mixing digits, punctuation, a non-ASCII symbol,
# a URL, a hashtag and a mention; only the plain words should remain.
example_tweet = '2342345  !!!! stop this war already ‼️ http://ww.f.d #russiaukraine @potus'
clean_text(example_tweet)

' stop this war already '

In [5]:
# Load the merged CSV; its first column becomes the index and only '\n'
# is treated as a row terminator.
df = pd.read_csv(
    'merged_eng_only.csv',
    index_col=0,
    lineterminator='\n',
    low_memory=False,
)

In [6]:
# One cleaned, de-duplicated sample per target size. Each name encodes the
# number of rows requested from df; the result can be smaller once duplicates
# are removed. Assignments stay sequential so earlier samples survive if a
# later, larger draw fails.
sample10k = sample_and_drop(df, k=10_000)
sample50k = sample_and_drop(df, k=50_000)
sample100k = sample_and_drop(df, k=100_000)
sample1m = sample_and_drop(df, k=1_000_000)
sample2m = sample_and_drop(df, k=2_000_000)
sample5m = sample_and_drop(df, k=5_000_000)
sample10m = sample_and_drop(df, k=10_000_000)
sample30m = sample_and_drop(df, k=30_000_000)

In [7]:
# Persist every sample as a UTF-8 CSV in the working directory. Running this
# cell overwrites any existing file of the same name, so skip it unless the
# samples should actually be stored.
suffix = 'no_hashtags'
_exports = [
    (sample10k, f'sample10k_{suffix}.csv'),
    (sample50k, f'sample50k_{suffix}.csv'),
    (sample100k, f'sample100k_{suffix}.csv'),
    (sample1m, f'sample1m_{suffix}.csv'),
    (sample2m, f'sample2m_{suffix}.csv'),
    (sample5m, f'sample5m_{suffix}.csv'),
    (sample10m, f'sample10m_{suffix}.csv'),
    (sample30m, f'sample30m_{suffix}.csv'),
]
for _frame, _csv_path in _exports:
    _frame.to_csv(_csv_path, encoding='utf-8')

In [8]:
# print(f'10k - {sample10k.sample(1).text} ')
# print(f'50k - {sample50k.sample(1).text} ')
# print(f'100k - {sample100k.sample(1).text} ')
# print(f'1m - {sample1m.sample(1).text} ')
# print(f'2m - {sample2m.sample(1).text} ')
# print(f'5m - {sample5m.sample(1).text} ')
# print(f'10m - {sample10m.sample(1).text} ')