# Tweet Sanitizer
---
Python code that sanitizes raw tweet content, i.e. removes hashtags, mentions, links, photos, etc.

In [1]:
import numpy as np

import re
import csv
import os

import emot
import emoji

from tqdm.notebook import tqdm

In [2]:
def reduce_to_polish(in_file, out_file):
    """Copy the header and the Polish-language rows of a tweet CSV to a new file.

    Parameters
    ----------
    in_file : str or path-like
        Source CSV. Its first row is treated as the header and copied verbatim.
    out_file : str or path-like
        Destination CSV. Overwritten if it already exists.

    A row is kept when the field at index 11 equals ``'pl'``. Blank lines and rows
    with fewer than 12 fields are skipped instead of raising ``IndexError``.
    """
    lang_col = 11  # index of the field compared against 'pl'

    # newline='' is what the csv module expects on both ends (otherwise quoted
    # newlines are mangled and Windows output gets blank rows); UTF-8 is spelled out
    # so Polish characters and emoji do not depend on the platform default.
    with open(in_file, 'r', newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        header = next(reader)

        with open(out_file, 'w', newline='', encoding='utf-8') as wf:
            writer = csv.writer(wf)
            writer.writerow(header)
            writer.writerows(
                row for row in reader
                if len(row) > lang_col and row[lang_col] == 'pl'
            )

**PARTIAL SANITIZATION**

Remove:
* strange non-utf-8 characters
* user mentions
* links {https://t.co/P3zt8zBUbL}
* photos content {pic.twitter.com...}
* hashtags with hashcodes {#.43djr324rj34}
* special characters {/w; /n; /r}
* redundant spaces


**FULL SANITIZATION**

Remove:
* everything removed by partial sanitization
* all hashtag symbols {#}
* anything else that is not text

Extract:
* emoticons {:); ;)}
* emoji {🤦‍♂️; 🤣; 😂; 🤣}

In [3]:
def partial_sanitization(text):
    """Strip links, mentions and other non-content noise from a raw tweet.

    Applied in order:

    1. URLs starting with ``http://`` / ``https://`` (a bare scheme is removed too),
    2. ``@mentions``,
    3. ``pic.twitter.com/...`` photo links,
    4. share hashcodes such as ``#.VIXGNXEL7p8.twitter``,
    5. ellipses (``...`` and ``…``),
    6. the black square character ``■``,
    7. runs of whitespace, which collapse to a single space.

    Hashtags, emoji and emoticons are left in place. Leading/trailing spaces are not
    stripped.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The sanitized text.
    """
    # Raw strings throughout: sequences like '\w' or '\(' inside a normal literal are
    # invalid string escapes and trigger warnings on current Python versions.
    substitutions = (
        # URL (or a dangling scheme left behind by a truncated link)
        (r'(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|http[s]?://)', ''),
        # user mention
        (r'@[\w\-]+', ''),
        # photo link; dots are escaped so only the literal host name matches
        (r'pic\.twitter\.com\/[\w\-]+', ''),
        # share hashcode: '#', any char, 11 alphanumerics, then anything up to a site name.
        # NOTE(review): '.*' is greedy, so text between a hashcode and a later site name
        # in the same line is removed as well.
        (r'#.[a-zA-Z0-9]{11}.*(twitter|facebook|reddit|youtube)', ''),
        # ASCII and Unicode ellipsis
        (r'(\.\.\.|…)', ''),
        # black square
        ('■', ''),
        # collapse whitespace last, after the removals above left gaps
        (r'\s+', ' '),
    )

    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text

def extract_emoji_(text):
    """Pull emoji out of ``text`` using the ``emot`` library.

    Parameters
    ----------
    text : str
        Text to scan.

    Returns
    -------
    tuple of (str, str)
        The text with every reported emoji removed, and the reported emoji joined
        by single spaces (in the order ``emot.emoji`` lists them under ``'value'``).
    """
    desc = emot.emoji(text)
    found = desc['value']
    emojis = ' '.join(found)

    # Plain substring replacement: emoji are literal text, not regex patterns, and some
    # (e.g. keycap sequences starting with '*' or '#') would be invalid or wrong as one.
    for emoji_ in set(found):
        text = text.replace(emoji_, '')

    return text, emojis

def extract_emoji(text):
    """Split ``text`` into non-emoji text and the emoji characters it contained.

    A character counts as an emoji when it is a key of ``emoji.UNICODE_EMOJI``.
    The check is per character, so multi-codepoint sequences are not treated as a unit.

    NOTE(review): this relies on ``emoji.UNICODE_EMOJI`` being a flat mapping keyed by
    emoji; confirm that against the installed version of the ``emoji`` package.

    Parameters
    ----------
    text : str
        Text to scan.

    Returns
    -------
    tuple of (str, str)
        The text with emoji characters removed, and the removed characters joined by
        single spaces in order of appearance (duplicates kept).
    """
    table = emoji.UNICODE_EMOJI
    kept, found = [], []

    # One pass, no regex: the original re-ran re.sub once per occurrence and passed the
    # character in unescaped.
    for char in text:
        (found if char in table else kept).append(char)

    return ''.join(kept), ' '.join(found)

def extract_emoticons(text):
    """Pull text emoticons such as ``:(`` out of ``text`` using the ``emot`` library.

    Parameters
    ----------
    text : str
        Text to scan.

    Returns
    -------
    tuple of (str, str)
        The text with every reported emoticon removed, and the reported emoticons
        joined by single spaces. If reading the result of ``emot.emoticons`` raises
        ``TypeError``, the text is returned unchanged with an empty emoticon string.
    """
    desc = emot.emoticons(text)
    try:  # there's a bug in 'emot' library causing TypeError in some cases
        found = list(desc['value'])
        emoticons = ' '.join(found)
    except TypeError:
        return text, ''

    # Literal replacement instead of a hand-escaped regex: prefixing every character
    # with a backslash turned ':D' into ':' + "any non-digit" and made ':P' an invalid
    # pattern whose error was silently swallowed.
    # Longest first so an emoticon is not broken up by a shorter one it contains.
    for emoticon in sorted(set(found), key=lambda e: (-len(e), e)):
        text = text.replace(emoticon, '')

    return text, emoticons

def full_sanitization(text):
    """Reduce a raw tweet to plain text and return its emoji and emoticons separately.

    Runs ``partial_sanitization`` first, then drops every ``#`` character (the hashtag
    words themselves stay), extracts emoji and emoticons, and finally collapses the
    whitespace left behind by the removals.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    tuple of (str, str, str)
        ``(text, emojis, emoticons)`` where the last two are space-separated strings
        as produced by ``extract_emoji`` and ``extract_emoticons``.
    """
    text = partial_sanitization(text)

    # '#' is a literal character, no regex needed.
    text = text.replace('#', '')

    text, emojis = extract_emoji(text)
    text, emoticons = extract_emoticons(text)

    # Raw string: '\s' in a normal literal is an invalid string escape.
    text = re.sub(r'\s+', ' ', text)

    return text, emojis, emoticons
    

In [4]:
# Sample raw tweet used by the demo cells below. It contains hashtags, a mention,
# emoticons, a pic.twitter.com link, a bare 'https://', full URLs, an emoji,
# ellipses and a share hashcode.
test_text = '#Kompania #Węglowa @weglowa :( pic.twitter.com/O2ixmQ2Jm1 https:// blokuje śląskie sądy. http://niezalezna.pl/209246-sprawdzili-czy-tusk 😂 20 tysięcy pozwów ws. deputatów węglowych :/- Dziennik...zachodni.pl:http://niezalezna.pl/209246-sprawdzili-czy-tusk-jest-winny #.VIXGNXEL7p8.twitter …'
test_text

'#Kompania #Węglowa @weglowa :( pic.twitter.com/O2ixmQ2Jm1 https:// blokuje śląskie sądy. http://niezalezna.pl/209246-sprawdzili-czy-tusk 😂 20 tysięcy pozwów ws. deputatów węglowych :/- Dziennik...zachodni.pl:http://niezalezna.pl/209246-sprawdzili-czy-tusk-jest-winny #.VIXGNXEL7p8.twitter …'

In [5]:
# Partial pass on the sample: hashtags, the emoji and the emoticons are kept.
partial_sanitization(test_text)

'#Kompania #Węglowa :( blokuje śląskie sądy. 😂 20 tysięcy pozwów ws. deputatów węglowych :/- Dziennikzachodni.pl: '

In [6]:
# Full pass on the sample: returns (text, emojis, emoticons).
full_sanitization(test_text)

('Kompania Węglowa blokuje śląskie sądy. 20 tysięcy pozwów ws. deputatów węglowych - Dziennikzachodni.pl: ',
 '😂',
 ':( :/')

In [7]:
# emot-based extractor on a sentence with several emoji; compare with extract_emoji below.
extract_emoji_('This does not work: 🤨, !🇵, 🤪, and 🥺. But this 😂 works!')

('This does not work: \U0001f928, !🇵, \U0001f92a, and \U0001f97a. But this  works!',
 '😂')

In [8]:
# Same sentence through the emoji.UNICODE_EMOJI-based extractor.
extract_emoji('This does not work: 🤨, !🇵, 🤪, and 🥺. But this 😂 works!')

('This does not work: , !, , and . But this  works!',
 '\U0001f928 🇵 \U0001f92a \U0001f97a 😂')

In [9]:
def sanitize_tweets(in_file, out_file, full_sanitize=False, reduce_to_polish=True, save_texts_only=False):
    """Sanitize the tweet column of a CSV file and write the result to a new CSV.

    The tweet text is read from the field at index 10 and the language code from the
    field at index 11 (0-based).

    Parameters
    ----------
    in_file : str or path-like
        Source CSV; the first row is the header.
    out_file : str or path-like
        Destination CSV; overwritten if it exists.
    full_sanitize : bool, default False
        If True use ``full_sanitization`` and (unless ``save_texts_only``) insert
        ``emojis`` and ``emoticons`` columns right after the tweet column; otherwise
        use ``partial_sanitization``.
    reduce_to_polish : bool, default True
        If True keep only rows whose language field equals ``'pl'``.
    save_texts_only : bool, default False
        If True write a single ``tweet`` column containing just the sanitized text.

    Blank lines and rows too short to hold the required fields are skipped instead of
    raising ``IndexError``.
    """
    text_col, lang_col = 10, 11

    # newline='' is required by the csv module on both ends; UTF-8 is explicit so that
    # Polish characters and emoji do not depend on the platform default encoding.
    with open(in_file, 'r', newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        header = next(reader)

        with open(out_file, 'w', newline='', encoding='utf-8') as wf:
            writer = csv.writer(wf)
            if save_texts_only:
                writer.writerow(['tweet'])
            elif full_sanitize:
                writer.writerow(header[:lang_col] + ['emojis', 'emoticons'] + header[lang_col:])
            else:
                writer.writerow(header)

            for row in reader:
                if len(row) <= text_col:
                    continue  # blank or truncated line: no tweet field to sanitize
                if reduce_to_polish and (len(row) <= lang_col or row[lang_col] != 'pl'):
                    continue

                # Sanitize once, then decide how much of the row to write.
                if full_sanitize:
                    text, emojis, emoticons = full_sanitization(row[text_col])
                    extras = [emojis, emoticons]
                else:
                    text = partial_sanitization(row[text_col])
                    extras = []

                if save_texts_only:
                    writer.writerow([text])
                else:
                    writer.writerow(row[:text_col] + [text] + extras + row[lang_col:])

In [10]:
# Fully sanitize the raw file, keeping rows of every language (reduce_to_polish=False).
# Paths are relative to the directory the notebook is run from.
sanitize_tweets('data/sady_supplement/sady_infos_raw.csv', 'data/sady_supplement/sady_infos_sanitized.csv',
                full_sanitize=True, reduce_to_polish=False)

**Get all texts from vulgar tweets.**