# Preprocessing of the tweet datasets

In [None]:
import pandas as pd
import re
import string
import emoji

## Load in the datasets

In [2]:
# Display setup: never truncate column contents, so whole tweets stay readable in DataFrame output
pd.options.display.max_colwidth = None

In [3]:
# Both corpora are tab-separated files; the first column is used as the index.
tsv_read_options = {'sep': '\t', 'index_col': [0]}

d_text = pd.read_csv('../Data/Combined_sets/depression_tweets.tsv', **tsv_read_options)
a_text = pd.read_csv('../Data/Combined_sets/anxiety_tweets.tsv', **tsv_read_options)

## Preprocessing the text

Normalisation applied to every tweet:
* lowercase everything
* URLs (http/https links) to 'URL'
* mentions (@user) to 'USR'
* hashtags to 'HASH'
* `<enter>` to a space
* `<squote>` to `'`
* `<dquote>` to `"`

In [None]:
# DEPRESSION
# Normalise the raw tweet text column-wise instead of looping over rows.
#
# Order matters:
#   1. lowercase first, so the upper-case placeholders (URL / USR / HASH)
#      cannot be confused with ordinary tweet text;
#   2. expand the <enter>/<squote>/<dquote> markup tokens before the
#      URL/mention/hashtag patterns run, otherwise "@bob<enter>hello" is
#      swallowed whole by the "everything up to the next whitespace" match.
#
# NOTE(review): HTML entities such as "&amp;" are left untouched here and will
# therefore contribute to the punctuation counts computed later.
d_clean = (
    d_text['full_text']
    .fillna('')  # a missing tweet becomes an empty string instead of crashing on .lower()
    .astype(str)
    .str.lower()
    .str.replace(r'<enter>', ' ', regex=True)
    .str.replace(r'<squote>', "'", regex=True)
    .str.replace(r'<dquote>', '"', regex=True)
    # Consume the whole link up to the next whitespace; the narrower character
    # class used before stopped at the second dot, '-', '?' or '_' and left
    # fragments of the link behind in the text.
    .str.replace(r'https?:\/\/?\S+', 'URL', regex=True)
    .str.replace(r'@\S+', 'USR', regex=True)
    # '$' has to be escaped: unescaped it is the end-of-string anchor, so the
    # pattern could never match anything.
    # NOTE(review): assumes tokens starting with a literal "$h" mark hashtags
    # in this dataset - confirm against the raw data.
    .str.replace(r'\$h\S+', 'HASH', regex=True)
    .str.replace(r'#\S+', 'HASH', regex=True)
)

# Keep a plain list so the later column assignment stays positional
d_tweets = d_clean.tolist()

In [7]:
# ANXIETY
# Normalise the raw tweet text column-wise instead of looping over rows.
#
# Order matters:
#   1. lowercase first, so the upper-case placeholders (URL / USR / HASH)
#      cannot be confused with ordinary tweet text;
#   2. expand the <enter>/<squote>/<dquote> markup tokens before the
#      URL/mention/hashtag patterns run, otherwise "@bob<enter>hello" is
#      swallowed whole by the "everything up to the next whitespace" match.
#
# NOTE(review): HTML entities such as "&amp;" are left untouched here and will
# therefore contribute to the punctuation counts computed later.
a_clean = (
    a_text['full_text']
    .fillna('')  # a missing tweet becomes an empty string instead of crashing on .lower()
    .astype(str)
    .str.lower()
    .str.replace(r'<enter>', ' ', regex=True)
    .str.replace(r'<squote>', "'", regex=True)
    .str.replace(r'<dquote>', '"', regex=True)
    # Consume the whole link up to the next whitespace; the narrower character
    # class used before stopped at the second dot, '-', '?' or '_' and left
    # fragments of the link behind in the text.
    .str.replace(r'https?:\/\/?\S+', 'URL', regex=True)
    .str.replace(r'@\S+', 'USR', regex=True)
    # '$' has to be escaped: unescaped it is the end-of-string anchor, so the
    # pattern could never match anything.
    # NOTE(review): assumes tokens starting with a literal "$h" mark hashtags
    # in this dataset - confirm against the raw data.
    .str.replace(r'\$h\S+', 'HASH', regex=True)
    .str.replace(r'#\S+', 'HASH', regex=True)
)

# Keep a plain list so the later column assignment stays positional
a_tweets = a_clean.tolist()

#### Add the edited text as a new column

In [8]:
# Attach each list of cleaned tweets to its frame (lists are in row order)
for frame, cleaned_tweets in ((d_text, d_tweets), (a_text, a_tweets)):
    frame['edited_text'] = cleaned_tweets

Drop the `full_text` column, which is no longer needed, to reduce memory use

In [10]:
# The raw text is superseded by 'edited_text'; keep only the cleaned version
d_text = d_text.drop('full_text', axis='columns')
a_text = a_text.drop('full_text', axis='columns')

## Counting

We're counting:
* words
* punctuation present
* exclamation marks
* mentions
* emojis
* if an emoji is present
* positive emojis
* negative emojis

### Words

In [11]:
# DEPRESSION and ANXIETY
# word_count = number of whitespace-separated tokens in the cleaned tweet
for frame in (d_text, a_text):
    frame['word_count'] = frame['edited_text'].map(lambda tweet: len(tweet.split()))

### Punctuation

In [12]:
# DEPRESSION
# For every cleaned tweet ('edited_text'), count the characters that appear in
# string.punctuation (ASCII punctuation only).
d_punctuation_counts = [
    sum(char in string.punctuation for char in tweet)
    for tweet in d_text['edited_text']
]

# Store the per-tweet counts as a new column
d_text['punctuation_count'] = d_punctuation_counts


In [13]:
# ANXIETY
# For every cleaned tweet ('edited_text'), count the characters that appear in
# string.punctuation (ASCII punctuation only).
a_punctuation_counts = [
    sum(char in string.punctuation for char in tweet)
    for tweet in a_text['edited_text']
]

# Store the per-tweet counts as a new column
a_text['punctuation_count'] = a_punctuation_counts

### Exclamation marks

In [14]:
# DEPRESSION
# Number of '!' characters in each cleaned tweet
d_exclamation_counts = [tweet.count('!') for tweet in d_text['edited_text']]

d_text['exclamation_count'] = d_exclamation_counts


In [15]:
# ANXIETY
# Number of '!' characters in each cleaned tweet
a_exclamation_counts = [tweet.count('!') for tweet in a_text['edited_text']]

a_text['exclamation_count'] = a_exclamation_counts

In [16]:
# Quick look at the first rows of the anxiety frame and the columns added so far
a_text.head(n=5)

Unnamed: 0_level_0,edited_text,word_count,punctuation_count,exclamation_count
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tD2619064,"shits so fucking inhumane. white people can deny or look the other way all they want. eat shit, fuck you &amp; your family tree bitch. black lives matter every day. good or bad. straight or gay. trans. darkskin or light skin. female or male. it‚Äôs a life.",47,13,0
tD2619065,"justice for eric lurry. there is clear camera fucking evidence. cops in joliet held his nose for 98 seconds, slapped him, made him go unconscious &amp; they killed him. HASH",30,7,0
tD2619066,can we fucking talk about this. URL,7,1,0
tD2619067,juice wrld makes me feel shit i didn‚Äôt even know i could feel. like why he so fucking talented &amp; make songs that make me t up and also keep going when life so dark. i love you j. i‚Äôll never shut up about you. ü§çüïä HASH,47,6,0
tD2619068,pop smoke already had me won over...but after hearing mood swings ü•¥ yeah ima need to do my shit to that trackkkkkk,22,3,0


### Mentions

In [17]:
# DEPRESSION
# Mentions were replaced by the 'USR' placeholder during preprocessing,
# so counting that placeholder gives the number of mentions per tweet.
d_mention_counts = [tweet.count('USR') for tweet in d_text['edited_text']]

d_text['mention_count'] = d_mention_counts


In [20]:
# ANXIETY
# Mentions were replaced by the 'USR' placeholder during preprocessing,
# so counting that placeholder gives the number of mentions per tweet.
a_mention_counts = [tweet.count('USR') for tweet in a_text['edited_text']]

a_text['mention_count'] = a_mention_counts

### Emojis

When counting emojis, we compute four features:

1. emoji_count = how many emojis are present in the tweet?
2. has_emoji = does the tweet contain at least one emoji?
3. positive_emojis = number of emojis from the positive-affect list
4. negative_emojis = number of emojis from the negative-affect list

In [21]:
# Emoji counter backed by the `emoji` package
def count_emojis_with_library(text):
    """Return the value of `emoji.emoji_count` for `text`."""
    n_emojis = emoji.emoji_count(text)
    return n_emojis

In [22]:
# DEPRESSION and ANXIETY
# emoji_count = number of emojis in the cleaned tweet
for frame in (d_text, a_text):
    frame['emoji_count'] = frame['edited_text'].map(count_emojis_with_library)

Make has_emoji

In [23]:
# DEPRESSION and ANXIETY
# has_emoji = True when the tweet contains at least one emoji
for frame in (d_text, a_text):
    frame['has_emoji'] = frame['emoji_count'].gt(0)

### Classifying positive emojis and negative emojis

In [24]:
# Unicode code points ("U+XXXX") of the emojis treated as positive / negative affect.
# Every code point is listed exactly once: the counting helper adds up the
# occurrences of each list entry, so a repeated entry would count that emoji twice.
positive_affect_emojis = [
    "U+1F602", "U+1F603", "U+1F604", "U+1F601", "U+1F923", "U+1F642", "U+2764",
    "U+1F60A", "U+263A", "U+1F970", "U+1F929", "U+1F495", "U+1F973",
]
negative_affect_emojis = [
    "U+1F62D", "U+1F622", "U+1F630", "U+1F61E", "U+1F61F", "U+1F620", "U+1F621",
    "U+1F612", "U+1F629", "U+1F480", "U+1F922", "U+1F494", "U+1F92C",
]

In [25]:
def count_specific_unicode_emojis(text, emoji_list):
    """Count how often the emojis named in `emoji_list` occur in `text`.

    Parameters
    ----------
    text : str
        The text to search.
    emoji_list : iterable of str
        Code points written as "U+" followed by hexadecimal digits,
        e.g. "U+1F602".

    Returns
    -------
    int
        Total number of occurrences of the listed emojis. Each distinct
        code point is counted once, even if it is repeated in `emoji_list`.
    """
    # Build a set of characters so a code point that appears twice in the list
    # (or in different letter case) cannot inflate the total.
    emoji_chars = {chr(int(emoji_unicode[2:], 16)) for emoji_unicode in emoji_list}
    return sum(text.count(emoji_char) for emoji_char in emoji_chars)

In [26]:
# DEPRESSION and ANXIETY
# Per-tweet counts of the emojis on the positive-affect and negative-affect lists
for frame in (d_text, a_text):
    cleaned_tweets = frame['edited_text']
    frame['positive_emojis'] = cleaned_tweets.map(
        lambda tweet: count_specific_unicode_emojis(tweet, positive_affect_emojis)
    )
    frame['negative_emojis'] = cleaned_tweets.map(
        lambda tweet: count_specific_unicode_emojis(tweet, negative_affect_emojis)
    )


## Save datasets

In [29]:
# Write the processed frames as tab-separated files into the directory the
# inputs were loaded from. The directory name has to match the 'Combined_sets'
# spelling used when loading: paths are case-sensitive on Linux, and to_csv()
# does not create a missing directory.
d_text.to_csv('../Data/Combined_sets/d_processed_tweets.tsv', sep='\t')
a_text.to_csv('../Data/Combined_sets/a_processed_tweets.tsv', sep='\t')