## Combining users and questions files and cleaning the data

In [29]:
import pandas as pd

# Load the two raw CSV exports: the user table and the question/post table.
users_path = 'data/users.csv'
posts_path = 'data/questions.csv'
users = pd.read_csv(users_path)
posts = pd.read_csv(posts_path)

  users = pd.read_csv('data/users.csv')
  posts = pd.read_csv('data/questions.csv')


In [8]:
print('Number of users', len(users))
# We need only gender from users
users = users[['Id', 'Sex']].drop_duplicates()
print('Number of users after duplicates removal', len(users))
users = users.dropna()
print('Number of users after NA removal', len(users))

# Coerce Sex to a number BEFORE the membership test. When a CSV column holds
# some non-numeric entries, read_csv can return an object column that mixes
# ints and strings (presumably the case here, given the non-numeric Ids
# handled below -- TODO confirm). On such a column isin([0, 1]) would silently
# reject the strings '0' and '1' and drop valid users.
users = users.assign(Sex=pd.to_numeric(users['Sex'], errors='coerce'))
users = users[users['Sex'].isin([0, 1])]
print('Number of users with binary gender', len(users))

# Filtering out weird cases with non-numeric Ids
has_numeric_id = pd.to_numeric(users['Id'], errors='coerce').notna()
users = users[has_numeric_id].astype({'Id': int, 'Sex': int})

# De-duplicate again now that both columns are integers: values such as 5 and
# '5' compare as different before the cast but collapse to the same user
# afterwards, and a repeated (Id, Sex) row would duplicate that user's posts
# when users is merged with posts on Id.
users = users.drop_duplicates()
print('Cleaned number', len(users))

Number of users 431563
Number of users after duplicates removal 431502
Number of users after NA removal 431095
Number of users with binary gender 400850
Cleaned number 400656


In [11]:
print('Number of posts', len(posts))
# Keep only the columns used downstream and discard rows missing any of them.
posts = posts[['Id', 'CreatedBy', 'Content', 'NewMood']].dropna()
print('Number of posts after NA removal', len(posts))
posts = posts.astype({'Id': int, 'CreatedBy': int, 'Content': str, 'NewMood': str})
# Inner join on the author: posts whose CreatedBy has no match in users are
# dropped. Both frames have an 'Id' column, so the result carries
# Id_x (from posts) and Id_y (from users).
posts = posts.merge(users, how='inner', left_on='CreatedBy', right_on='Id')
print('Number of posts with known binary gender', len(posts))

Number of posts 6633562
Number of posts after NA removal 4765774
Number of posts with known binary gender 3822868


In [14]:
# Short lowercase names for the columns that are kept.
column_names = {
    'Id_x': 'id',
    'CreatedBy': 'user',
    'Content': 'text',
    'NewMood': 'mood',
    'Sex': 'sex',
}

# Id_y is the users-side join key, i.e. the same value as CreatedBy, so drop it.
posts = posts.rename(columns=column_names).drop(columns='Id_y')

## Extracting only the 16 main mood labels, removing posts outside the 5-512 token range and saving the result

In [15]:
# The 16 mood labels kept for the dataset.
moods = {
    'Sad', 'Lonely',
    'Angry', 'Annoyed', 'Frustrated', 'Furious',
    'Anxious', 'Stressed', 'Afraid', 'Nervous', 'Worried',
    'Loving', 'Caring', 'Supportive',
    'Happy', 'Excited',
}

In [16]:
# Keep only posts labelled with one of the selected moods.
has_main_mood = posts['mood'].isin(moods)
posts = posts[has_main_mood]
print(len(posts))

1738282


In [17]:
from transformers import AutoTokenizer

# Tokenizer used by token_count to measure post length in tokens.
checkpoint = 'FacebookAI/roberta-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [18]:
def token_count(text):
    """Return how many token ids the module-level tokenizer produces for text.

    The count includes the special tokens the tokenizer adds, because
    encoding is done with add_special_tokens=True.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=True)
    return len(token_ids)

In [19]:
# Temporary helper column: tokenized length of every post's text.
posts = posts.assign(tokens=posts['text'].apply(token_count))

Token indices sequence length is longer than the specified maximum sequence length for this model (636 > 512). Running this sequence through the model will result in indexing errors


In [20]:
# Filtering data as defined in preregistration

# Series.between is inclusive on both ends: keeps 5 <= tokens <= 512.
length_ok = posts['tokens'].between(5, 512)
posts = posts[length_ok]
print(len(posts))

1711514


In [21]:
# The token count was only needed for the length filter; remove it before saving.
posts = posts.drop(columns=['tokens'])

In [27]:
# Write the final dataset without the index. quoting=1 is csv.QUOTE_ALL,
# so every field is wrapped in quotes.
posts.to_csv(
    'data/dataset.csv',
    index=False,
    quoting=1,
    escapechar='\\',
    doublequote=True,
)