# Preprocessing Raw Data

<p>Parts of the code that need to be checked before execution once data access is available again:</p>
<ul>
    <li>Label processor: index 63399 cannot be dropped because its hashed version is not known.</li>
    <li>Pre-processing average counselor reviews: check whether that section of the code can actually be dropped.</li>
    <li>Pre-processing average counselor reviews: figure out why gamma is 0.9.</li>
</ul>

In [1]:
import label_processor
import nltk
import os
import random
import re
import pickle
import torch
import tqdm
import transformers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import defaultdict

from label_processor import list_series_values
from label_processor import LabelProcessorSimplified, list_series_value_counts

from sklearn import preprocessing

from tqdm.auto import tqdm, trange

In [2]:
# Relative directories used by the cells below; each path ends with "/".
raw_directory = "raw/"        # raw .tsv exports are read from here
pickle_directory = "pickle/"  # per-file MESSAGE_PART pickles are written here
save_directory = "saved/"     # outputs passed from one section to the next

### Pre-processing counselor surveys (COUNSELOR_SURVEY.tsv)

In [3]:
## Load the raw survey values and lower-case the column names.
## The raw row count is kept so a later cell can report how many rows were blank.

survey = pd.read_csv(raw_directory + "SURVEY_VALUE.tsv", sep="\t")
survey.columns = survey.columns.str.lower()
lines = len(survey)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [None]:
## Treat serialized "empty" answers as missing values.
## NOTE(review): the literals are matched exactly against the raw column; check that
## they correspond to the values actually present in SURVEY_VALUE.tsv.

empty_values = [
    "a:0:{}",
    "N;",
    'a:2:{i:0;N;i:1;s:0:";}"',
    'a:3:{i:0;N;i:1;N;i:2;s:0:";}"',
    'a:1:{i:0;s:0:";}"',
    'a:1:{i:0;s:3:n/a";}"',
    'a:1:{i:0;s:3:N/A";}"',
    'a:2:{i:0;s:0:";i:1;s:0:"";}"',
    's:1:" ";',
    's:4:"b:0;";',
    'a:1:{i:0;s:0:"";}',
    "",
]

# Assign the result back instead of calling replace(..., inplace=True) on the
# attribute-accessed column: the in-place form acts on an intermediate object and is
# not guaranteed to update `survey` (it has no effect under pandas Copy-on-Write).
survey["value"] = survey["value"].replace(empty_values, np.nan)

In [None]:
## Drop the rows without a value and report how many rows that removed.

survey = survey.dropna(subset=["value"])
lines2 = len(survey)
n_blank = lines - lines2
print(f"{n_blank} lines were blank or N/A ({n_blank/lines*100:.3f}%)")

In [None]:
## Parse the answers stored as serialized arrays (values starting with "a:<digit>").
## giant_regex extracts every element of the form  i:<n>;s:<len>: " <text> " ;

# `chars` documents the character class that is inlined in giant_regex; it is not used
# elsewhere in this cell.
# NOTE(review): inside the class, "=-—" is a character *range* from "=" to the em dash,
# which admits far more characters than the ones listed — confirm this is intended.
chars = r"\w\s\'\"\d\,\.\-\/\&\(\):/!?\+\@=-—£\[\]%’‘’#\*<>”“´…"
giant_regex = r"[i][:]\d+[;][s][:]\d+[:]\s\"\s[\w\s\'\"\d\,\.\-\/\&\(\):/!?\+\@=-—£\[\]%’‘’#\*<>”“´…]+\s\"\s[;]"

array_values = survey.value.dropna()
array_values = array_values[array_values.str.contains(r"^a:\d", regex=True)]

# A value with no matching element is marked as failed (NaN).
array_values = array_values.apply(lambda v: re.findall(giant_regex, v))
array_values = array_values.apply(lambda v: v if len(v) > 0 else np.nan)
array_values = array_values.apply(
    lambda a: [i for i in a if re.match(r"\w", i)] if isinstance(a, list) else a
)

# astype(str) would turn the NaN markers into the literal string "nan", which hides
# every failure from the count below and from the drop in the next cell. Remember the
# failed rows first and restore them after the string clean-up.
failed = array_values.isna()
array_values = array_values.astype(str).str.replace(r'i:\d;s:\d+:\s"\s', "", regex=True)
array_values = array_values.str.replace(r'\s"\s;', "", regex=True)
array_values[failed] = np.nan

survey["array_value"] = array_values
print(f"failed: {array_values.isna().sum()}, or {array_values.isna().mean()*100:.3f}%")

In [None]:
## Remove the survey rows whose array answer is marked as missing.
failed_mask = array_values.isna()
drop_index = array_values[failed_mask].index
survey = survey.drop(drop_index)

In [None]:
## Handles the remaining answers: values that are present but have no array_value.
## Reports whether all of them are serialized strings (values starting with "s:<digits>").

## Values that are not N/A and have no array_value. Answers starting with s:<digits>
## end up here because array_values only kept values starting with a:<digit>.
str_values = survey[~(survey.value.isna()) & survey.array_value.isna()].value

## number of candidate values
a = str_values.shape[0]

## mask of the values that start with s:<digits>
m = str_values.str.contains("^s:\d+", regex=True)

## values that do not start with s:<digits> (only used for the message below)
leftover = str_values[~m]

## keep only the values that start with s:<digits>
str_values = str_values[m]

## number of values that start with s:<digits>
b = str_values.shape[0]

print("Okay, all remaining non-N/A values are strings." if a==b 
      else f"Hmm... {leftover.shape[0]} values are not N/A but not recognised as strings either" )

## strip the s:<len>: " prefix and the " ; suffix so that only the answer text remains
str_values = str_values.str.replace('s:\d+[:]\s\"\s',"",regex=True)
str_values = str_values.str.replace('\s\"\s;', "", regex=True)

In [None]:
## Stores the cleaned string answers on the survey and counts the failed ones.
## A value fails when it does not start with at least one word character.

survey['str_value'] = str_values
f = ~str_values.str.match('\w+')
print(f"failed: {f.sum()}, or {f.mean()*100:.3f}%")

In [None]:
## Remove the survey rows flagged as failed in the mask `f`.
to_drop = f[f].index
survey = survey.drop(to_drop)

In [None]:
## Creates one column per question id on `survey`, holding that question's answers.
## Question 23 is skipped.

q_ids = survey.question_id.unique()
q_ids = [i for i in q_ids if i != 23]
for q in q_ids.copy():
    s = survey[survey.question_id==q]
    # a: the question has no string answers at all; b: it has no array answers at all
    a = s.str_value.isna().all()
    b = s.array_value.isna().all()
    
    # Exactly one of the two representations must be populated for a question;
    # otherwise (both or neither) the question id is raised as a ValueError.
    if a and not b:
        vals = s.array_value
    elif b and not a:
        vals = s.str_value
    else:
        raise ValueError(q)
    # the new column is named after the question id and is only filled on that question's rows
    survey.loc[survey.question_id==q,q] = vals

In [None]:
## Load SURVEY.tsv, lower-case the column names and rename "id" to "survey_id".

survey_conversation = (
    pd.read_csv(raw_directory + "SURVEY.tsv", sep="\t")
    .rename(columns=str.lower)
    .rename(columns={"id": "survey_id"})
)

In [None]:
## Attaches every question column to the SURVEY table, matching on survey_id,
## so that each question becomes a column of survey_conversation.

survey2 = survey.drop(['value','array_value','str_value','last_edit_time'],axis=1).set_index(['survey_id'])
survey_conversation2 = survey_conversation.copy()
for q in q_ids:
    i = survey2[q].dropna().index
    # each survey may carry at most one answer per question, otherwise the join would duplicate rows
    assert i.nunique() == i.shape[0]
    survey_conversation2 = survey_conversation2.join(survey2[q].dropna(),on="survey_id",how="left")

survey_conversation = survey_conversation2

In [None]:
## Index by conversation, drop the rows in which every column is NA, then fill the
## remaining gaps: "NA" for questions 21/22/33 and an empty list for 18/26/27/19.

survey_conversation = (
    survey_conversation.set_index("conversation_id")
    .drop("survey_id", axis=1)
    .dropna(how="all")
)

# Assign back instead of fillna(..., inplace=True) on a selected column: the in-place
# call on `survey_conversation[q]` is not guaranteed to update the frame (it has no
# effect under pandas Copy-on-Write).
for question in (21, 22, 33):
    survey_conversation[question] = survey_conversation[question].fillna("NA")

# Test for a missing value with isnan on floats rather than `a is np.nan`, which only
# recognises the np.nan singleton. Series.apply is used per column because
# DataFrame.applymap is deprecated.
for question in (18, 26, 27, 19):
    survey_conversation[question] = survey_conversation[question].apply(
        lambda a: [] if isinstance(a, float) and np.isnan(a) else a
    )

In [None]:
## Removes every single-quote character from every column.
## NOTE(review): the .str accessor requires string-like columns — confirm that all
## columns of survey_conversation hold strings at this point.
for i in survey_conversation.columns:
    survey_conversation[i] = survey_conversation[i].str.replace("\'", "", regex=True)

In [None]:
## sanity check
survey_conversation

In [None]:
## saving 
survey_conversation.to_pickle(save_directory+'counselor_survey_by_conversation.pickle')

### Pre-processing MESSAGE_PART.tsv

<p>Concatenates all the MESSAGE_PART TSVs into one file and drops some unnecessary columns.</p>

In [None]:
## Convert every MESSAGE_PART_<n>.tsv into a pickle for the next step.
## The number of parts is the highest <n> found among the file names in raw_directory.

part_numbers = [
    int(re.findall("MESSAGE_PART_(\d+)", file_name)[0])
    for file_name in os.listdir(raw_directory)
    if "MESSAGE_PART" in file_name
]
num_files = max(part_numbers)
for i in trange(1, num_files + 1):
    df = pd.read_csv(f"{raw_directory}MESSAGE_PART_{i}.tsv", sep='\t')
    df.to_pickle(pickle_directory+f"MESSAGE_PART_{i}.pickle")

In [None]:
## Concatenates all the pickled message parts and saves them as a single file.
## NOTE: pd.concat is called without ignore_index, so each part keeps its own row
## labels and the labels repeat across parts in the combined frame.

num_files = max([int(re.findall("MESSAGE_PART_(\d+)",i)[0]) for i in os.listdir(raw_directory) if "MESSAGE_PART" in i])
df = pd.concat([pd.read_pickle(pickle_directory+f"MESSAGE_PART_{i}.pickle") for i in trange(1, num_files + 1)])
df.to_pickle(save_directory+"raw_messages.pickle")

# Kill the kernel here to avoid running out of memory, then redo the imports and the
# directory definitions and continue from the next cell.

In [None]:
# Reload the combined messages, lower-case the column names and
# discard the columns that are not needed downstream.

df = pd.read_pickle(save_directory + "raw_messages.pickle")
df.columns = df.columns.str.lower()
dropped_columns = ["salt", "retries", "delivery_error", "media_uri", "media_mimetype"]
df = df.drop(columns=dropped_columns)

### Pre-processing CONVERSATION_PARTICIPATION

<p>Uses CONVERSATION_PARTICIPATION.tsv to label each actor_id as texter or counselor.</p>

In [None]:
## Reload the combined raw messages. Uses save_directory like every other cell instead
## of a hard-coded "saved/" path, so that changing the directory only needs one edit.
df = pd.read_pickle(save_directory + "raw_messages.pickle")

In [None]:
## inspect df first!!
## The frame loaded from raw_messages.pickle is addressed with upper-case column names
## here, while every later cell uses lower-case names (df.actor_id, df.message, ...).
## Lower-casing is idempotent, so it is safe whether or not it was already applied.
df.columns = df.columns.str.lower()

## NOTE(review): assumes the first row labelled 1 was sent by the bot — confirm by inspecting df.
robot_id = df.loc[[1], 'actor_id'].iloc[0]
robot_id

In [None]:
## Load CONVERSATION_PARTICIPATION.tsv and lower-case the column names.

conversation_participation = pd.read_csv(raw_directory + "CONVERSATION_PARTICIPATION.tsv", sep="\t")
conversation_participation.columns = conversation_participation.columns.str.lower()

In [None]:
## Collect the actor ids per role. Actors recorded as 'observer' are counted as counselors.

role = conversation_participation['interaction']
counselor_ids = set(
    conversation_participation[role.isin(['counselor', 'observer'])].actor_id.unique()
)
texter_ids = set(conversation_participation[role == 'texter'].actor_id.unique())

In [None]:
## Labels every message with the role of its sender: bot, texter or counselor.
## The assignments run in this order, so a later line overrides an earlier one for an
## actor id that appears in more than one group (counselor wins over texter).

df.loc[df.actor_id == robot_id, "interaction"] = "bot"
df.loc[df.actor_id.isin(texter_ids), "interaction"] = "texter"
df.loc[df.actor_id.isin(counselor_ids), "interaction"] = "counselor"

In [None]:
## checks whether there are missing interactions; if none, skip next step
if df.interaction.isna().any():
    print("There are missing interactions.")

In [None]:
## If some messages still have no role, use the heuristic that every conversation is
## started by the texter: the sender of the first message of a conversation is added to
## texter_ids when that message has no role yet, and the texter label is applied again.

if df.interaction.isna().any():
    # rows whose id is the first message id of some conversation
    first_id_per_convo = df[df.id.isin(df.groupby('conversation_id').id.first())]
    texter_ids.update(first_id_per_convo[first_id_per_convo.interaction.isna()].actor_id)
    df.loc[df.actor_id.isin(texter_ids), "interaction"] = "texter"

In [None]:
## saves the df with actors

df.to_pickle(save_directory+'messages_with_actors.pickle')

### Fix character weirdness

In [None]:
## load the df again
df = pd.read_pickle(save_directory+'messages_with_actors.pickle')

In [None]:
## Use RoBERTa-base tokenizer to replace some unknown characters later

MODEL_NAME = "roberta-base"
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
## Drops the messages whose text contains "aes256".
## Missing messages are kept here (na=False); they are removed in a later cell.
df = df[~df.message.str.contains("aes256", na=False)]

In [None]:
## Replace emoji <span> markup with a bracketed token built from the second class name,
## e.g. <span class="x NAME"></span> becomes [NAME].
## regex=True is passed explicitly: Series.str.replace treats the pattern as a literal
## string by default since pandas 2.0, which would silently leave the markup untouched.
df.message = df.message.str.replace(
    r'<span class="?\w+ (\w+)"?></span>', r"[\1]", regex=True
)

In [None]:
## Normalize punctuation: curly quotes become straight quotes, the combining mark is
## removed and the en dash becomes a hyphen.
## The two character-class patterns need regex=True (the pandas >= 2.0 default is a
## literal match, which would never find the text "[‘’]"); the single-character
## patterns are matched literally.

df.message = df.message.str.strip().str.replace("[‘’]", "'", regex=True)
df.message = df.message.str.replace('[”“]', '"', regex=True)
df.message = df.message.str.replace('͟', '', regex=False)
df.message = df.message.str.replace("–", "-", regex=False)

In [None]:
## Remove null and empty messages.
## A boolean mask is used instead of df.drop(<index labels>): the row labels repeat
## across the concatenated MESSAGE_PART files, so dropping by label would also delete
## unrelated, non-empty messages that happen to share a label with an empty one.

df = df[df.message.notna() & (df.message.str.len() > 0)]

In [None]:
## Replace the listed Chinese characters with the tokenizer's unk token.
## regex=True is required for the character class to be interpreted as such
## (Series.str.replace matches literally by default since pandas 2.0).

df.message = df.message.str.replace(
    '[㔹㘵㔳㜴㙆㘱㙅㘹㘸㜲㜳㜷㘶㙂㙃㐹㘴㜹㘷㜵㜶㔴䑅㘳㘲䐸㍄㜰]', tokenizer.unk_token, regex=True
)

In [None]:
## Uses frequency and tokenization to find 'weird' and infrequent characters.
## Character frequencies are gathered in a single pass over all messages; the next
## statements keep the characters that the tokenizer splits into more than one token.

freq = defaultdict(int)


def addfreq(l):
    """Add the characters of the message ``l`` to the global ``freq`` counter.

    The original body iterated over an undefined name instead of the parameter,
    which raised NameError on the first message.
    """
    for ch in l:
        freq[ch] += 1

#tqdm.pandas()
# count the characters of every message (addfreq updates `freq` as a side effect)
df.message.apply(addfreq)

# keep the characters seen fewer than 100000 times that the tokenizer splits into
# more than one token; the result maps character -> frequency
freq2 = {i:j for i,j in freq.items() if j < 100000 and len(tokenizer.tokenize(i))>1}
freq2 = pd.Series(freq2)

In [None]:
## Replace every 'weird' character that appears fewer than 100 times with the unk token.
## Each character is escaped so that "]", "\", "^" or "-" cannot corrupt the character
## class, and the replacement is skipped when there is nothing to replace (an empty
## class "[]" is not a valid regex). regex=True is explicit because the pandas >= 2.0
## default is a literal match.
rare_chars = list(freq2[freq2 < 100].keys())
if rare_chars:
    rare_pattern = "[" + "".join(re.escape(ch) for ch in rare_chars) + "]"
    df.message = df.message.str.replace(rare_pattern, tokenizer.unk_token, regex=True)

In [None]:
## visual check of the rest of the tokens
freq2[freq2>=100]

In [None]:
## Replaces emojis with an [emoji] token followed by the actual emoji character(s)
## (which will hopefully help the model recognize emojis that were too infrequent and
## were replaced with unk).

def to_emoji(a):
    """Convert a hexadecimal code-point string into the corresponding character(s).

    A 10-character string is treated as two 5-character code points and yields both
    characters; any other string is parsed as one hexadecimal code point.
    """
    if len(a) != 10:
        return chr(int(f"0x{a}", 16))
    first, second = a[:5], a[5:]
    return to_emoji(first) + to_emoji(second)
# regex=True is required: with the pandas >= 2.0 default (literal matching) a callable
# replacement raises ValueError.
df.message = df.message.str.replace(
    r"\[emoji(\w+)\]", lambda m: "[emoji]" + to_emoji(m.group(1)), regex=True
)

In [None]:
df.to_pickle(save_directory+'messages_without_weird_characters.pickle')

### Conversation lengths (Fixing outliers)

<p>If the distribution is bimodal, use the plot to choose a cut-off at the lower end of the conversation lengths. At the higher end, remove conversations with length &gt; mean + 2 × IQR.
<br><br>Consider whether it would make sense to count the number of words/characters per conversation, because a conversation can consist of very few but very long messages. Keep in mind that long messages MAY correspond to long conversations (i.e. conversations with many messages).
</p>

In [None]:
df = pd.read_pickle(save_directory+'messages_without_weird_characters.pickle')

In [None]:
## Gets the length of each conversation, measured as the total number of characters
## over all of its non-bot messages.
conversation_lengths = df[df.interaction!='bot'].groupby('conversation_id').message.agg(lambda x : sum(len(i) for i in x))

In [None]:
## Plot the raw distribution of conversation lengths with two candidate cut-offs.
## conversation_lengths holds the total number of characters in the non-bot messages of
## each conversation, so the x axis is a character count, not a message count.

os.makedirs("figs", exist_ok=True)  # savefig does not create missing directories
conversation_lengths.hist(bins=200)
plt.plot([1200, 1200], [0, 60000])
plt.plot([2200, 2200], [0, 60000], color='red')
# plt.xlim(right=25000)
plt.grid()
plt.xlabel("Number of non-bot characters per conversation")
plt.ylabel("Number of conversations")
plt.title("Histogram of conversation lengths")
plt.savefig("figs/conversation_length_histogram.pdf")
plt.show()

In [None]:
## Same histogram of conversation lengths (non-bot characters per conversation) with
## taller cut-off markers. It is saved to the same file as the previous plot and
## therefore overwrites it.

os.makedirs("figs", exist_ok=True)  # savefig does not create missing directories
conversation_lengths.hist(bins=200)
plt.plot([1200, 1200], [0, 80000])
plt.plot([2200, 2200], [0, 80000], color='red')
plt.grid()
plt.xlabel("Number of non-bot characters per conversation")
plt.ylabel("Number of conversations")
plt.title("Histogram of conversation lengths")
plt.savefig("figs/conversation_length_histogram.pdf")
plt.show()

In [None]:
## Chooses the length bounds and shows them on the histogram.
## Lower bound: fixed at 1200 (update it after inspecting the plots above).
## Upper bound: mean + 2 * IQR of the conversation lengths.
## Nothing is filtered here; the selection itself happens in the next cell.

iqr = conversation_lengths.quantile(0.75) - conversation_lengths.quantile(0.25)
min_len = 1200
max_len = conversation_lengths.mean() + 2*iqr
print(f"removing from below {(conversation_lengths<min_len).mean() * 100:.2f}%")
print(f"removing from above {(conversation_lengths>max_len).mean() * 100:.2f}%")
conversation_lengths.hist(bins=200)
# height of the rectangle that marks the selected range
y = 31000
plt.plot([min_len, min_len, max_len,max_len, min_len],[y,0,0,y,y])
plt.grid()
plt.xlim(right=25000)
plt.xlabel("Conversation Length (non-bot characters) with minor preprocessing")
plt.ylabel("Number of Conversations")
plt.title("Histogram of conversation lengths");
plt.savefig("figs/conversation_length_histogram_selected.pdf")
plt.show()

In [None]:
## Keep only the messages of conversations whose length lies strictly between
## min_len and max_len, and save that selection as the final message dataset.

within_bounds = (conversation_lengths > min_len) & (conversation_lengths < max_len)
convo_ids = conversation_lengths[within_bounds].index
df2 = df[df.conversation_id.isin(convo_ids)]
df2.to_pickle(save_directory + 'selected_messages.pickle')

### Train/Test split

In [3]:
## loads the messages
df = pd.read_pickle(save_directory+"selected_messages.pickle")

In [4]:
## Shuffles the unique conversation ids in place.
## The generator is seeded with 42, so the order is reproducible for the same input.

all_convo_ids = df.conversation_id.unique()
np.random.default_rng(42).shuffle(all_convo_ids)

In [5]:
## Split the shuffled conversation ids: the first 5% become the test set and the rest
## the training set. Both sets are saved with torch.save.

l = len(all_convo_ids)
l
test_size = int(l * 0.05)
held_out, remaining = all_convo_ids[:test_size], all_convo_ids[test_size:]
test_convos = set(held_out)
train_convos = set(remaining)

torch.save(train_convos, save_directory + "train_convos.torch")
torch.save(test_convos, save_directory + "test_convos.torch")

### Pre-processing texter survey (TEXTER_SURVEY_RESPONSE.tsv/TEXTER_SURVEY_RESPONSE_VALUE.tsv)

In [None]:
## Maps each conversation to its texter: the actor_id of the first message labelled
## 'texter' within that conversation.

messages = pd.read_pickle(save_directory+'messages_with_actors.pickle')
texters = messages[messages.interaction == 'texter'].groupby('conversation_id').actor_id.first()

In [None]:
## Load both texter survey tables and lower-case their column names.
texter_survey_response = pd.read_csv(raw_directory + "TEXTER_SURVEY_RESPONSE.tsv", sep="\t")
texter_survey_response_value = pd.read_csv(raw_directory + "TEXTER_SURVEY_RESPONSE_VALUE.tsv", sep="\t")
texter_survey_response_value.columns = texter_survey_response_value.columns.str.lower()
texter_survey_response.columns = texter_survey_response.columns.str.lower()

In [None]:
# Remove rows where the value is N/A. The explicit copy makes the frame independent of
# the original, so the column assignment below does not write into a filtered slice.
texter_survey_response_value = texter_survey_response_value[
    texter_survey_response_value.value.notna()
].copy()

# Convert question numbers from float to str (via int)
texter_survey_response_value["question_id"] = texter_survey_response_value["question_id"].apply(
    lambda x: str(int(x))
)

# Find questions where a single survey has multiple answers, not counting the flag "Other - Write In"
g = texter_survey_response_value[texter_survey_response_value.value!='Other - Write In'].groupby(['response_id','question_id'])
answer_counts = g.count().value  # computed once instead of twice
a = answer_counts[answer_counts > 1].index.to_frame().question_id.unique()
a.sort()

checkbox_questions = list(a)

print("The following questions are being treated as 'checkbox questions':")
", ".join(checkbox_questions)

In [None]:
## Sets up the dataframe for the collated texter responses:
## one column per question_id and one row per response_id.

texter_survey_collated = pd.DataFrame(
    columns=texter_survey_response_value.question_id.unique(), 
    index = texter_survey_response_value.response_id.unique()
)

# the lambda is called once per cell, so every cell starts with its own empty list
texter_survey_collated = texter_survey_collated.applymap(lambda x:[])

In [None]:
# Fills the collated dataframe: every answer is appended to the list stored at
# (response_id, question_id). Should take ~2-3 mins to run.
# The three fields are read by column name rather than by their position in the row
# tuple, so the loop does not depend on the number or order of the other columns.

answers = zip(
    texter_survey_response_value["response_id"],
    texter_survey_response_value["question_id"],
    texter_survey_response_value["value"],
)
for response_id, question_id, value in tqdm(answers, total=texter_survey_response_value.shape[0]):
    texter_survey_collated.at[response_id, question_id].append(value)

In [None]:
## Empty answer lists become NaN. Columns in which no response has more than one
## answer are then unwrapped from one-element lists to the plain value.

texter_survey_collated = texter_survey_collated.applymap(lambda a: a if len(a) > 0 else np.nan)
# longest answer list per column
cols = [texter_survey_collated[i].dropna().apply(len).max() for i in texter_survey_collated]
# columns whose longest answer list has exactly one element
cols = texter_survey_collated.columns[[i == 1 for i in cols]]
texter_survey_collated[cols] = texter_survey_collated[cols].applymap(lambda a: a[0] if type(a)==list else a)

In [None]:
## Joins texter_survey_response (the other information about each response, e.g. time
## of submission) with the collated answers on response_id.
## NOTE(review): drop([0]) removes the row labelled 0 before the join; the reason is
## not visible here — confirm it is still required.

texter_survey_response['response_id'] = texter_survey_response.id
texter_survey_collated = texter_survey_response.drop([0]).join(texter_survey_collated, on="response_id")

In [None]:
## Question ids (given as a mix of ints and strings) that are merged per texter in the
## cells below. They are normalized to strings in ascending numeric order because the
## question columns are named with string ids.

a = [69, 71, 73, 74, 75, 151, 205, 72, 144, 145, 85, 86, 218, '221', '222', '223', '224', '227', '272', '273', '274', '275', '278', '269', 217, 70, 292, 270, '152', '225', 240, 220, 219, 289, 291, 268, 238, 241]
numeric_ids = sorted(map(int, a))
texter_demo_questions = list(map(str, numeric_ids))

In [None]:
# renumber the rows 0..n-1 so that the row labels are unique positions
texter_survey_collated.index = range(len(texter_survey_collated))

# maps each actor_id to the row labels of all survey responses by that actor
actor_2_index = texter_survey_collated['actor_id'].to_frame().reset_index().groupby('actor_id').groups

In [None]:
def merge(xx):
    """Merge several answers to the same question into one value.

    Args:
        xx: non-empty iterable of answers; each answer is a scalar, a missing value
            (NaN/None) or a list of values.

    Returns:
        * the single element, unchanged, when ``xx`` has exactly one element;
        * when at least one answer is a list: the distinct non-missing items of all
          list answers, in first-seen order (non-list answers are ignored);
        * ``np.nan`` when every answer is missing;
        * the common value when all non-missing answers are equal;
        * ``None`` when the non-missing answers conflict.

    Raises:
        AssertionError: if ``xx`` is empty.
    """
    xx = list(xx)
    assert len(xx) != 0
    if len(xx) == 1:
        return xx[0]

    if any(isinstance(x, list) for x in xx):
        # A dict keeps insertion order, so the merged list is deterministic; going
        # through a set made the order vary between runs.
        merged = {}
        for answers in xx:
            if not isinstance(answers, list):
                continue
            for item in answers:
                if not pd.isna(item):
                    merged[item] = None
        return list(merged)

    present = [x for x in xx if not pd.isna(x)]
    if not present:
        return np.nan
    if len(set(present)) == 1:
        return present[0]
    return None  # CONFLICT

In [None]:
# register progress_apply on pandas objects
tqdm.pandas()
# For every actor, combine the answers given across all of that actor's responses:
# merge() is applied to each question column within the actor's group.
a = texter_survey_collated[texter_demo_questions+['actor_id']]
a = a.groupby('actor_id').progress_apply(lambda a:a.apply(merge))
a.drop('actor_id',axis=1,inplace=True)
# actors without any merged answer are removed
a.dropna(how='all',inplace=True)

In [None]:
to_add = []

def f(i):
    """Propagate one actor's merged answers to all of that actor's conversations.

    Args:
        i: ``(actor_id, answers)`` pair as produced by ``DataFrame.iterrows()`` on the
            merged per-actor table; ``answers`` is a Series indexed by question id.

    Side effects:
        * writes every non-missing merged answer into all rows of the global
          ``texter_survey_collated`` that belong to the actor;
        * appends to the global ``to_add`` one record per conversation of the actor
          (according to ``texters``) that is not in ``survey_convo_ids``.
    """
    actor_id, answers = i[0], i[1]
    convo_ids = list(texters[texters == actor_id].index)
    vals = answers.dropna()
    rows = actor_2_index[actor_id].tolist()

    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for col, val in vals.items():
        if isinstance(val, list):
            # assign the list cell by cell; a direct assignment would try to spread
            # the list elements over the selected rows
            texter_survey_collated.loc[rows, col] = texter_survey_collated.loc[rows, col].apply(lambda _: val)
        else:
            texter_survey_collated.loc[rows, col] = val

    missing_convos = [c for c in convo_ids if c not in survey_convo_ids]
    to_add.extend({"actor_id": actor_id, "conversation_id": c, **vals} for c in missing_convos)

In [None]:
# conversations that already have a survey row
survey_convo_ids = set(texter_survey_collated.conversation_id)
# start from an empty list so that re-running this cell does not duplicate records
to_add = []
# f() updates texter_survey_collated and to_add as side effects
for i in tqdm(a.iterrows(),total=a.shape[0]):
    f(i)

In [None]:
# Append the extra per-conversation records and save the result.
# DataFrame.append was removed in pandas 2.0; pd.concat with a frame built from the
# list of records is the equivalent operation.
texter_survey_collated2 = pd.concat(
    [texter_survey_collated, pd.DataFrame(to_add)], ignore_index=True
)
texter_survey_collated2.to_pickle(save_directory + "texter_survey_collated.pickle")

### Pre-processing active rescues 

In [None]:
## Load REPORTING_LOG.tsv, lower-case the column names and replace all empty cells with NA.
## The file is a tab-separated text file, so it has to be read with read_csv;
## read_pickle does not accept `sep` and cannot parse a TSV.

reporting_log = pd.read_csv(raw_directory + "REPORTING_LOG.tsv", sep="\t")
reporting_log.columns = reporting_log.columns.str.lower()
reporting_log = reporting_log.replace("", np.nan)

In [None]:
## Aggregates the reporting log per conversation (a conversation can have several rows):
##   report_type / report_sub_type: list of all types / sub-types of the conversation
##   any_finalized / any_canceled:  maximum of the flag over the conversation's rows
##   all_canceled:                  minimum of the canceled flag over the rows
## NOTE(review): the any_/all_ names presume that the flags are 0/1 — confirm.

report_type = reporting_log.groupby('conversation_id').type.agg(list)
report_type.name = "report_type"
report_sub_type = reporting_log.groupby('conversation_id').sub_type.agg(list)
report_sub_type.name = "report_sub_type"
any_finalized = reporting_log.groupby('conversation_id').finalized.max()
any_finalized.name = "any_finalized"
any_canceled = reporting_log.groupby('conversation_id').canceled.max()
any_canceled.name = "any_canceled"
all_canceled = reporting_log.groupby('conversation_id').canceled.min()
all_canceled.name = "all_canceled"

In [None]:
## Removes the NA entries from every list of report types / sub-types.
## Lists that end up empty are replaced with NA.

report_type=report_type.apply(lambda a: [i for i in a if not pd.isna(i)])
report_sub_type=report_sub_type.apply(lambda a: [i for i in a if not pd.isna(i)])

report_type.loc[report_type.apply(len) == 0] = np.nan
report_sub_type.loc[report_sub_type.apply(len) == 0] = np.nan

In [None]:
## Creates one dataframe, indexed by conversation_id, with the report type, sub-type
## and the finalized/canceled flags. Outer joins keep every conversation that appears
## in any of the five series.

report_by_convo_df = report_type.to_frame().join(report_sub_type,how='outer',)\
                                            .join(any_finalized,how='outer',)\
                                            .join(any_canceled,how='outer') \
                                            .join(all_canceled,how='outer')

### Pre-processing average counselor reviews 

In [None]:
## Load CONVERSATION_PARTICIPATION.tsv again and lower-case the column names.

conversation_participation = pd.read_csv(raw_directory + "CONVERSATION_PARTICIPATION.tsv", sep="\t")
conversation_participation.columns = conversation_participation.columns.str.lower()

In [None]:
## Creates a column with the duration of each participation row (ended_on - created_on).
## A conversation can have several participation rows, hence the next step.
conversation_participation["total_time"] = pd.to_datetime(conversation_participation.ended_on) - pd.to_datetime(conversation_participation.created_on)

In [None]:
## Picks one counselor per conversation: the only counselor if there is just one
## participation row, otherwise the counselor whose row has the longest total_time.

counselor_interactions = conversation_participation[conversation_participation.interaction == "counselor"]
counselor_by_convo = {}
for c_id, group in tqdm(counselor_interactions.groupby("conversation_id")):
    if group.shape[0] == 1:
        m = group.actor_id.values[0]
    else:
        # argmax is positional, so it can index the values array directly
        m = group.actor_id.values[group.total_time.argmax()]
    counselor_by_convo[c_id] = m

In [None]:
## Turns the conversation -> counselor mapping into a named Series and saves it.

counselor_by_convo = pd.Series(counselor_by_convo,name="counselor_actor_id")
counselor_by_convo.index.name = "conversation_id"
counselor_by_convo.to_pickle(save_directory+"counselor_by_convo.pickle")

In [None]:
counselor_by_convo = pd.read_pickle(save_directory+"counselor_by_convo.pickle")

In [None]:
## getting helpfulness ratings

texter_survey_collated = pd.read_pickle(save_directory+"texter_survey_collated.pickle")

In [None]:
## Build the helpfulness labels from the texter survey.

# Column '64' ("Yes"/"No") is mapped to 1/0, indexed by conversation.
helpful1 = texter_survey_collated[['conversation_id', '64']].set_index('conversation_id')
helpful1['64'] = helpful1['64'].replace({
    "No": 0.0,
    "Yes": 1.0
}).dropna().astype('int')

# Column '65' is a 1-5 rating; the two labelled end points are reduced to their digit.
how_helpful = texter_survey_collated[['conversation_id', '65']].set_index('conversation_id').dropna()
how_helpful['65'] = how_helpful['65'].str.replace("5 (very helpful)", "5", regex=False)
how_helpful['65'] = how_helpful['65'].str.replace("1 (slightly helpful)", "1", regex=False)

# One 0/1 column per threshold: column '65>k' is 1 exactly when the rating exceeds k.
# This matches the former row-by-row if/elif chain, but it is vectorized and also
# works when a conversation_id occurs more than once in the index.
rating = how_helpful['65'].astype(int)
threshold_cols = ['65>1 (slightly helpful)', '65>2', '65>3', '65>4']
for threshold, col in enumerate(threshold_cols, start=1):
    how_helpful[col] = (rating > threshold).astype(int)

In [None]:
## Joins the counselor of each conversation with the helpfulness ratings.

# conversations that have both a counselor and a '64' answer
helpful_by_counselor = counselor_by_convo.to_frame().merge(helpful1, right_index=True, left_index=True).dropna()
# add the '65>k' columns where available (left merge keeps conversations without them)
helpful_by_counselor = helpful_by_counselor.merge(how_helpful.dropna().drop(columns='65'), right_index=True, left_index=True, how='left')

# Right join back onto counselor_by_convo so that every conversation with a counselor
# gets a row, rated or not. The left copy of counselor_actor_id is suffixed and
# dropped, which leaves counselor_actor_id as the last column.
helpful_by_counselor_all = helpful_by_counselor.join(counselor_by_convo,how='right',lsuffix='blah').drop('counselor_actor_idblah',axis=1)


In [None]:
# State for the running counselor statistics computed in the next cell.
ratings = defaultdict(list)                 # counselor -> flat list of all rating values seen so far
rating_moving_average = dict()              # counselor -> moving average of the ratings
counselor_scores = dict()                   # counselor -> latest concatenated score vector
counselor_num_convos = defaultdict(lambda:0)  # counselor -> number of conversations seen so far
gamma = 0.9                                 # weight of the previous moving average (TODO: justify the value)
data = dict()                               # conversation -> counselor score vector at that point
convo_counselor_num_convos = dict()         # conversation -> counselor's conversation count at that point

In [None]:
# Walks through the conversations in row order and keeps running statistics of each
# counselor's ratings; every conversation is tagged with its counselor's scores so far.
for i in tqdm(helpful_by_counselor_all.itertuples(), total=helpful_by_counselor_all.shape[0]):
    # i[0] is the index (conversation id), i[-1] the last column (counselor) and
    # i[1:-1] the rating columns in between, i.e. 64, 65>1, 65>2, etc.
    conversation, counselor, current_rating = i[0], i[-1], i[1:-1] 
    
    # number of conversations of this counselor seen so far (rated or not)
    counselor_num_convos[counselor] += 1
    
    # how many conversations the counselor had handled up to and including this one
    convo_counselor_num_convos[conversation] = counselor_num_convos[counselor]
    
    # only conversations with a rating (first rating column present) update the statistics
    if pd.notna(current_rating[0]):
        
        # `+=` extends the flat list with the values of the rating tuple;
        # reshape(-1, 5) turns the flat list back into one row of 5 values per rated conversation
        ratings[counselor] += current_rating
        counselor_ratings = np.array(ratings[counselor]).reshape(-1, 5)
        
        # scores are only computed once the counselor has at least 5 rated conversations
        # TODO: document why gamma is set to 0.9
        if counselor_ratings.shape[0] >= 5:
            current_rating = np.array(current_rating)

            ## column-wise mean over all rated conversations so far
            all_time_mean = counselor_ratings.mean(0)

            ## column-wise mean over the most recent rows
            ## NOTE(review): the slice takes the last 10 rows although the variable and
            ## the "last5" column prefix say five — confirm which is intended
            last_five_mean = counselor_ratings[-10:].mean(0)

            ## moving average: starts at the all-time mean, then is updated as
            ## gamma * previous + (1 - gamma) * current rating
            if counselor in rating_moving_average:
                rating_moving_average[counselor] = (rating_moving_average[counselor] * gamma) + (current_rating * (1-gamma))
            else:
                rating_moving_average[counselor] = all_time_mean

            ## stores the three metrics (all-time mean, recent mean, moving average) back to back
            counselor_scores[counselor] = np.concatenate([all_time_mean,last_five_mean, rating_moving_average[counselor]])

    ## tags the conversation with the counselor's current scores, if any exist yet
    ## doesn't update earlier scores but that's okay because the counselor was less 'skilled' then
    if counselor in counselor_scores:
        data[conversation] = counselor_scores[counselor]

In [None]:
## Column names for the counselor score vector: the five rating columns, once for each
## of the three metrics (all-time mean, recent mean, moving average), in that order.
## Every prefix is applied to the *base* names. Prefixing in place and then re-reading
## the first five entries produced names such as "last5_avg_all_time_avg_64_Yes".
## NOTE(review): downstream artifacts saved with the old doubled names need regenerating.

base_cols = ['64_Yes', '65>1 (slightly helpful)', '65>2', '65>3', '65>4']
cols = [
    prefix + name
    for prefix in ("all_time_avg_", "last5_avg_", "moving_avg_")
    for name in base_cols
]

In [None]:
## `data` maps each conversation to its score vector, so DataFrame(data, index=cols)
## has one *column* per conversation; the transpose gives one row per conversation.
counselor_skill = pd.DataFrame(data,index=cols).T
# the outer join also keeps conversations that have a count but no scores yet
counselor_skill = counselor_skill.join(pd.Series(convo_counselor_num_convos,name='counselor_num_convos'),how='outer')

In [None]:
## Replaces every column of counselor_skill by its quantile-transformed values using
## scikit-learn's QuantileTransformer with default settings.

quantile_transformer = preprocessing.QuantileTransformer()

a = quantile_transformer.fit_transform(counselor_skill.values)
m_perc = pd.DataFrame(a,columns = counselor_skill.columns, index=counselor_skill.index)

# the fitted transformer is saved together with the column order it was fitted on
# NOTE: this path is hard-coded to "saved/" instead of using save_directory
torch.save((quantile_transformer,counselor_skill.columns),"saved/quantile_transformer_skill.torch")

In [None]:
## saves the final result. 

m_perc.index.name = 'conversation_id'
m_perc.to_pickle(save_directory+"/counselor_skill.pickle")

### Label Processor

In [None]:
## importing data frames

counselor_survey_by_conversation = pd.read_pickle(save_directory+"/counselor_survey_by_conversation.pickle")
texter_survey_collated = pd.read_pickle(save_directory+"/texter_survey_collated.pickle")
avg_counselor_stats = pd.read_pickle(save_directory+"/counselor_skill.pickle")

In [None]:
## Sanity checks, then index the texter survey by conversation and drop the
## bookkeeping columns that are not used as labels.

# every row must have a conversation id, and no conversation may appear twice
assert texter_survey_collated.conversation_id.notna().all()
assert texter_survey_collated.conversation_id.nunique() == texter_survey_collated.shape[0]

texter_survey_collated.set_index('conversation_id', inplace=True)
texter_survey_collated.drop(['survey_id','id'],axis=1,inplace=True)
texter_survey_collated.drop(['actor_id','response_id','returning_responder','status','time_submitted','time_imported'],axis=1,inplace=True)

avg_counselor_stats.index.name = 'conversation_id'

In [None]:
## joining the tables

l_df = counselor_survey_by_conversation.join(
    texter_survey_collated, 
    how="outer", 
)
l_df = l_df.join(
    avg_counselor_stats,
    how="outer"
)

In [None]:
## import messages
df2 = pd.read_pickle(save_directory+"/selected_messages.pickle")

In [None]:
## Extracts the texter's age from statements such as "I'm 17" in the texter's messages.
# all texter messages of a conversation joined into one string
texter_messages = df2[df2.interaction == "texter"].groupby('conversation_id').message.agg(" ".join)
# "I'm"/"Im" followed by a number that is not followed by a percent sign
search = '\\bI\\\'?m (\d+)(?! ?%)\\b'
l = texter_messages[texter_messages.str.contains(search)].str.findall(search)
# keep only the conversations with exactly one such statement
l = l[l.apply(len) == 1].apply(lambda a:a[0]).astype('int')
# keep only ages between 10 and 70 (inclusive)
l = l[(l>=10) & (l<=70)]
l.name = "age_re"
l_df = l_df.join(l,how="outer")

### Pre-processing Active Rescues

In [None]:
reporting_log = pd.read_csv(raw_directory+"/REPORTING_LOG.tsv", sep="\t")
reporting_log.columns=reporting_log.columns.str.lower()

In [None]:
# Loads the reporting log (repeating the load of the previous cell), lower-cases the
# column names and replaces the empty cells with NA.
reporting_log = pd.read_csv(raw_directory+"/REPORTING_LOG.tsv", sep="\t")
reporting_log.columns=reporting_log.columns.str.lower()
for col in reporting_log:
    reporting_log[col] = reporting_log[col].replace("",np.nan)
    
## list of all report types per conversation
report_type = reporting_log.groupby('conversation_id').type.agg(list)
report_type.name = "report_type"

## list of all report sub-types per conversation
report_sub_type = reporting_log.groupby('conversation_id').sub_type.agg(list)
report_sub_type.name = "report_sub_type"

## maximum of the finalized flag per conversation
any_finalized = reporting_log.groupby('conversation_id').finalized.max()
any_finalized.name = "any_finalized"

## maximum and minimum of the canceled flag per conversation
## NOTE(review): the any_/all_ names presume that the flags are 0/1 — confirm
any_canceled = reporting_log.groupby('conversation_id').canceled.max()
any_canceled.name = "any_canceled"
all_canceled = reporting_log.groupby('conversation_id').canceled.min()
all_canceled.name = "all_canceled"


# remove the NA entries from the lists
report_type=report_type.apply(lambda a: [i for i in a if not pd.isna(i)])
report_sub_type=report_sub_type.apply(lambda a: [i for i in a if not pd.isna(i)])

# lists that ended up empty are replaced with NA
report_type.loc[report_type.apply(len) == 0] = np.nan
report_sub_type.loc[report_sub_type.apply(len) == 0] = np.nan

# one table per conversation; outer joins keep every conversation present in any series
report_by_convo_df = report_type.to_frame().join(report_sub_type,how='outer',)\
                                            .join(any_finalized,how='outer',)\
                                            .join(any_canceled,how='outer') \
                                            .join(all_canceled,how='outer')

In [None]:
## joins l_df with the reporting log
## (outer join on the index, so conversations found only in the reporting log gain a row)

l_df = l_df.join(report_by_convo_df,how="outer")
## flag every conversation that appears in the reporting log with 1 ...
l_df.loc[report_by_convo_df.index, "report"] = 1
## ... and fill the rows left unset (NaN) with 0
l_df['report'] = l_df['report'].fillna(0)

In [None]:
## tentatively save l_df as pickle (checkpoint: a later cell reloads this file via pd.read_pickle)
l_df.to_pickle(save_directory+"/l_df.pickle")

In [None]:
## fetch the NLTK stopword corpus (needs network access unless it is already cached locally)
nltk.download('stopwords')

In [15]:
## resume from the saved checkpoints instead of recomputing the cells above
l_df = pd.read_pickle(save_directory+"/l_df.pickle")
avg_counselor_stats = pd.read_pickle(save_directory+"/counselor_skill.pickle")

## regex-derived ages, restricted to conversations that actually have one
l = l_df.age_re.dropna()

In [8]:
## attempt to imitate labelizer function
## Builds a LabelProcessorSimplified around l_df and runs its conversion steps in a fixed order.
## NOTE(review): the methods are defined in label_processor, not here; the meaning of the numeric
## arguments (1000, 200, 20) is not visible from this notebook -- TODO confirm before changing them.
labelizer = LabelProcessorSimplified(l_df,1000,200)
labelizer.remove_prefer_not_to_answer()
## free-response questions '26' and '27'
labelizer.convert_free_response('26',20)
labelizer.convert_free_response('27',20)
## question "75": one union per listed term, matched as a substring
for i in ["White", "English", "Asian", "British", "Mixed", "Black"]:
    labelizer.add_union("75", i, substring=True)
## regex-derived age, with the sorted distinct ages observed in `l` as the ordinal levels
labelizer.convert_ordinal_to_binary("age_re", sorted(l.unique()))

## counselor statistic columns, grouped under the super column "counselor_rank"
labelizer.convert_mse(avg_counselor_stats.columns.tolist(), super_col="counselor_rank")
labelizer.convert_all_checkboxs_to_binary(200)
labelizer.convert_all_ordinals_to_binary()
labelizer.convert_remaining_to_categorical_binary(200)

convert_checkbox_to_binary:   0%|          | 0/32 [00:00<?, ?it/s]

convert_ordinal_to_binary:   0%|          | 0/44 [00:00<?, ?it/s]

convert_categorical_to_binary:   0%|          | 0/35 [00:00<?, ?it/s]

In [10]:
## if the answer to 64 is not yes, all answers for 65 should be set to 0
## NOTE(review): the mask is `'64_Yes' == 0`, so rows where '64_Yes' is NaN are NOT selected and
## keep whatever values their 65 columns already hold -- confirm that this is intended.
labelizer.df.loc[labelizer.df['64_Yes']==0,["65>1 (slightly helpful)","65>2","65>3","65>4"]] = 0

In [11]:
## more labelizer functions
## NOTE(review): all four are LabelProcessorSimplified methods defined in label_processor; their
## effects and any ordering requirement are not visible here -- keep the call order as is.
labelizer.get_binary_softmax_indxs()
labelizer.get_label_weights()
labelizer.remove_analyzers()
labelizer.drop_empty_rows()

In [None]:
## casts the datatypes to floats. Not casting the IDs to uint because they're alphanumeric, not numeric (i.e. the unhashed ver.)
## Every column of labelizer.df is converted to float16, one column at a time, so the existing
## frame object is updated rather than replaced. The dtypes themselves are not needed for that,
## so the loop no longer zips over labelizer.df.dtypes.
## NOTE(review): float16 represents integers exactly only up to 2048 and overflows above 65504;
## fine for 0/1 labels, but confirm that no column holds larger values.
for column in labelizer.df.columns:
    labelizer.df[column] = labelizer.df[column].astype('float16')

In [6]:
## recoding suicide risk because stages are spread out in multiple permutation columns instead of 4 binary columns
## Each '19_[' column is filed under exactly one stage: the first keyword it contains, checked in
## the order timeframe, capability, intent, desire. Columns containing none of them are left out.
suicide_risk = [column for column in l_df.columns if '19_[' in column]

stage_columns = {'timeframe': [], 'capability': [], 'intent': [], 'desire': []}
for column in suicide_risk:
    # dicts keep insertion order, so this is the same precedence as listed above
    first_stage = next((stage for stage in stage_columns if stage in column), None)
    if first_stage is not None:
        stage_columns[first_stage].append(column)

timeframe = stage_columns['timeframe']
capability = stage_columns['capability']
intent = stage_columns['intent']
desire = stage_columns['desire']

In [30]:
## collapse the '19_[' permutation columns into four binary stage flags
## A flag is 1.0 when at least one permutation column whose name contains the stage keyword equals
## 1.0 for that row, and 0.0 otherwise (missing values count as "not set").
## Computed column-wise instead of with a per-cell .loc loop over every row, which gives the same
## flags but avoids millions of scalar lookups.
for stage in ('desire', 'intent', 'capability', 'timeframe'):
    stage_cols = [column for column in suicide_risk if stage in column]
    l_df[f'19_{stage}_x'] = l_df[stage_cols].eq(1.0).any(axis=1).astype(float)


In [115]:
## recoding topics because of similar issue as suicide risk
## A '18_[' column is listed under every topic keyword its name contains, so one column can
## appear in several of the lists below.
topics = [column for column in l_df.columns if '18_[' in column]
substance = [column for column in topics if 'substance' in column]
depressed = [column for column in topics if 'depressed' in column]
self_harm = [column for column in topics if 'self_harm' in column]
suicide = [column for column in topics if 'suicide' in column]

In [122]:
## collapse the '18_[' permutation columns into four binary topic flags
## A flag is 1.0 when at least one permutation column whose name contains the topic keyword equals
## 1.0 for that row, and 0.0 otherwise (missing values count as "not set").
## Computed column-wise instead of with a per-cell .loc loop over every row, which gives the same
## flags but avoids millions of scalar lookups.
for topic in ('substance', 'depressed', 'self_harm', 'suicide'):
    topic_cols = [column for column in topics if topic in column]
    l_df[f'18_{topic}'] = l_df[topic_cols].eq(1.0).any(axis=1).astype(float)

  0%|          | 0/526256 [00:00<?, ?it/s]

In [15]:
## combine every column whose name contains 'anxiety' into one flag: the row-wise maximum
anxiety = [column for column in l_df.columns if 'anxiety' in column]

# DataFrame.max skips missing values, so a row is NaN only when all of its anxiety columns are
# NaN. The builtin max() used before gave an order-dependent answer on rows mixing NaN with
# numbers (NaN whenever the first column was NaN), and needed a Python-level loop over every row.
l_df['18_anxiety'] = l_df[anxiety].max(axis=1).astype(float)

  0%|          | 0/526256 [00:00<?, ?it/s]

In [16]:
## sanity check: distribution of the combined anxiety flag (value_counts leaves NaN rows out)
l_df['18_anxiety'].value_counts()

0.0    409712
1.0     94979
Name: 18_anxiety, dtype: int64

In [109]:
## combining two columns for 24 or younger metric
## The assignments run in order, so a later rule overrides an earlier one for the same row.
## NOTE(review): how the '69>...' indicators map onto age bands is not visible here; the rules
## below are kept exactly as written -- confirm that they produce the intended "24 or younger" flag.

l_df.loc[:, '69>24 or younger'] = np.nan
l_df.loc[l_df['69>25-44'] == 0.0, '69>24 or younger'] = 1.0
l_df.loc[l_df['69>25-44'] == 1.0, '69>24 or younger'] = 0.0
l_df.loc[l_df['69>13 or younger'] == 0.0, '69>24 or younger'] = 0.0
# Rows with no '69>25-44' answer must stay missing. NaN never compares equal to anything
# (including itself), so the previous `== np.nan` mask selected no rows; isna() does.
l_df.loc[l_df['69>25-44'].isna(), '69>24 or younger'] = np.nan

In [123]:
## hand the recoded frame back to the labelizer so that the objects saved below carry the new columns
labelizer.df = l_df

In [17]:
## serialise the whole labelizer object with torch.save (a later cell reloads it with torch.load)
torch.save(labelizer,save_directory+'/labelizer.torch')

In [18]:
## second copy of the same labelizer object, written as a plain pickle file
with open(save_directory+'/labelizer.pickle', 'wb') as f:
    pickle.dump(labelizer, f)

In [3]:
## restore the labelizer from the torch file and re-expose its frame as l_df
## NOTE(review): torch.load unpickles arbitrary Python objects -- only load files produced by this
## notebook. Newer PyTorch releases presumably default to weights_only=True, which would reject a
## custom object like this one; confirm against the installed version.
labelizer = torch.load(save_directory+"/labelizer.torch")
l_df = labelizer.df

In [137]:
## compare each combined topic flag with its single-topic source column
## (the combined flag is printed first, then the bracketed column, each followed by a blank line)
for topic in ('substance', 'depressed', 'suicide', 'self_harm'):
    combined_counts = l_df[f'18_{topic}'].value_counts()
    single_counts = l_df[f'18_[{topic}]'].value_counts()
    print(combined_counts, "\n")
    print(single_counts, "\n")

0.0    523237
1.0      3019
Name: 18_substance, dtype: int64 

0.0    503393
1.0      1298
Name: 18_[substance], dtype: int64 

0.0    430115
1.0     96141
Name: 18_depressed, dtype: int64 

0.0    485759
1.0     18932
Name: 18_[depressed], dtype: int64 

0.0    421764
1.0    104492
Name: 18_suicide, dtype: int64 

0.0    465631
1.0     39060
Name: 18_[suicide], dtype: int64 

0.0    490239
1.0     36017
Name: 18_self_harm, dtype: int64 

0.0    494262
1.0     10429
Name: 18_[self_harm], dtype: int64 



In [153]:
## distribution of the '69>25-44' indicator (value_counts leaves NaN rows out)
l_df['69>25-44'].value_counts()

0.0    160091
1.0     10511
Name: 69>25-44, dtype: int64

In [154]:
## distribution of the '69>13 or younger' indicator (value_counts leaves NaN rows out)
l_df['69>13 or younger'].value_counts()

1.0    156995
0.0     13607
Name: 69>13 or younger, dtype: int64