In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Essay-level training data: tab-separated, first row holds the column names.
essay_empathy_raw = pd.read_csv(
    './dataset/WASSA23_essay_level_with_labels_train.tsv',
    sep='\t',
    header=0,
)
# Article table: comma-separated, first column is used as the row index.
article_raw = pd.read_csv(
    './dataset/articles_adobe_AMT.csv',
    header=0,
    index_col=0,
)

In [3]:
# essay_empathy_raw

In [4]:
# Preview the first rows of the article table (indexed by its first CSV column).
article_raw.head()

Unnamed: 0_level_0,text
article_id,Unnamed: 1_level_1
1,'Abhorrent' bottle attack on young Rangers fan...
2,'Afghan Girl' in iconic National Geographic ph...
3,'My whole family has been wiped out': Victims ...
4,'RHONY' STAR JULES WAINSTEIN Estranged Husband...
5,'Swam for their life': More survivors of Levia...


In [5]:
# essay_empathy_raw.columns

In [6]:
# Inspect the dtype pandas inferred for every column of the raw essay table.
essay_empathy_raw.dtypes

conversation_id                    int64
article_id                         int64
essay                             object
empathy                          float64
distress                         float64
speaker_id                         int64
gender                            object
education                         object
race                              object
age                               object
income                            object
personality_conscientiousness     object
personality_openess               object
personality_extraversion          object
personality_agreeableness         object
personality_stability             object
iri_perspective_taking            object
iri_personal_distress             object
iri_fantasy                       object
iri_empathatic_concern            object
speaker_number                     int64
split                             object
essay_id                           int64
idx                               object
emotion                           object
dtype: object

In [7]:
# Work on an independent (deep) copy so the raw frame stays untouched.
essay_empathy = essay_empathy_raw.copy(deep=True)

In [8]:
# Remove the speaker_id, split, essay_id and idx columns.
# The result is built from the untouched raw frame instead of dropping in
# place on essay_empathy, so re-running this cell does not raise a KeyError
# for columns that were already removed.
essay_empathy = essay_empathy_raw.drop(columns=['speaker_id','split','essay_id','idx'])

In [9]:
# essay_empathy.head()

In [10]:
# Column labels of the working frame, as a plain Python list.
all_columns = essay_empathy.columns.tolist()

In [11]:
# Columns that hold text and are kept as-is.
string_columns = ['essay', 'emotion']
# Every remaining column is converted to a numeric dtype later on.
numeric_columns = [
    column
    for column in all_columns
    if column not in string_columns
]

In [12]:
# Parse the non-text columns as numbers; values that cannot be parsed become NaN.
numeric_part = essay_empathy[numeric_columns].apply(pd.to_numeric, errors='coerce')
# Text columns stay unchanged and end up first in the joined frame.
text_part = essay_empathy.drop(columns=numeric_columns)
# Re-attach the numeric columns on the index and remove every row with a missing value.
essay_empathy = text_part.join(numeric_part).dropna(axis=0)

In [13]:
# Per-column check: True would mean the column still contains a missing value.
essay_empathy.isnull().any(axis=0)

essay                            False
emotion                          False
conversation_id                  False
article_id                       False
empathy                          False
distress                         False
gender                           False
education                        False
race                             False
age                              False
income                           False
personality_conscientiousness    False
personality_openess              False
personality_extraversion         False
personality_agreeableness        False
personality_stability            False
iri_perspective_taking           False
iri_personal_distress            False
iri_fantasy                      False
iri_empathatic_concern           False
speaker_number                   False
dtype: bool

In [14]:
# essay_empathy['emotion'].nunique()

In [15]:
# Encode the emotion labels as integer category codes.
emotion_categorical = pd.Categorical(essay_empathy['emotion'])
essay_empathy['emotion'] = emotion_categorical.codes

In [16]:
# Confirm the dtypes after numeric coercion and emotion encoding.
essay_empathy.dtypes

essay                             object
emotion                             int8
conversation_id                    int64
article_id                         int64
empathy                          float64
distress                         float64
gender                           float64
education                        float64
race                             float64
age                              float64
income                           float64
personality_conscientiousness    float64
personality_openess              float64
personality_extraversion         float64
personality_agreeableness        float64
personality_stability            float64
iri_perspective_taking           float64
iri_personal_distress            float64
iri_fantasy                      float64
iri_empathatic_concern           float64
speaker_number                     int64
dtype: object

In [17]:
# Replace newlines (if any) in each essay with a single space.
# Uses the vectorised string accessor instead of apply + lambda;
# regex=False makes '\n' a literal match, and missing values are passed
# through as NaN rather than raising an AttributeError.
essay_empathy['essay'] = essay_empathy['essay'].str.replace('\n', ' ', regex=False)

In [18]:
# Persist the cleaned table; the row index is written as the first CSV column.
essay_empathy.to_csv(
    "./essay_texts_all_preprocessed.csv",
    index=True,
)

# FastText embeddings

In [17]:
# essay_empathy.head()

In [18]:
import fasttext
import fasttext.util

# Fetch the pre-trained English vectors; if_exists='ignore' skips the download
# when the model file is already present in the working directory.
# download_model returns the local file name, so it is passed straight to
# load_model instead of repeating a hard-coded 'cc.en.300.bin' that could
# drift from the downloaded language.
fasttext_model_path = fasttext.util.download_model('en', if_exists='ignore')
ft = fasttext.load_model(fasttext_model_path)



In [19]:
# One sentence vector per essay, held as a single 'essay' column of arrays.
sentence_vectors = essay_empathy['essay'].apply(ft.get_sentence_vector)
essay_fasttext = pd.DataFrame(sentence_vectors)

In [20]:
# Spread each embedding vector over its own columns: fasttext_0, fasttext_1, ...
embedding_rows = essay_fasttext['essay'].tolist()
embedding_columns = pd.DataFrame(embedding_rows, index=essay_fasttext.index).add_prefix('fasttext_')
# Swap the array-valued 'essay' column for the expanded columns.
essay_fasttext = pd.concat(
    [essay_fasttext.drop(columns='essay'), embedding_columns],
    axis=1,
)

In [21]:
# essay_fasttext

In [22]:
# The essay text is no longer required from here on.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# re-runnable: a second execution is a no-op rather than a KeyError.
essay_empathy = essay_empathy.drop(columns='essay', errors='ignore')

In [23]:
# essay_empathy

In [24]:
# Place the tabular features and the embedding columns side by side, aligned on the row index.
feature_frames = [essay_empathy, essay_fasttext]
empathy_fasttext = pd.concat(feature_frames, axis='columns')

In [25]:
# Display the combined feature table (tabular columns followed by fasttext_* columns).
empathy_fasttext

Unnamed: 0,emotion,conversation_id,article_id,empathy,distress,gender,education,race,age,income,...,fasttext_290,fasttext_291,fasttext_292,fasttext_293,fasttext_294,fasttext_295,fasttext_296,fasttext_297,fasttext_298,fasttext_299
0,20,2,35,6.714286,6.714286,1.0,6.0,3.0,37.0,40000.0,...,0.031908,0.008618,-0.084222,0.009917,-0.015905,0.013034,0.009217,0.101176,-0.013040,0.002871
1,0,3,35,5.857143,6.000000,1.0,6.0,2.0,32.0,35000.0,...,0.036240,0.009713,-0.074727,-0.008554,-0.012608,0.013154,0.003391,0.072847,-0.003598,-0.001064
2,27,5,35,1.000000,1.428571,1.0,6.0,1.0,29.0,85000.0,...,0.029181,0.004013,-0.071510,-0.003602,-0.014522,0.005235,-0.002368,0.090139,-0.004795,0.012187
3,27,6,213,6.000000,6.857143,2.0,5.0,1.0,28.0,50000.0,...,0.033555,0.014324,-0.072501,0.023933,-0.021073,0.010637,-0.005696,0.088332,-0.022485,0.023060
4,24,8,213,6.000000,1.000000,1.0,6.0,3.0,37.0,40000.0,...,0.028484,-0.005003,-0.073540,0.002448,-0.019296,0.008079,0.011888,0.088122,-0.007650,-0.003173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
787,24,495,218,6.571429,6.714286,1.0,6.0,3.0,37.0,40000.0,...,0.016001,-0.012279,-0.068207,0.004699,-0.006312,-0.000172,0.004574,0.092242,-0.002821,0.023676
788,24,496,103,3.571429,6.428571,2.0,5.0,1.0,28.0,50000.0,...,0.044965,0.000671,-0.071815,0.007341,-0.009898,0.017540,0.007518,0.066064,-0.006650,0.003420
789,24,498,103,5.428571,2.000000,2.0,6.0,1.0,33.0,110000.0,...,0.016542,0.000684,-0.024153,0.012848,-0.005384,-0.005280,-0.023967,0.063515,0.011698,-0.004720
790,14,499,103,6.000000,6.000000,2.0,3.0,1.0,27.0,25000.0,...,0.038192,-0.000660,-0.081046,-0.002569,-0.011361,0.006441,0.008258,0.089426,-0.011913,0.003074


In [26]:
# Persist the combined feature table; the row index is written as the first CSV column.
empathy_fasttext.to_csv(
    './empathy_fasttext.csv',
    index=True,
)