# Preprocessing data

- This notebook preprocesses data that is used for a named entity recognition (NER) task. The data has been lemmatized beforehand.
- Preprocessing includes 
    1. collecting wordlists from Emolex and modifying them \
        a) with simple preprocessing function (process_wordlist), \
        b) by hand (removing irrelevant words (e.g. English words) and adding words that support the lemmatizer's properties (e.g. for the word "annoyance" there are multiple forms (ärtyneisyys, ärtymys))),
    2. NER tagging data, and
    3. dividing data into different subsets.
- Wordlists are based on Emolex (the following research): \
    Mohammad, S. M., & Turney, P. D. (2013). Crowdsourcing a word–emotion association lexicon. Computational intelligence, 29(3), 436-465. [x](https://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm)
- This notebook uses the older version of the Finnish Emolex.
- All in all, there are nine classes (nine different NER tags):
    1. O = other,
    2. s_ang = anger (the `s_` prefix stands for sentiment),
    3. s_ant = anticipation, 
    4. s_d = disgust, 
    5. s_f = fear, 
    6. s_j = joy, 
    7. s_sa = sad, 
    8. s_su = surprise, 
    9. s_t = trust.

## 1. Import libraries

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

## 2. Load dataset that has been lemmatized

In [None]:
# Load the lemmatized dataset. It has six columns:
# id, lause (sentence), len, lause2 (sentence2), lemmat (lemmatized), vuosi (year).
df = pd.read_csv('./nerdata.csv')
df

In [None]:
# The year column ("vuosi") contains NaN values; show the affected rows
# before they are filled in.
df[df.vuosi.isna()]

In [None]:
# Replace every missing year with 2019, then count the remaining NaN values
# in the column as a check.
df['vuosi']=df['vuosi'].fillna(2019.0)
df.vuosi.isna().sum()

In [None]:
# Store the year as a 64-bit integer (this cast requires that no NaN values remain).
df["vuosi"] = df["vuosi"].astype(np.int64)
df

In [None]:
# Count the missing values in every column.
df.isna().sum()

In [None]:
# df.to_csv('nerdata.csv',index=False)

## 3. Modifying wordlists

In [None]:
def process_wordlist(f, min_score=0.5):
    """Load an emotion wordlist CSV and return its cleaned list of words.

    The CSV must contain the columns ``word2`` (the word) and
    ``emotion-intensity-score``. Rows are removed in this order:

    1. rows whose ``word2`` is the literal string "NO TRANSLATION",
    2. repeated ``word2`` values (the first occurrence is kept),
    3. rows whose score is not strictly greater than ``min_score``.

    Because de-duplication runs before the score filter, a word is dropped
    when its first occurrence scores too low, even if a later duplicate
    would have passed.

    The original row count, the number of duplicated words and the final
    row count are printed for inspection.

    Parameters
    ----------
    f
        Path (or any other input accepted by ``pandas.read_csv``) of the
        wordlist file.
    min_score : float, default 0.5
        Exclusive lower bound for ``emotion-intensity-score``.

    Returns
    -------
    The ``word2`` values of the remaining rows, in file order.
    """
    words = pd.read_csv(f)
    print("orig length ", len(words))
    print('duplicates: ', words.word2.duplicated().sum())

    words = words[words.word2 != "NO TRANSLATION"]
    words = words.drop_duplicates(subset=['word2'])
    words = words[words['emotion-intensity-score'] > min_score]
    words = words.reset_index(drop=True)

    print("length after changes ", len(words))
    return words.word2.values

In [None]:
# Processed anger wordlist, sorted for manual inspection.
angy2 = np.sort(process_wordlist('df_anger.csv'))
angy2

In [None]:
# Processed anticipation wordlist, sorted for manual inspection.
anti = np.sort(process_wordlist('df_anticipation.csv'))
anti

In [None]:
# Processed disgust wordlist, sorted for manual inspection.
disg = np.sort(process_wordlist('df_disgust.csv'))
disg

In [None]:
# Processed fear wordlist, sorted for manual inspection.
fear = np.sort(process_wordlist('df_fear.csv'))
fear

In [None]:
# Processed joy wordlist, sorted for manual inspection.
joy = np.sort(process_wordlist('df_joy.csv'))
joy

In [None]:
# Processed sadness wordlist, sorted for manual inspection.
sad = np.sort(process_wordlist('df_sadness.csv'))
sad

In [None]:
# Processed surprise wordlist, sorted for manual inspection.
surp = np.sort(process_wordlist('df_surprise.csv'))
surp

In [None]:
# Processed trust wordlist, sorted for manual inspection.
trust = np.sort(process_wordlist('df_trust.csv'))
trust

## 4. Add NER tags with the help of wordlists

In [None]:
# NOTE: there have been manual changes between wordlist versions
# (e.g. df_anti and df_anti2). From each hand-edited file, the column named
# after the emotion is read.
anti = pd.read_csv('df_anti2.csv')['anticipation'].values
joy = pd.read_csv('df_joy2.csv')['joy'].values
trust = pd.read_csv('df_trust2.csv')['trust'].values
angy = pd.read_csv('df_anger2.csv')['anger'].values
surp = pd.read_csv('df_surprise2.csv')['surprise'].values
sad = pd.read_csv('df_sad2.csv')['sad'].values
fear = pd.read_csv('df_fear2.csv')['fear'].values
disg = pd.read_csv('df_disgust2.csv')['disgust'].values

# Print the size of every wordlist, one number per line.
print(*map(len, (anti, joy, trust, angy, surp, sad, fear, disg)), sep="\n")

In [None]:
# Reload the dataset from disk and print the Python type of the first
# "lemmat" entry as it comes out of the CSV.
df = pd.read_csv("nerdata.csv")
print(type(df.lemmat[0]))
df

In [None]:
# Turn every "lemmat" entry from its string form into the Python literal it
# spells out, then print the type of the first entry to confirm the change.
from ast import literal_eval
df["lemmat"] = df["lemmat"].apply(literal_eval)
print(type(df.lemmat[0]))
df

In [None]:
#function to add empty ner_tag arrays
def add_nertag_arrays(sent_array):
    """Return a new list holding one "O" tag per element of ``sent_array``."""
    return ["O" for _ in range(len(sent_array))]

In [None]:
# Initialise the "ner_tag" column: every row gets a list with one "O" tag per
# element of its "lemmat" value.
from tqdm import tqdm
# tqdm.pandas() makes progress_apply available so the apply shows a progress bar.
tqdm.pandas()
df["ner_tag"] = df["lemmat"].progress_apply(add_nertag_arrays)
df

In [None]:
#adding ner tags (comparing words from one sentiment word list with the lemmatized words in our dataset)
def add_ner_tags_for_group(x, sent_list, sent_name, df):
    """Tag the lemmas of one row that occur in a sentiment wordlist.

    Every position of ``df.lemmat[x]`` whose lemma is found in ``sent_list``
    gets ``sent_name`` written to the same position of ``df.ner_tag[x]``.
    The tag list stored in the DataFrame is modified in place; other
    positions are left untouched and nothing is returned.

    Parameters
    ----------
    x
        Index label of the row to tag.
    sent_list
        Words of one sentiment (array, list, set, ...). Lemmas are assumed
        to be hashable values such as strings.
    sent_name
        NER tag to write for matching lemmas.
    df
        DataFrame with a ``lemmat`` column (sequence of lemmas per row) and a
        ``ner_tag`` column (list of tags of the same length per row).
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole wordlist for every lemma; an existing set is reused as is.
    if isinstance(sent_list, (set, frozenset)):
        lookup = sent_list
    else:
        lookup = set(sent_list)

    # Fetch the row's cells once rather than on every comparison.
    lemmas = df.lemmat[x]
    tags = df.ner_tag[x]
    for position, lemma in enumerate(lemmas):
        if lemma in lookup:
            tags[position] = sent_name

In [None]:
# Sentiment wordlists and their NER tags as two parallel lists:
# sent_name_header[k] is the tag for the words in sent_list_header[k].
sent_list_header=[angy, anti, disg, fear, joy, sad, surp, trust]
sent_name_header=['s_ang','s_ant', 's_d', 's_f', 's_j', 's_sa', 's_su', 's_t']

In [None]:
# Tag the dataset one sentiment wordlist at a time: every row is compared
# against the current wordlist and matching lemmas receive that list's tag.
# The lists are applied in order, so a lemma found in several wordlists ends
# up with the tag of the last matching list.
# A plain loop is used because add_ner_tags_for_group works purely by side
# effect; there is no result worth collecting.
for sent_list, sent_name in tqdm(zip(sent_list_header, sent_name_header),
                                 total=len(sent_list_header)):
    for x in range(len(df)):
        add_ner_tags_for_group(x, sent_list, sent_name, df)

In [None]:
# df.to_csv('ner_data_tagged.csv',index=False)

## 5. Divide data into train, test and valid sets

In [None]:
#using kfolds or stratified kfolds to create folds for the data 

from sklearn import datasets
from sklearn import model_selection

def create_strat_kfolds(data, num_splits, random_seed):
    """Assign every row of ``data`` to one of ``num_splits`` stratified folds.

    A ``kfold`` column is added to (or overwritten on) ``data`` in place.
    The split is shuffled with ``random_seed`` and stratified on the
    ``label`` column; the ``sent`` column is passed as the feature input.

    Parameters
    ----------
    data
        DataFrame with ``sent`` and ``label`` columns.
    num_splits
        Number of folds.
    random_seed
        Seed for the shuffling, for reproducible folds.

    Returns
    -------
    The same DataFrame object, with ``kfold`` holding the fold number of
    each row.
    """
    data["kfold"] = -1
    kf = model_selection.StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=random_seed)
    # split() yields positional row indices, so the fold number is written
    # with .iloc; label-based .loc is only correct on a default 0..n-1 index.
    fold_col = data.columns.get_loc("kfold")
    for fold, (_, valid_pos) in enumerate(kf.split(X=data.sent, y=data.label)):
        data.iloc[valid_pos, fold_col] = fold
    return data

def create_kfolds(data, num_splits, random_seed):
    """Assign every row of ``data`` to one of ``num_splits`` shuffled folds.

    A ``kfold`` column is added to (or overwritten on) ``data`` in place.

    Parameters
    ----------
    data
        DataFrame to split.
    num_splits
        Number of folds.
    random_seed
        Seed for the shuffling, for reproducible folds.

    Returns
    -------
    The same DataFrame object, with ``kfold`` holding the fold number of
    each row.
    """
    data["kfold"] = -1
    kf = model_selection.KFold(n_splits=num_splits, shuffle=True, random_state=random_seed)
    # split() yields positional row indices, so the fold number is written
    # with .iloc; label-based .loc is only correct on a default 0..n-1 index.
    fold_col = data.columns.get_loc("kfold")
    for fold, (_, valid_pos) in enumerate(kf.split(X=data)):
        data.iloc[valid_pos, fold_col] = fold
    return data

In [None]:
# First split: assign every row to one of 10 shuffled folds with a fixed seed
# (plain k-fold, not the stratified variant). Fold 0 is taken as the
# validation set in the next step.
df = create_kfolds(df, num_splits=10, random_seed=52467)
df

In [None]:
# Fold 0 becomes the validation set; all other folds stay in the train/test
# pool. Both frames get a fresh 0..n-1 index.
valid = df[df.kfold == 0].reset_index(drop=True)
traintest = df[df.kfold != 0].reset_index(drop=True)
traintest

In [None]:
# Display the validation set.
valid

In [None]:
# Second split: re-fold the remaining rows into 10 folds with the same seed.
# This overwrites the "kfold" column left over from the first split.
traintest = create_kfolds(traintest, num_splits=10, random_seed=52467)
traintest

In [None]:
# Folds 2 and 6 of the second split form the test set; every other fold goes
# to the training set. Both frames get a fresh 0..n-1 index.
testset = traintest[traintest.kfold.isin([2, 6])].reset_index(drop=True)
trainset = traintest[~traintest.kfold.isin([2, 6])].reset_index(drop=True)
testset

In [None]:
# Display the training set.
trainset

In [None]:
# Write the three splits to CSV files in the working directory, without the
# DataFrame index.
trainset.to_csv('traindata.csv',index=False)
testset.to_csv('testdata.csv',index=False)
valid.to_csv('validdata.csv',index=False)