# Preprocessing data 

- Preprocessing data for sentiment analysis or text classification.
- Using "Tech in eldercare" data from JYU as an example.

## 1. Essentials (libraries, config, functions)

In [None]:
# import libraries
from transformers import AutoTokenizer
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.model_selection import train_test_split

In [None]:
# Notebook-wide configuration, read as class attributes (cfg is never instantiated here).
class cfg:
    # Hugging Face model identifier; check_token_length loads its tokenizer from this name.
    model_name = "TurkuNLP/bert-base-finnish-cased-v1"
    # Prefix for every CSV read or written below. File names are appended with
    # plain string concatenation, so the trailing slash is required.
    data_folder = "/path/to/data/"

In [None]:
def check_class_distribution(df, print_lengths=True):
    """Count how many rows of ``df`` carry each distinct label.

    Args:
        df: DataFrame with a ``label`` column.
        print_lengths: When true, print one line per label with its row count.

    Returns:
        List of row counts, ordered like ``df.label.unique()`` (order of first
        appearance). A missing label (NaN) is listed by ``unique()`` but never
        compares equal to anything, so its count is 0.
    """
    labels = df.label
    distinct_labels = labels.unique()

    # Rows matching a label == number of True entries in the equality mask.
    counts = [int((labels == value).sum()) for value in distinct_labels]

    if print_lengths:
        for value, n_rows in zip(distinct_labels, counts):
            print('For label {0} there is {1} data samples'.format(value, n_rows))

    return counts

def check_token_length(df, print_lengths=True):
    """Summarise the tokenized length of every entry in ``df["text"]``.

    The tokenizer named by ``cfg.model_name`` is loaded on each call, and every
    text is encoded with special tokens added, so the reported lengths include
    those special tokens.

    Args:
        df: DataFrame with a ``"text"`` column.
        print_lengths: When true, print min, mean, median and max (in that order).

    Returns:
        Tuple ``(min_len, median_len, mean_len, max_len)``. Note that the
        return order (median before mean) differs from the printed order.
    """
    tok = AutoTokenizer.from_pretrained(cfg.model_name)

    # One entry per row: the number of token ids the text encodes to.
    token_counts = [
        len(tok.encode(text, add_special_tokens=True))
        for text in df["text"].values
    ]

    max_len, min_len = max(token_counts), min(token_counts)
    mean_len = np.mean(token_counts)
    median_len = np.median(token_counts)

    if print_lengths:
        report = (
            ('Min length: ', min_len),
            ('Mean length: ', mean_len),
            ('Median length: ', median_len),
            ('Max length: ', max_len),
        )
        for caption, value in report:
            print(caption, value)

    return min_len, median_len, mean_len, max_len

def create_kfolds(data, num_splits, random_seed):
    """Assign every row of ``data`` to one of ``num_splits`` shuffled folds.

    A ``"kfold"`` column is added to ``data`` in place (the same object is
    also returned). Each row receives the number of the fold in which it is
    part of the held-out split.

    Args:
        data: DataFrame to annotate; it is modified in place.
        num_splits: Number of folds passed to ``KFold`` as ``n_splits``.
        random_seed: Seed passed to ``KFold`` as ``random_state``.

    Returns:
        ``data`` itself, now carrying the integer ``"kfold"`` column.
    """
    data["kfold"] = -1
    kf = model_selection.KFold(n_splits=num_splits, shuffle=True, random_state=random_seed)

    # KFold.split yields *positional* row indices, so they must be written
    # through .iloc. Label-based .loc only hits the right rows when the index
    # happens to be a default 0..n-1 range (e.g. right after reset_index).
    fold_col = data.columns.get_loc("kfold")
    for fold, (_, held_out_positions) in enumerate(kf.split(X=data)):
        data.iloc[held_out_positions, fold_col] = fold
    return data

def check_dupl_nan(df):
    """Print duplicate and NaN diagnostics for ``df``; returns nothing.

    For each column, prints how many of its values repeat an earlier value in
    the same column. Afterwards prints the per-column NaN counts.

    Args:
        df: DataFrame to inspect.
    """
    for column in df.columns:
        n_duplicates = df[column].duplicated().sum()
        print( 'Duplicates in {0}: {1}'.format(column, n_duplicates))

    nan_counts = df.isna().sum()
    print('NaNs: ', nan_counts)

In [None]:
# Load the JYU data for the 1st open-ended question.
# The path is built by plain string concatenation, so cfg.data_folder must end
# with a path separator.
df1 = pd.read_csv(cfg.data_folder+'jyudata_q59.csv')
# Bare expression as the last statement of the cell: presumably kept so the
# notebook renders the DataFrame as the cell output.
df1

In [None]:
# Print min/mean/median/max token lengths of df1["text"]; the returned tuple is not stored.
check_token_length(df1)

In [None]:
# Load the JYU data for the 2nd open-ended question, keeping only the
# text, recnum and vuosi columns (all other columns are skipped).
df2 = pd.read_csv(cfg.data_folder+'jyudata_q62.csv')[['text','recnum','vuosi']]
df2

In [None]:
# Print min/mean/median/max token lengths of df2["text"]; the returned tuple is not stored.
check_token_length(df2)

In [None]:
# For future pseudolabeling: keep only the "text" column of df2 and save it
# as a CSV without the index column.
df_pseudolabel_this = df2[['text']]
df_pseudolabel_this.to_csv(cfg.data_folder+'for_pseudolabeling.csv',index=False)

## 2. Creating pretraining (MLM) datasets

In [None]:
# Stack both question datasets and keep only the column they are combined on ("text").
df_pretrain = pd.concat([df1, df2], ignore_index=True)[['text']]
print(f"Dataframe length: {len(df_pretrain)}")

# Report duplicates and NaNs, then drop repeated texts and renumber the rows.
# NaNs are only reported here; no dropna is applied.
check_dupl_nan(df_pretrain)
df_pretrain = df_pretrain.drop_duplicates(subset=['text']).reset_index(drop=True)
df_pretrain

In [None]:
# Split the pretraining texts into MLM training and validation sets.

# Tag every row with one of 5 shuffled folds (fixed seed).
df_pretrain = create_kfolds(df_pretrain, num_splits=5, random_seed=2022)

# Fold 3 becomes the validation set; all other folds form the training set.
in_val_fold = df_pretrain.kfold == 3
mlm_val = df_pretrain[in_val_fold].reset_index(drop=True)
mlm_train = df_pretrain[~in_val_fold].reset_index(drop=True)

# Dataset sizes: validation first, then training.
print(len(mlm_val))
print(len(mlm_train))

# Persist both splits; the "kfold" column is written out along with the text.
mlm_train.to_csv(cfg.data_folder+'mlm_train.csv',index=False)
mlm_val.to_csv(cfg.data_folder+'mlm_valid.csv',index=False)

## 3. Creating finetuning datasets

In [None]:
# The fine-tuning training and testing sets are both extracted from df1.
# Before splitting, report its per-column duplicate counts and NaN counts.
check_dupl_nan(df1)

In [None]:
# Remove rows whose text repeats an earlier row.
df1 = df1.drop_duplicates(subset=['text'])

# Hold out 20% of the rows for testing (fixed seed), then renumber both parts from 0.
splits = train_test_split(df1, test_size=0.2, random_state=2022)
training_data, testing_data = (part.reset_index(drop=True) for part in splits)

In [None]:
# Print the per-label row counts of each split, testing set first.
for title, frame in (
    ("Testing data distribution", testing_data),
    ("Training data distribution", training_data),
):
    print(title)
    _ = check_class_distribution(frame)

In [None]:
# Write the fine-tuning splits to CSV (testing set first), without the index column.
for csv_name, frame in (
    ('finetune_testset.csv', testing_data),
    ('finetune_trainset.csv', training_data),
):
    frame.to_csv(cfg.data_folder+csv_name,index=False)