# Import packages and data

In [1]:
!pip install fasttext
import csv

import fasttext
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score



You should consider upgrading via the 'c:\users\ligren\anaconda3\python.exe -m pip install --upgrade pip' command.


In [2]:
# Load the dataset: a JSON-lines file with one record per line
intersection_df = pd.read_json(
    'intersection_df.json',
    orient="records",
    lines=True,
)

In [3]:
# Preview the first rows to check which columns were loaded
intersection_df.head()

Unnamed: 0,text,label,label_new,binary,label_binary,label_new_2
0,I'm sure most people probably figured this is ...,Catastrophizing,Catastrophizing,1,Distorted,Catastrophizing
1,I've started to realise my nausea is mostly ca...,Not Distorted,Not Distorted,0,Not Distorted,Not Distorted
2,Every single day that I get my work done witho...,Not Distorted,Not Distorted,0,Not Distorted,Not Distorted
3,For a long long time I have found it difficult...,Not Distorted,Not Distorted,0,Not Distorted,Not Distorted
4,I’m sorry. I know it hurts. I know the pain of...,Not Distorted,Not Distorted,0,Not Distorted,Not Distorted


## Binary - Distorted vs Not Distorted

In [5]:
# Convert the data into a format suitable for fastText
import copy

# Work on an independent copy and keep only the columns fastText needs
fasttext_df = copy.deepcopy(intersection_df)[['text', 'binary']]

# Turn the numeric flag into fastText label strings:
# 0 -> __label__Not_Distorted, anything else -> __label__Distorted
fasttext_df['binary'] = [
    '__label__Not_Distorted' if flag == 0 else '__label__Distorted'
    for flag in fasttext_df['binary']
]

# One "<label> <text>" string per row
fasttext_df['file_format'] = fasttext_df['binary'] + ' ' + fasttext_df['text']

In [6]:
# Model inputs are the texts, targets are the '__label__...' strings
X, y = fasttext_df['text'], fasttext_df['binary']

In [7]:
import statistics as stat
from statistics import mean
from sklearn.model_selection import StratifiedKFold

In [9]:
# Stratified 10-fold cross-validation of a fastText classifier on X / y.
# Each fold writes its training rows to a file, trains a fresh model on that
# file, predicts the held-out texts and records the weighted F1-score.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

# weighted F1-score of every fold
f1_results = list()

for counter, (train_ix, test_ix) in enumerate(cv.split(X, y)):

    # cv.split yields positional indices, so select rows with .iloc;
    # plain [] is label-based and only lines up on a default RangeIndex
    X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

    # write one "<text>\t<label>" line per training row;
    # QUOTE_NONE stops pandas from wrapping the texts in quote signs
    train = pd.concat([X_train, y_train], axis=1)
    train.to_csv('train_fasttext.txt', sep='\t', index=False, header=None, quoting=csv.QUOTE_NONE)

    # train the model on the file written for this fold
    model = fasttext.train_supervised(input='train_fasttext.txt', lr=0.25, epoch=25)

    # model.predict(text)[0] holds the predicted labels; keep the first one
    y_pred = X_test.apply(lambda x: model.predict(x)[0][0])

    # zero_division=0 scores a class that was never predicted as 0 (the value
    # already used before) without emitting a warning on every such fold
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    print('F1_score weighted of a fold nr_' + str(counter) + ' is ' + str(round(f1, 2)))
    f1_results.append(f1)
    print(classification_report(y_test, y_pred, zero_division=0))

print('Max F1_score across folds is ' + str(max(f1_results)) + ' and mean score is ' + str(mean(f1_results)))

F1_score weighted of a fold nr_0 is 0.62
                        precision    recall  f1-score   support

    __label__Distorted       0.40      0.29      0.33         7
__label__Not_Distorted       0.71      0.80      0.75        15

              accuracy                           0.64        22
             macro avg       0.55      0.54      0.54        22
          weighted avg       0.61      0.64      0.62        22

F1_score weighted of a fold nr_1 is 0.69
                        precision    recall  f1-score   support

    __label__Distorted       0.67      0.29      0.40         7
__label__Not_Distorted       0.74      0.93      0.82        15

              accuracy                           0.73        22
             macro avg       0.70      0.61      0.61        22
          weighted avg       0.71      0.73      0.69        22

F1_score weighted of a fold nr_2 is 0.59
                        precision    recall  f1-score   support

    __label__Distorted       0.33     

  _warn_prf(average, modifier, msg_start, len(result))


                        precision    recall  f1-score   support

    __label__Distorted       0.00      0.00      0.00         6
__label__Not_Distorted       0.71      1.00      0.83        15

              accuracy                           0.71        21
             macro avg       0.36      0.50      0.42        21
          weighted avg       0.51      0.71      0.60        21

F1_score weighted of a fold nr_9 is 0.69
                        precision    recall  f1-score   support

    __label__Distorted       1.00      0.17      0.29         6
__label__Not_Distorted       0.75      1.00      0.86        15

              accuracy                           0.76        21
             macro avg       0.88      0.58      0.57        21
          weighted avg       0.82      0.76      0.69        21

Max F1_score across folds is 0.7731092436974789 and mean score is 0.6382772978151129


## Binary - Catastrophizing vs Other

In [22]:
# Keep only the rows whose binary flag is non-zero and renumber them from 0
df_with_distortions = (
    intersection_df
    .loc[intersection_df['binary'] != 0]
    .reset_index(drop=True)
)

# Class balance of the remaining rows
df_with_distortions['label_new_2'].value_counts()

Catastrophizing     43
Other distortion    21
Name: label_new_2, dtype: int64

In [23]:
# Independent copy restricted to the text and the label column
df_cat_other = copy.deepcopy(df_with_distortions)[['text', 'label_new_2']]

# Rewrite each label as a fastText label: '__label__' prefix,
# spaces replaced by '-', lower-cased
df_cat_other['label_new_2'] = [
    '__label__' + name.replace(' ', '-').lower()
    for name in df_cat_other['label_new_2']
]
df_cat_other.head()

Unnamed: 0,text,label_new_2
0,I'm sure most people probably figured this is ...,__label__catastrophizing
1,"I know therapy has its ups and downs, but I fe...",__label__other-distortion
2,I'm currently having a panic attack because I'...,__label__catastrophizing
3,So I'll try to describe this the best I can. ...,__label__catastrophizing
4,I've been having this problem for a while. Ju...,__label__catastrophizing


In [25]:
# Model inputs are the texts, targets are the '__label__...' strings
X, y = df_cat_other['text'], df_cat_other['label_new_2']

In [26]:
# Stratified 10-fold cross-validation of a fastText classifier on X / y.
# Each fold writes its training rows to a file, trains a fresh model on that
# file, predicts the held-out texts and records the weighted F1-score.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

# weighted F1-score of every fold
f1_results = list()

for counter, (train_ix, test_ix) in enumerate(cv.split(X, y)):

    # cv.split yields positional indices, so select rows with .iloc;
    # plain [] is label-based and only lines up on a default RangeIndex
    X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

    # write one "<text>\t<label>" line per training row;
    # QUOTE_NONE stops pandas from wrapping the texts in quote signs
    train = pd.concat([X_train, y_train], axis=1)
    train.to_csv('train_fasttext.txt', sep='\t', index=False, header=None, quoting=csv.QUOTE_NONE)

    # train the model on the file written for this fold
    model = fasttext.train_supervised(input='train_fasttext.txt', lr=0.25, epoch=25)

    # model.predict(text)[0] holds the predicted labels; keep the first one
    y_pred = X_test.apply(lambda x: model.predict(x)[0][0])

    # zero_division=0 scores a class that was never predicted as 0 (the value
    # already used before) without emitting a warning on every such fold
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    print('F1_score weighted of a fold nr_' + str(counter) + ' is ' + str(round(f1, 2)))
    f1_results.append(f1)
    print(classification_report(y_test, y_pred, zero_division=0))

print('Max F1_score across folds is ' + str(max(f1_results)) + ' and mean score is ' + str(mean(f1_results)))

F1_score weighted of a fold nr_0 is 0.6
                           precision    recall  f1-score   support

 __label__catastrophizing       0.71      1.00      0.83         5
__label__other-distortion       0.00      0.00      0.00         2

                 accuracy                           0.71         7
                macro avg       0.36      0.50      0.42         7
             weighted avg       0.51      0.71      0.60         7

F1_score weighted of a fold nr_1 is 0.6


  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 __label__catastrophizing       0.71      1.00      0.83         5
__label__other-distortion       0.00      0.00      0.00         2

                 accuracy                           0.71         7
                macro avg       0.36      0.50      0.42         7
             weighted avg       0.51      0.71      0.60         7

F1_score weighted of a fold nr_2 is 0.6
                           precision    recall  f1-score   support

 __label__catastrophizing       0.71      1.00      0.83         5
__label__other-distortion       0.00      0.00      0.00         2

                 accuracy                           0.71         7
                macro avg       0.36      0.50      0.42         7
             weighted avg       0.51      0.71      0.60         7

F1_score weighted of a fold nr_3 is 0.42
                           precision    recall  f1-score   support

 __label__catastrophizing       0.57     

## Binary with pre-trained vectors - Distorted vs Not Distorted

In [29]:
#!unzip wiki-news-300d-1M-subword.vec.zip -d wiki-news-300d-1M-subword.vec

In [27]:
# Back to the binary task: texts as inputs, '__label__...' strings as targets
X, y = fasttext_df['text'], fasttext_df['binary']

In [29]:
# Stratified 10-fold cross-validation of a fastText classifier on X / y,
# initialised from pre-trained word vectors.
# Each fold writes its training rows to a file, trains a fresh model on that
# file, predicts the held-out texts and records the weighted F1-score.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

# weighted F1-score of every fold
f1_results = list()

for counter, (train_ix, test_ix) in enumerate(cv.split(X, y)):

    # cv.split yields positional indices, so select rows with .iloc;
    # plain [] is label-based and only lines up on a default RangeIndex
    X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

    # write one "<text>\t<label>" line per training row;
    # QUOTE_NONE stops pandas from wrapping the texts in quote signs
    train = pd.concat([X_train, y_train], axis=1)
    train.to_csv('train_fasttext.txt', sep='\t', index=False, header=None, quoting=csv.QUOTE_NONE)

    # train the model; dim=300 matches the 300-dimensional pre-trained vectors
    model = fasttext.train_supervised(
        input='train_fasttext.txt',
        lr=1.0,
        epoch=10,
        dim=300,
        pretrainedVectors='wiki-news-300d-1M-subword.vec',
    )

    # model.predict(text)[0] holds the predicted labels; keep the first one
    y_pred = X_test.apply(lambda x: model.predict(x)[0][0])

    # zero_division=0 scores a class that was never predicted as 0 (the value
    # already used before) without emitting a warning on every such fold
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    print('F1_score weighted of a fold nr_' + str(counter) + ' is ' + str(round(f1, 2)))
    f1_results.append(f1)
    print(classification_report(y_test, y_pred, zero_division=0))

print('Max F1_score across folds is ' + str(max(f1_results)) + ' and mean score is ' + str(mean(f1_results)))

F1_score weighted of a fold nr_0 is 0.73
                        precision    recall  f1-score   support

    __label__Distorted       0.57      0.57      0.57         7
__label__Not_Distorted       0.80      0.80      0.80        15

              accuracy                           0.73        22
             macro avg       0.69      0.69      0.69        22
          weighted avg       0.73      0.73      0.73        22

F1_score weighted of a fold nr_1 is 0.78
                        precision    recall  f1-score   support

    __label__Distorted       0.62      0.71      0.67         7
__label__Not_Distorted       0.86      0.80      0.83        15

              accuracy                           0.77        22
             macro avg       0.74      0.76      0.75        22
          weighted avg       0.78      0.77      0.78        22

F1_score weighted of a fold nr_2 is 0.51
                        precision    recall  f1-score   support

    __label__Distorted       0.25     

## Binary with pre-trained vectors - Catastrophizing vs Other

In [30]:
# Model inputs are the texts, targets are the '__label__...' strings
X, y = df_cat_other['text'], df_cat_other['label_new_2']

In [31]:
# Stratified 10-fold cross-validation of a fastText classifier on X / y,
# initialised from pre-trained word vectors.
# Each fold writes its training rows to a file, trains a fresh model on that
# file, predicts the held-out texts and records the weighted F1-score.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

# weighted F1-score of every fold
f1_results = list()

for counter, (train_ix, test_ix) in enumerate(cv.split(X, y)):

    # cv.split yields positional indices, so select rows with .iloc;
    # plain [] is label-based and only lines up on a default RangeIndex
    X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

    # write one "<text>\t<label>" line per training row;
    # QUOTE_NONE stops pandas from wrapping the texts in quote signs
    train = pd.concat([X_train, y_train], axis=1)
    train.to_csv('train_fasttext.txt', sep='\t', index=False, header=None, quoting=csv.QUOTE_NONE)

    # train the model; dim=300 matches the 300-dimensional pre-trained vectors
    model = fasttext.train_supervised(
        input='train_fasttext.txt',
        lr=1.0,
        epoch=10,
        dim=300,
        pretrainedVectors='wiki-news-300d-1M-subword.vec',
    )

    # model.predict(text)[0] holds the predicted labels; keep the first one
    y_pred = X_test.apply(lambda x: model.predict(x)[0][0])

    # zero_division=0 scores a class that was never predicted as 0 (the value
    # already used before) without emitting a warning on every such fold
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    print('F1_score weighted of a fold nr_' + str(counter) + ' is ' + str(round(f1, 2)))
    f1_results.append(f1)
    print(classification_report(y_test, y_pred, zero_division=0))

print('Max F1_score across folds is ' + str(max(f1_results)) + ' and mean score is ' + str(mean(f1_results)))

F1_score weighted of a fold nr_0 is 0.43
                           precision    recall  f1-score   support

 __label__catastrophizing       0.60      0.60      0.60         5
__label__other-distortion       0.00      0.00      0.00         2

                 accuracy                           0.43         7
                macro avg       0.30      0.30      0.30         7
             weighted avg       0.43      0.43      0.43         7

F1_score weighted of a fold nr_1 is 0.6
                           precision    recall  f1-score   support

 __label__catastrophizing       0.71      1.00      0.83         5
__label__other-distortion       0.00      0.00      0.00         2

                 accuracy                           0.71         7
                macro avg       0.36      0.50      0.42         7
             weighted avg       0.51      0.71      0.60         7



  _warn_prf(average, modifier, msg_start, len(result))


F1_score weighted of a fold nr_2 is 0.71
                           precision    recall  f1-score   support

 __label__catastrophizing       0.80      0.80      0.80         5
__label__other-distortion       0.50      0.50      0.50         2

                 accuracy                           0.71         7
                macro avg       0.65      0.65      0.65         7
             weighted avg       0.71      0.71      0.71         7

F1_score weighted of a fold nr_3 is 0.42
                           precision    recall  f1-score   support

 __label__catastrophizing       0.57      1.00      0.73         4
__label__other-distortion       0.00      0.00      0.00         3

                 accuracy                           0.57         7
                macro avg       0.29      0.50      0.36         7
             weighted avg       0.33      0.57      0.42         7

F1_score weighted of a fold nr_4 is 0.81
                           precision    recall  f1-score   support


## Multi-class - Catastrophizing vs Other vs Labeling

In [50]:
# Number of rows per class in the 'label_new' column
df_with_distortions['label_new'].value_counts()

Catastrophizing     43
Other distortion    12
Labeling             9
Name: label_new, dtype: int64

In [51]:
# Independent copy restricted to the text and the label column
df_cat_other = copy.deepcopy(df_with_distortions)[['text', 'label_new']]

# Rewrite each label as a fastText label: '__label__' prefix,
# spaces replaced by '-', lower-cased
df_cat_other['label_new'] = [
    '__label__' + name.replace(' ', '-').lower()
    for name in df_cat_other['label_new']
]
df_cat_other.head()

Unnamed: 0,text,label_new
0,I'm sure most people probably figured this is ...,__label__catastrophizing
1,"I know therapy has its ups and downs, but I fe...",__label__labeling
2,I'm currently having a panic attack because I'...,__label__catastrophizing
3,So I'll try to describe this the best I can. ...,__label__catastrophizing
4,I've been having this problem for a while. Ju...,__label__catastrophizing


In [52]:
# Multi-class inputs (texts) and targets ('__label__...' strings)
X_multi, y_multi = df_cat_other['text'], df_cat_other['label_new']

In [62]:
# Stratified 9-fold cross-validation of a fastText classifier on X_multi / y_multi.
# NOTE(review): n_splits=9 presumably matches the 9 'Labeling' rows, the
# smallest class in the value counts - confirm.
# Each fold writes its training rows to a file, trains a fresh model on that
# file, predicts the held-out texts and records the weighted F1-score.
cv = StratifiedKFold(n_splits=9, shuffle=True, random_state=1)

# weighted F1-score of every fold
f1_results = list()

for counter, (train_ix, test_ix) in enumerate(cv.split(X_multi, y_multi)):

    # cv.split yields positional indices, so select rows with .iloc;
    # plain [] is label-based and only lines up on a default RangeIndex
    X_train, X_test = X_multi.iloc[train_ix], X_multi.iloc[test_ix]
    y_train, y_test = y_multi.iloc[train_ix], y_multi.iloc[test_ix]

    # write one "<text>\t<label>" line per training row;
    # QUOTE_NONE stops pandas from wrapping the texts in quote signs
    train = pd.concat([X_train, y_train], axis=1)
    train.to_csv('train_fasttext.txt', sep='\t', index=False, header=None, quoting=csv.QUOTE_NONE)

    # train the model on the file written for this fold
    model = fasttext.train_supervised(input='train_fasttext.txt', lr=1.0, epoch=15)

    # model.predict(text)[0] holds the predicted labels; keep the first one
    y_pred = X_test.apply(lambda x: model.predict(x)[0][0])

    # zero_division=0 scores a class that was never predicted as 0 (the value
    # already used before) without emitting a warning on every such fold
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    print('F1_score weighted of a fold nr_' + str(counter) + ' is ' + str(round(f1, 2)))
    f1_results.append(f1)
    print(classification_report(y_test, y_pred, zero_division=0))

print('Max F1_score across folds is ' + str(max(f1_results)) + ' and mean score is ' + str(mean(f1_results)))

F1_score weighted of a fold nr_0 is 0.48
                           precision    recall  f1-score   support

 __label__catastrophizing       0.62      1.00      0.77         5
        __label__labeling       0.00      0.00      0.00         1
__label__other-distortion       0.00      0.00      0.00         2

                 accuracy                           0.62         8
                macro avg       0.21      0.33      0.26         8
             weighted avg       0.39      0.62      0.48         8

F1_score weighted of a fold nr_1 is 0.6


  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 __label__catastrophizing       0.71      1.00      0.83         5
        __label__labeling       0.00      0.00      0.00         1
__label__other-distortion       0.00      0.00      0.00         1

                 accuracy                           0.71         7
                macro avg       0.24      0.33      0.28         7
             weighted avg       0.51      0.71      0.60         7

F1_score weighted of a fold nr_2 is 0.6
                           precision    recall  f1-score   support

 __label__catastrophizing       0.71      1.00      0.83         5
        __label__labeling       0.00      0.00      0.00         1
__label__other-distortion       0.00      0.00      0.00         1

                 accuracy                           0.71         7
                macro avg       0.24      0.33      0.28         7
             weighted avg       0.51      0.71      0.60         7

F1_score weighte

## Multi-class - Catastrophizing vs Other vs Not Distorted

In [54]:
# Number of rows per class in the 'label_new_2' column of the full dataset
intersection_df['label_new_2'].value_counts()

Not Distorted       152
Catastrophizing      43
Other distortion     21
Name: label_new_2, dtype: int64

In [55]:
# Independent copy of the full dataset restricted to the text and the label column
df_cat_other = copy.deepcopy(intersection_df)[['text', 'label_new_2']]

# Rewrite each label as a fastText label: '__label__' prefix,
# spaces replaced by '-', lower-cased
df_cat_other['label_new_2'] = [
    '__label__' + name.replace(' ', '-').lower()
    for name in df_cat_other['label_new_2']
]
df_cat_other.head()

Unnamed: 0,text,label_new_2
0,I'm sure most people probably figured this is ...,__label__catastrophizing
1,I've started to realise my nausea is mostly ca...,__label__not-distorted
2,Every single day that I get my work done witho...,__label__not-distorted
3,For a long long time I have found it difficult...,__label__not-distorted
4,I’m sorry. I know it hurts. I know the pain of...,__label__not-distorted


In [56]:
# Multi-class inputs (texts) and targets ('__label__...' strings)
X_multi, y_multi = df_cat_other['text'], df_cat_other['label_new_2']

In [57]:
# Stratified 10-fold cross-validation of a fastText classifier on X_multi / y_multi.
# Each fold writes its training rows to a file, trains a fresh model on that
# file, predicts the held-out texts and records the weighted F1-score.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

# weighted F1-score of every fold
f1_results = list()

for counter, (train_ix, test_ix) in enumerate(cv.split(X_multi, y_multi)):

    # cv.split yields positional indices, so select rows with .iloc;
    # plain [] is label-based and only lines up on a default RangeIndex
    X_train, X_test = X_multi.iloc[train_ix], X_multi.iloc[test_ix]
    y_train, y_test = y_multi.iloc[train_ix], y_multi.iloc[test_ix]

    # write one "<text>\t<label>" line per training row;
    # QUOTE_NONE stops pandas from wrapping the texts in quote signs
    train = pd.concat([X_train, y_train], axis=1)
    train.to_csv('train_fasttext.txt', sep='\t', index=False, header=None, quoting=csv.QUOTE_NONE)

    # train the model on the file written for this fold
    model = fasttext.train_supervised(input='train_fasttext.txt', lr=1.0, epoch=15)

    # model.predict(text)[0] holds the predicted labels; keep the first one
    y_pred = X_test.apply(lambda x: model.predict(x)[0][0])

    # zero_division=0 scores a class that was never predicted as 0 (the value
    # already used before) without emitting a warning on every such fold
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    print('F1_score weighted of a fold nr_' + str(counter) + ' is ' + str(round(f1, 2)))
    f1_results.append(f1)
    print(classification_report(y_test, y_pred, zero_division=0))

print('Max F1_score across folds is ' + str(max(f1_results)) + ' and mean score is ' + str(mean(f1_results)))

F1_score weighted of a fold nr_0 is 0.74
                           precision    recall  f1-score   support

 __label__catastrophizing       0.57      0.80      0.67         5
   __label__not-distorted       0.87      0.87      0.87        15
__label__other-distortion       0.00      0.00      0.00         2

                 accuracy                           0.77        22
                macro avg       0.48      0.56      0.51        22
             weighted avg       0.72      0.77      0.74        22



  _warn_prf(average, modifier, msg_start, len(result))


F1_score weighted of a fold nr_1 is 0.61
                           precision    recall  f1-score   support

 __label__catastrophizing       0.50      0.20      0.29         5
   __label__not-distorted       0.70      0.93      0.80        15
__label__other-distortion       0.00      0.00      0.00         2

                 accuracy                           0.68        22
                macro avg       0.40      0.38      0.36        22
             weighted avg       0.59      0.68      0.61        22

F1_score weighted of a fold nr_2 is 0.62
                           precision    recall  f1-score   support

 __label__catastrophizing       0.33      0.20      0.25         5
   __label__not-distorted       0.74      0.93      0.82        15
__label__other-distortion       0.00      0.00      0.00         2

                 accuracy                           0.68        22
                macro avg       0.36      0.38      0.36        22
             weighted avg       0.58      

## Multi-class with pre-trained vectors - Catastrophizing vs Other vs Labeling

In [59]:
# Number of rows per class in the 'label_new' column
df_with_distortions['label_new'].value_counts()

Catastrophizing     43
Other distortion    12
Labeling             9
Name: label_new, dtype: int64

In [60]:
# Independent copy restricted to the text and the label column
df_cat_other = copy.deepcopy(df_with_distortions)[['text', 'label_new']]

# Rewrite each label as a fastText label: '__label__' prefix,
# spaces replaced by '-', lower-cased
df_cat_other['label_new'] = [
    '__label__' + name.replace(' ', '-').lower()
    for name in df_cat_other['label_new']
]
df_cat_other.head()

Unnamed: 0,text,label_new
0,I'm sure most people probably figured this is ...,__label__catastrophizing
1,"I know therapy has its ups and downs, but I fe...",__label__labeling
2,I'm currently having a panic attack because I'...,__label__catastrophizing
3,So I'll try to describe this the best I can. ...,__label__catastrophizing
4,I've been having this problem for a while. Ju...,__label__catastrophizing


In [61]:
# Multi-class inputs (texts) and targets ('__label__...' strings)
X_multi, y_multi = df_cat_other['text'], df_cat_other['label_new']

In [63]:
# Stratified 9-fold cross-validation of a fastText classifier on X_multi / y_multi,
# initialised from pre-trained word vectors.
# NOTE(review): n_splits=9 presumably matches the 9 'Labeling' rows, the
# smallest class in the value counts - confirm.
# Each fold writes its training rows to a file, trains a fresh model on that
# file, predicts the held-out texts and records the weighted F1-score.
cv = StratifiedKFold(n_splits=9, shuffle=True, random_state=1)

# weighted F1-score of every fold
f1_results = list()

for counter, (train_ix, test_ix) in enumerate(cv.split(X_multi, y_multi)):

    # cv.split yields positional indices, so select rows with .iloc;
    # plain [] is label-based and only lines up on a default RangeIndex
    X_train, X_test = X_multi.iloc[train_ix], X_multi.iloc[test_ix]
    y_train, y_test = y_multi.iloc[train_ix], y_multi.iloc[test_ix]

    # write one "<text>\t<label>" line per training row;
    # QUOTE_NONE stops pandas from wrapping the texts in quote signs
    train = pd.concat([X_train, y_train], axis=1)
    train.to_csv('train_fasttext.txt', sep='\t', index=False, header=None, quoting=csv.QUOTE_NONE)

    # train the model; dim=300 matches the 300-dimensional pre-trained vectors
    model = fasttext.train_supervised(
        input='train_fasttext.txt',
        lr=1.0,
        epoch=15,
        dim=300,
        pretrainedVectors='wiki-news-300d-1M-subword.vec',
    )

    # model.predict(text)[0] holds the predicted labels; keep the first one
    y_pred = X_test.apply(lambda x: model.predict(x)[0][0])

    # zero_division=0 scores a class that was never predicted as 0 (the value
    # already used before) without emitting a warning on every such fold
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    print('F1_score weighted of a fold nr_' + str(counter) + ' is ' + str(round(f1, 2)))
    f1_results.append(f1)
    print(classification_report(y_test, y_pred, zero_division=0))

print('Max F1_score across folds is ' + str(max(f1_results)) + ' and mean score is ' + str(mean(f1_results)))

F1_score weighted of a fold nr_0 is 0.48
                           precision    recall  f1-score   support

 __label__catastrophizing       0.62      1.00      0.77         5
        __label__labeling       0.00      0.00      0.00         1
__label__other-distortion       0.00      0.00      0.00         2

                 accuracy                           0.62         8
                macro avg       0.21      0.33      0.26         8
             weighted avg       0.39      0.62      0.48         8



  _warn_prf(average, modifier, msg_start, len(result))


F1_score weighted of a fold nr_1 is 0.65
                           precision    recall  f1-score   support

 __label__catastrophizing       0.83      1.00      0.91         5
        __label__labeling       0.00      0.00      0.00         1
__label__other-distortion       0.00      0.00      0.00         1

                 accuracy                           0.71         7
                macro avg       0.28      0.33      0.30         7
             weighted avg       0.60      0.71      0.65         7

F1_score weighted of a fold nr_2 is 0.65
                           precision    recall  f1-score   support

 __label__catastrophizing       0.83      1.00      0.91         5
        __label__labeling       0.00      0.00      0.00         1
__label__other-distortion       0.00      0.00      0.00         1

                 accuracy                           0.71         7
                macro avg       0.28      0.33      0.30         7
             weighted avg       0.60      

## Multi-class with pre-trained vectors - Catastrophizing vs Other vs Not Distorted

In [64]:
# Number of rows per class in the 'label_new_2' column of the full dataset
intersection_df['label_new_2'].value_counts()

Not Distorted       152
Catastrophizing      43
Other distortion     21
Name: label_new_2, dtype: int64

In [65]:
# Independent copy of the full dataset restricted to the text and the label column
df_cat_other = copy.deepcopy(intersection_df)[['text', 'label_new_2']]

# Rewrite each label as a fastText label: '__label__' prefix,
# spaces replaced by '-', lower-cased
df_cat_other['label_new_2'] = [
    '__label__' + name.replace(' ', '-').lower()
    for name in df_cat_other['label_new_2']
]
df_cat_other.head()

Unnamed: 0,text,label_new_2
0,I'm sure most people probably figured this is ...,__label__catastrophizing
1,I've started to realise my nausea is mostly ca...,__label__not-distorted
2,Every single day that I get my work done witho...,__label__not-distorted
3,For a long long time I have found it difficult...,__label__not-distorted
4,I’m sorry. I know it hurts. I know the pain of...,__label__not-distorted


In [66]:
# Multi-class inputs (texts) and targets ('__label__...' strings)
X_multi, y_multi = df_cat_other['text'], df_cat_other['label_new_2']

In [68]:
# Stratified 10-fold cross-validation of a fastText classifier on X_multi / y_multi,
# initialised from pre-trained word vectors.
# Each fold writes its training rows to a file, trains a fresh model on that
# file, predicts the held-out texts and records the weighted F1-score.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

# weighted F1-score of every fold
f1_results = list()

for counter, (train_ix, test_ix) in enumerate(cv.split(X_multi, y_multi)):

    # cv.split yields positional indices, so select rows with .iloc;
    # plain [] is label-based and only lines up on a default RangeIndex
    X_train, X_test = X_multi.iloc[train_ix], X_multi.iloc[test_ix]
    y_train, y_test = y_multi.iloc[train_ix], y_multi.iloc[test_ix]

    # write one "<text>\t<label>" line per training row;
    # QUOTE_NONE stops pandas from wrapping the texts in quote signs
    train = pd.concat([X_train, y_train], axis=1)
    train.to_csv('train_fasttext.txt', sep='\t', index=False, header=None, quoting=csv.QUOTE_NONE)

    # train the model; dim=300 matches the 300-dimensional pre-trained vectors
    model = fasttext.train_supervised(
        input='train_fasttext.txt',
        lr=1.0,
        epoch=15,
        dim=300,
        pretrainedVectors='wiki-news-300d-1M-subword.vec',
    )

    # model.predict(text)[0] holds the predicted labels; keep the first one
    y_pred = X_test.apply(lambda x: model.predict(x)[0][0])

    # zero_division=0 scores a class that was never predicted as 0 (the value
    # already used before) without emitting a warning on every such fold
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    print('F1_score weighted of a fold nr_' + str(counter) + ' is ' + str(round(f1, 2)))
    f1_results.append(f1)
    print(classification_report(y_test, y_pred, zero_division=0))

print('Max F1_score across folds is ' + str(max(f1_results)) + ' and mean score is ' + str(mean(f1_results)))

F1_score weighted of a fold nr_0 is 0.64
                           precision    recall  f1-score   support

 __label__catastrophizing       0.40      0.40      0.40         5
   __label__not-distorted       0.80      0.80      0.80        15
__label__other-distortion       0.00      0.00      0.00         2

                 accuracy                           0.64        22
                macro avg       0.40      0.40      0.40        22
             weighted avg       0.64      0.64      0.64        22

F1_score weighted of a fold nr_1 is 0.59
                           precision    recall  f1-score   support

 __label__catastrophizing       0.50      0.20      0.29         5
   __label__not-distorted       0.68      0.87      0.76        15
__label__other-distortion       0.00      0.00      0.00         2

                 accuracy                           0.64        22
                macro avg       0.39      0.36      0.35        22
             weighted avg       0.58      

  _warn_prf(average, modifier, msg_start, len(result))


F1_score weighted of a fold nr_3 is 0.73
                           precision    recall  f1-score   support

 __label__catastrophizing       0.50      0.50      0.50         4
   __label__not-distorted       0.83      0.94      0.88        16
__label__other-distortion       0.00      0.00      0.00         2

                 accuracy                           0.77        22
                macro avg       0.44      0.48      0.46        22
             weighted avg       0.70      0.77      0.73        22

F1_score weighted of a fold nr_4 is 0.64
                           precision    recall  f1-score   support

 __label__catastrophizing       0.00      0.00      0.00         4
   __label__not-distorted       0.74      0.88      0.80        16
__label__other-distortion       1.00      0.50      0.67         2

                 accuracy                           0.68        22
                macro avg       0.58      0.46      0.49        22
             weighted avg       0.63      