### Read data

In [1]:
#!pip install --force-reinstall --no-dependencies "scikit-learn==0.24.2"
#!pip install sklearn-crfsuite

In [2]:
def read_file(f):
    """Read a tab-separated ``ID<TAB>sequence`` file, skipping its header line.

    Parameters
    ----------
    f : str or path-like
        Path of the file to read. The first line is treated as a header
        and discarded.

    Returns
    -------
    row_id : list of str
        First tab-separated field of every data line, stripped.
    data : list of list of str
        Second tab-separated field of every data line, stripped and split
        on single spaces.

    Raises
    ------
    IndexError
        If a non-blank data line contains no tab character.
    """
    # Context manager guarantees the handle is closed even if reading fails.
    with open(f, 'r') as handle:
        lines = handle.readlines()[1:]

    row_id, data = [], []
    for line in lines:
        # Whitespace-only lines (e.g. a trailing newline at end of file)
        # carry no record; skip them instead of failing on the missing field.
        if not line.strip():
            continue
        fields = line.split('\t')  # split once, reuse for both columns
        row_id.append(fields[0].strip())
        data.append(fields[1].strip().split(' '))
    return row_id, data

In [3]:
# Load the review tokens and their tag sequences from two separate files.
# Each call returns (row ids, list of space-split sequences).
# NOTE(review): the paths are absolute under /content — presumably a Colab
# working directory; confirm before running elsewhere.
row_id_text, texts = read_file('/content/REVIEW_TEXT.txt')
row_id_tags, tags = read_file('/content/REVIEW_LABELSEQ.txt')





### Two entities of interest -- AE (adverse events) and SSI (signs, symptoms, and indications).

We use the BIO scheme:

     B- to denote beginning of a tagged named entity, 
     
     I- to denote inside a tagged named entity tag, 
     
     O to denote outside of any tagged named entity 
     
So, your sequence labeling task has five tags: B-AE, I-AE, B-SSI, I-SSI, and O.

In [4]:
index = 5  # position of the review to display
print('num of data', len(row_id_text))
# The text file and the label file must contain the same number of rows.
assert len(row_id_tags) == len(row_id_text)

# Print the selected review as a two-column token / tag table.
print('-' * 89)
print('Token\tTag')
for idx, token in enumerate(texts[index]):
    print(token, '\t', tags[index][idx])

num of data 4744
-----------------------------------------------------------------------------------------
Token	Tag
I 	 O
had 	 O
terrible 	 B-AE
anxiety 	 I-AE
the 	 I-AE
whole 	 I-AE
time 	 I-AE
, 	 O
the 	 B-AE
worst 	 I-AE
kind 	 I-AE
of 	 I-AE
anxiety 	 I-AE
I've 	 O
ever 	 O
experienced. 	 O


In [5]:
index = 22  # a review that contains both AE and SSI tags (see output below)
print('Token\tTag')
# Walk the tokens of the selected review and print each next to its tag.
for idx, token in enumerate(texts[index]):
    print(token, '\t', tags[index][idx])

Token	Tag
constipation 	 B-AE
, 	 O
drastic 	 B-AE
mood 	 I-AE
swings 	 I-AE
, 	 O
100% 	 O
helped 	 O
my 	 O
anxiety 	 B-SSI
and 	 O
panic 	 B-SSI
. 	 O


### Inputs

In [35]:
import numpy as np
from gensim.models import KeyedVectors
import string

# def get_features(word):
#     word=word.lower()
#     try:
#       # Load vectors directly from the file
#         model1 = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True) ### Loading pre-trainned word2vec model
#         vector=model1[word]
#     except:
#         # if the word is not in vocabulary,
#         # returns zeros array
#         vector=np.zeros(300,)

#     return vector   

def word2features(word):
    """Build the CRF feature dict for a single token.

    Parameters
    ----------
    word : str
        The raw token.

    Returns
    -------
    dict
        A constant bias term, the lower-cased token, its last three and last
        two characters, and three boolean shape flags (all upper case,
        title case, all digits).

    Notes
    -----
    Further candidates kept only as commented-out experiments in earlier
    versions of this cell: ``isalpha``, ``isspace``, token length, 4- and
    5-character suffixes, a one-character prefix, and word2vec embeddings.
    More ideas:
    https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html#features
    """
    features = {'bias': 1.0}

    # Lexical identity and suffixes.
    features['word.lower()'] = word.lower()
    features['word[-3:]'] = word[-3:]
    features['word[-2:]'] = word[-2:]

    # Orthographic shape flags.
    features['word.isupper()'] = word.isupper()
    features['word.istitle()'] = word.istitle()
    features['word.isdigit()'] = word.isdigit()

    return features

def text2features(text):
    """Return one feature dict per token of ``text``, in token order."""
    return list(map(word2features, text))


In [36]:
# One feature sequence per review, paired with its tag sequence.
X = list(map(text2features, texts))
y = tags

### Train / Validation sets

In [37]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews for validation; the fixed random_state makes
# the split repeatable across runs.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### CRF model

In [38]:
# https://sklearn-crfsuite.readthedocs.io/en/latest/

from sklearn_crfsuite import CRF
# Baseline model: CRF constructed with no arguments, so every hyperparameter
# is left at the library default.
crf = CRF()

### Training & Prediction

In [39]:
# Train on the training split only.
crf.fit(X_train, y_train) # train step
# Predict one tag sequence per held-out review.
y_pred = crf.predict(X_validation) # inference step

### Result

In [40]:

from sklearn_crfsuite.metrics import flat_classification_report
# Per-tag precision / recall / F1 on the validation split.
# The support column in the printed report counts individual tokens.
report = flat_classification_report(y_validation, y_pred)
print(report)

              precision    recall  f1-score   support

        B-AE       0.73      0.53      0.62       752
       B-SSI       0.76      0.49      0.60       168
        I-AE       0.66      0.43      0.53      1485
       I-SSI       0.16      0.05      0.07        66
           O       0.90      0.97      0.93     11859

    accuracy                           0.88     14330
   macro avg       0.64      0.49      0.55     14330
weighted avg       0.87      0.88      0.87     14330





In [12]:
#!pip3 install scikit-learn

In [41]:
# find instruction of hyperparameters here :https://sklearn-crfsuite.readthedocs.io/en/latest/api.html
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn.metrics import make_scorer
import sklearn
import scipy.stats

# Tuned model: rebinds `crf`, replacing the default-configured CRF created earlier.
crf = CRF(algorithm='lbfgs', # Gradient descent using the L-BFGS method
     # The coefficient for L1 regularization.
     c1=0.1,
    c2=0.1,  # The coefficient for L2 regularization.
    max_iterations=50,
    all_possible_transitions=True)


In [14]:
# from sklearn.model_selection import cross_val_predict, cross_val_score
# pred = cross_val_predict(estimator=crf, X=X, y=y, cv=6)
# report = flat_classification_report(y_validation, y_pred)
# print(report)

In [42]:
# Same train / predict / report sequence as for the baseline, now run with
# whatever `crf` is currently bound to (the explicitly configured CRF when
# cells are executed top to bottom).
crf.fit(X_train, y_train) # train step
y_pred = crf.predict(X_validation) # inference step
report = flat_classification_report(y_validation, y_pred)
print(report)



              precision    recall  f1-score   support

        B-AE       0.72      0.61      0.66       752
       B-SSI       0.73      0.57      0.64       168
        I-AE       0.67      0.51      0.58      1485
       I-SSI       0.09      0.05      0.06        66
           O       0.92      0.96      0.94     11859

    accuracy                           0.89     14330
   macro avg       0.63      0.54      0.58     14330
weighted avg       0.88      0.89      0.88     14330



In [16]:
#!pip install --upgrade sklearn-crfsuite

#!pip install -U 'scikit-learn<0.24'

In [None]:
# #Using GridSearchCV for best hyperparameters
# from sklearn.model_selection import GridSearchCV
# from sklearn_crfsuite import CRF

# # Define hyperparameter search space
# params_space = {
#     'algorithm': ['lbfgs', 'l2sgd', 'ap'],
#     'c1': [0.1, 0.2, 0.01],
#     'c2': [0.1, 0.2, 0.01],
#     'max_iterations': [20, 50, 100, 40],
#     'all_possible_transitions': [True, False],
#     'linesearch' : ['MoreThuente', 'Backtracking', 'StrongBacktracking'],
#     'period' : [10,20,8],
#     'epsilon' : [1e-5,1e-10,1e-8]
# }

# # Define CRF model
# crf = CRF()

# # Define GridSearchCV object
# gs = GridSearchCV(crf, params_space, cv=3, verbose=1)

# # Fit GridSearchCV object to data
# gs.fit(X_train, y_train)

# # Print best hyperparameters and score
# print("Best hyperparameters:", gs.best_params_)
# print("Best score:", gs.best_score_)



In [None]:
#Best hyperparameters: {'algorithm': 'lbfgs', 'all_possible_transitions': True, 'c1': 0.1, 'c2': 0.1, 'max_iterations': 50}
#Best score: 0.8664383751467778

In [43]:
#TEST DATA
row_id_text, texts = read_file('/content/TEST_REVIEW_TEXT.txt')
X = [text2features(text) for text in texts]
y_pred = crf.predict(X) # inference step

In [50]:
# Quick shape check: tag-sequence length of the first two predictions,
# then the number of row ids read from the test file.
print(len(y_pred[0]))
print(len(y_pred[1]))
print(len(row_id_text))

20
29
1259


In [84]:
import pandas as pd
#Generate output file
# Build the submission table: one row per test review, with the predicted tags
# serialised as a single space-separated string.
#
# The frame is constructed in one step rather than filled cell by cell with
# `df['TAGSEQ'][index] = ...`: that chained assignment triggers pandas'
# SettingWithCopy/FutureWarning and has no effect once Copy-on-Write is
# enabled, which would leave raw Python lists in the column.
# `" ".join` also avoids the leading space that the previous
# `a = a + " " + j` accumulation put in front of every sequence.
df = pd.DataFrame({
    'ID': row_id_text,
    'TAGSEQ': [" ".join(tag_seq) for tag_seq in y_pred],
})
# df.head()

In [90]:
# Write the ID / TAGSEQ table to a tab-separated text file in the current
# working directory.
# NOTE(review): index=None is falsy, so presumably the row index is not
# written; index=False is the documented spelling — confirm.
df.to_csv("output_iamsk.txt", index=None, sep="\t")