In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
import time
import pickle 
import gc
import ner_f1
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import Counter
import spacy
from tqdm import tqdm
from sklearn.metrics import make_scorer
import scipy.stats
from sklearn.model_selection import RandomizedSearchCV

In [2]:
labels = ['B-indications','I-indications']

In [3]:
with open('train_x.pkl', 'rb') as f:
    train_x = pickle.load(f)

In [4]:
with open('train_y.pkl', 'rb') as f:
    train_y = pickle.load(f)

In [5]:
with open('test.pkl', 'rb') as f:
    test = pickle.load(f)

In [6]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.2,
    c2=0.02,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(train_x, train_y)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.2, c2=0.02,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [7]:
y_test_pred = crf.predict(test)

In [8]:
y_test_pred_unpack=[val for sub_li in y_test_pred for val in sub_li]

In [9]:
test_df=pd.read_csv('test.csv')

In [10]:
sub=pd.DataFrame()
sub['id']=test_df.id.values
sub['Sent_ID']=test_df['Sent_ID']
sub['tag']=y_test_pred_unpack
sub.head()

Unnamed: 0,id,Sent_ID,tag
0,4543834,191283,O
1,4543835,191283,O
2,4543836,191283,O
3,4543837,191283,O
4,4543838,191283,O


In [11]:
sub['id'].describe()

count    2.994463e+06
mean     6.041065e+06
std      8.644272e+05
min      4.543834e+06
25%      5.292450e+06
50%      6.041065e+06
75%      6.789680e+06
max      7.538296e+06
Name: id, dtype: float64

### Rule 1

No I indication without B indication

In [12]:
i_set=set(sub.loc[sub['tag']=='I-indications']['Sent_ID'].values)
b_set=set(sub.loc[sub['tag']=='B-indications']['Sent_ID'].values)

In [13]:
only_i=i_set.difference(b_set)
len(only_i)

3

In [14]:
sub.loc[sub['Sent_ID'] ==305997, 'tag'] ='O' 

### Rule 2

All I indications should follow B indications

In [105]:
sub=sub.sort_values('id')

In [106]:
i_id_set=list(sub.loc[sub['tag']=='I-indications']['id'].values)
b_id_set=list(sub.loc[sub['tag']=='B-indications']['id'].values)

In [107]:
cond_set=[]
for val in i_id_set:
    if val-1 in b_id_set:
        pass
    else:
        cond_set.append(val)    

In [108]:
len(cond_set)

7207

In [97]:
sub.loc[sub['id'].isin(cond_set),'tag']='B-indications'

In [95]:
asub=asub.merge(test_df,on='id',how='left')

In [96]:
asub.head()

Unnamed: 0,id,Sent_ID_x,tag,Doc_ID,Sent_ID_y,Word
0,4544018,191294,I-indications,30005,191294,multocida
1,4544721,191324,I-indications,30010,191324,injury
2,4544748,191325,I-indications,30010,191325,injury
3,4544749,191325,I-indications,30010,191325,(
4,4544750,191325,I-indications,30010,191325,TBI


In [15]:
sub.to_csv('submission_crf_c002.csv',index=False)

In [16]:
!zip -r submission_crf_post_processed_c002.zip submission_crf_c002.csv

  adding: submission_crf_c002.csv (deflated 89%)


In [7]:
from sklearn.model_selection import KFold
kf=KFold(n_splits=10, random_state=None, shuffle=True)

In [10]:
y_pred_crf = {}

In [11]:
for fold_n, (train_index, valid_index) in enumerate(kf.split(train_x)):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train=[]
    X_valid=[]
    y_train=[]
    y_valid=[]
    for ind in train_index:
        X_train.append(train_x[ind])
        y_train.append(train_y[ind])
    for ind in valid_index:
        X_valid.append(train_x[ind])
        y_valid.append(train_y[ind])
    crf = sklearn_crfsuite.CRF(
                algorithm='lbfgs',
                c1=0.2,
                c2=0.1,
                max_iterations=100,
                all_possible_transitions=True
            )
    crf.fit(X_train, y_train)
    #oof_crf[valid_index]=crf.predict(X_valid)
    y_pred_crf[fold_n]= crf.predict(test)

Fold 0 started at Sun Mar 24 09:41:02 2019
Fold 1 started at Sun Mar 24 09:50:26 2019
Fold 2 started at Sun Mar 24 09:59:55 2019
Fold 3 started at Sun Mar 24 10:09:20 2019
Fold 4 started at Sun Mar 24 10:18:49 2019
Fold 5 started at Sun Mar 24 10:28:21 2019
Fold 6 started at Sun Mar 24 10:37:54 2019
Fold 7 started at Sun Mar 24 10:47:24 2019
Fold 8 started at Sun Mar 24 10:57:02 2019
Fold 9 started at Sun Mar 24 11:06:24 2019


In [12]:
for i in range(10):
    y_pred_crf[i]=[val for sub_li in y_pred_crf[i] for val in sub_li]

In [23]:
y_pred_df=pd.DataFrame(y_pred_crf)
y_pred_df=y_pred_df.replace('O',0)
y_pred_df=y_pred_df.replace('I-indications',1)
y_pred_df=y_pred_df.replace('B-indications',2)

In [24]:
y_pred_df['final']=y_pred_df.max(axis=1)

In [15]:
y_pred_df['final']=np.round(y_pred_df['final'].values)

In [25]:
y_pred_df['final']=y_pred_df['final'].replace(0,'O')
y_pred_df['final']=y_pred_df['final'].replace(1,'I-indications')
y_pred_df['final']=y_pred_df['final'].replace(2,'B-indications')

In [26]:
sub=pd.DataFrame()
sub['id']=test_df.id.values
sub['Sent_ID']=test_df['Sent_ID']
sub['tag']=y_pred_df['final'].values

In [27]:
sub.to_csv('submission-crf_cv.csv',index=False)

In [28]:
!zip -r submission-crf_cv2.zip submission-crf_cv.csv

  adding: submission-crf_cv.csv (deflated 89%)


In [110]:
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=20,
                        scoring=f1_scorer)
rs.fit(train_x, train_y)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Exception in thread QueueManagerThread:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.5/threading.py", line 862, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.5/dist-packages/sklearn/externals/joblib/externals/loky/process_executor.py", line 708, in _queue_management_worker
    executor._adjust_process_count()
  File "/usr/local/lib/python3.5/dist-packages/sklearn/externals/joblib/externals/loky/process_executor.py", line 1007, in _adjust_process_count
    p.start()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 105, in start
    self._popen = self._Popen(self)
  File "/usr/local/lib/python3.5/dist-packages/sklearn/externals/joblib/externals/loky/backend/process.py", line 37, in _Popen
    return Popen(process_obj)
  File "/usr/local/lib/python3.5/dist-packages

KeyboardInterrupt: 