In [1]:
import os, sys
sys.path.append(os.path.abspath('./src'))
import utils, json

In [2]:
# PATHS
data_path = '/data/rsg/nlp/fake_proj/__temp__juanmoo__'
EMA_dump_path = os.path.join(data_path, './jsons/EMA_dump.json')
EMA_xmls_path = os.path.join(data_path, './xmls/')
EMA_annotations_path = os.path.join(data_path, './bayer/VendorEMAforMIT/annotations.xlsx')

In [3]:
# Raw Data
data = json.loads(open(EMA_dump_path, 'r').read())
'''
Format:
[
    document_name: <str>: {
                            document_name: <str>,
                            element_text: <str> (raw text),
                            processed_text: <str> (lowercase, only alphanumeric),
                            element_tag: <str> (TEI XML tag)
                          },
                          
    ...
]
'''

# Labels
'''
Dict in form:
{
    file_name: {
        texts: [ <str>, ...],
        labels: [ <str>, ...]
    },
    
    ...
    
}
'''
annotations = utils.parse_spreadsheet(EMA_annotations_path)

## Matching Data to Labels

In [22]:
min_paragraph_length = 2

paragraphs = []
labels = []

for doc_name in data:
    assert(doc_name in annotations)
    
    for t in data[doc_name]['element_text']:
        if len(t) >=  min_paragraph_length:
            for i, text_candidate in enumerate(annotations[doc_name]['texts']):
                if t in text_candidate:
                    paragraphs.append(t)
                    labels.append(annotations[doc_name]['labels'][i])
                    break

In [17]:
print('parlen:', len(paragraphs))
print('lablen:', len(labels))

parlen: 1258
lablen: 1258


## Preprocessing

In [38]:
import re 
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\n", "", string)    
    string = re.sub(r"\r", "", string) 
    string = re.sub(r"[0-9]", "digit", string)
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)
    return string.strip().lower()

X = [clean_str(raw) for raw in paragraphs]
Y = [l.lower() for l in labels]

In [43]:
#train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

In [60]:
#pipeline of feature engineering and model
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import *
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

model = Pipeline([('vectorizer', CountVectorizer()),
 ('tfidf', TfidfTransformer()),
 ('clf', OneVsRestClassifier(LinearSVC(class_weight='balanced')))])

In [61]:
#paramater selection
from sklearn.model_selection import GridSearchCV
parameters = {'vectorizer__ngram_range': [(i, j) for i in range(1, 5) for j in range(1, 5)],
               'tfidf__use_idf': (True, False)}
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X, Y)
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)



0.8362486561689749
{'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 1)}




In [62]:
model = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,1))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

#Training of Final Model
model.fit(X_train, y_train)
#Test
pred = model.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix(pred, y_test)



array([[ 31,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,  26,   0,   0,   3,   0,   0,   0,   0,   2],
       [  0,   1,  32,   1,   0,   0,   0,   0,   0,   1],
       [  0,   0,   0,   4,   0,   0,   0,   2,   0,   2],
       [  0,   7,   0,   1,  20,   0,   0,   0,   0,   5],
       [  0,   0,   0,   0,   0,   8,   0,   1,   0,   3],
       [  0,   0,   0,   0,   0,   0,   1,   0,   0,   0],
       [  3,   0,   0,   0,   0,   0,   0,  50,   0,   1],
       [  0,   0,   0,   0,   0,   0,   0,   1,   0,   0],
       [  7,   0,   0,   1,   0,   1,   0,   5,   1, 157]])

### TODO's
* Remove common words from text
* Remove classes with few examples
* Append Corresponding headers to examples
* Explore other models ?