In [4]:
import re
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import os
import pandas

In [6]:
def read(data_loc='',
        file_name='full_docs_2.csv', tokenization='word'):
    """
    Given data type and location, load data
    :param data_loc: location of dataset
    :param tokenization: mode of tokenization : word, char
    :return: (text,label) df_text is the tokenized text, df['l3'] last layer label
    """
    df = pandas.read_csv(data_loc + '/' + file_name)
    df = df.sample(frac=1).reset_index(drop=True)
    # df_texts = [self.tokenize(text) for text in df.text]
    # # df_texts = df.text.apply(self.tokenize)
    # # create dictionary
    # assert len(df_texts) == len(df['l3']) # l3 is the end level label
    # # print("finished tokenizing %d data instances"%len(df['l3']))
    # 
    # df = pandas.DataFrame(list(zip(df_texts, list(df['l3']))))
    # df.columns=['text', 'label']
    # # return df_texts, df['l3']
    return df


In [13]:

def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()

def text_cleaner(text):
    text = text.replace(".", "")
    text = text.replace("[", " ")
    text = text.replace(",", " ")
    text = text.replace("]", " ")
    text = text.replace("(", " ")
    text = text.replace(")", " ")
    text = text.replace("\"", "")
    text = text.replace("-", "")
    text = text.replace("=", "")
    rules = [
        {r'>\s+': u'>'},  # remove spaces after a tag opens or closes
        {r'\s+': u' '},  # replace consecutive spaces
        {r'\s*<br\s*/?>\s*': u'\n'},  # newline after a <br>
        {r'</(div)\s*>\s*': u'\n'},  # newline after </p> and </div> and <h1/>...
        {r'</(p|h\d)\s*>\s*': u'\n\n'},  # newline after </p> and </div> and <h1/>...
        {r'<head>.*<\s*(/head|body)[^>]*>': u''},  # remove <head> to </head>
        {r'<a\s+href="([^"]+)"[^>]*>.*</a>': r'\1'},  # show links instead of texts
        {r'[ \t]*<[^<]*?/?>': u''},  # remove remaining tags
        {r'^\s+': u''}  # remove spaces at the beginning
    ]
    for rule in rules:
        for (k, v) in rule.items():
            regex = re.compile(k)
            text = regex.sub(v, text)
        text = text.rstrip()
        text = text.strip()
    return text.lower()

class LabelDictionary():
    def __init__(self):
        self.y2i = {}
        self.i2y ={}
def create_class_dict(y_labels):
    dict = LabelDictionary()
    # this function takes labels in L2 and
    # transform into range (0,max_classes) for train
    unique_labels = set(np.unique(y_labels))
    for i,label in enumerate(unique_labels):
        dict.y2i[label]=i
        dict.i2y[i]=label
    return dict

In [10]:
data_loc = '/home/ml/ksinha4/mlp/hier-class/data'
df_train = read(data_loc=data_loc, file_name="df_small_train.csv")
df_test = read(data_loc=data_loc, file_name="df_small_test.csv")
df_train.text = [clean_str(x) for x in  df_train.text]
df_test.text = [clean_str(x) for x in df_test.text]

In [14]:
l1_x_train, l1_y_train, l1_x_val, l1_y_val = \
        train_test_split(df_train.text, df_train.l1, test_size=0.1, random_state=0)
number_of_classes_L1 = len(df_train.l1.unique())
l1_dict = create_class_dict(df_train.l1.unique())


In [17]:
print(l1_dict.y2i)

{'TopicalConcept': 0, 'Event': 1, 'Work': 2, 'SportsSeason': 3, 'Place': 4, 'Species': 5, 'UnitOfWork': 6, 'Agent': 7, 'Device': 8}


In [21]:
l2_data = [None]*len(l1_dict.y2i)
print(l2_data)

[None, None, None, None, None, None, None, None, None]


In [24]:
for i in range(len(l1_dict.i2y.keys())):
    l2_data[i]=df_train[df_train.l1==l1_dict.i2y[i]]

In [99]:
class Dataset():
    def __init__(self):
        self.x_train=None
        self.y_train=None
        self.x_val=None
        self.y_val=None
        self.label_dict=None
        self.number_of_classes=None
        self.childs=None


def data_pipline(df_train,level=1,stop_level=3):
    d = Dataset()
    d.number_of_classes= len(df_train['l%d'%level].unique())
    d.dict = create_class_dict(df_train['l%d'%level].unique())
    d.x_train, d.x_val, d.y_train, d.y_val = \
            train_test_split(df_train.text, 
            df_train['l%d'%level].apply(lambda x: d.dict.y2i[x]),
                             test_size=0.1, random_state=0)
    
    children_data = [None]*len(d.dict.y2i)
    next_level= 1+level
    if stop_level+1 == next_level:
        d_childs = None
    else:
        for i in range(len(d.dict.i2y.keys())):
            children_data[i]=data_pipline(df_train[df_train['l%d'%level]\
                                      ==d.dict.i2y[i]],
                                          level=next_level)
        d.childs = children_data
    return d

In [100]:
d = data_pipline(df_train,1)

In [102]:
print(d.y_val[:10])
print(d.childs[7].dict.y2i)

177075    5
32155     7
182380    5
125377    5
173033    5
57524     4
243873    4
73242     7
117668    7
183114    2
Name: l1, dtype: int64
{'Person': 0, 'GridironFootballPlayer': 1, 'OrganisationMember': 2, 'MusicalArtist': 3, 'Actor': 4, 'WinterSportPlayer': 5, 'RacingDriver': 6, 'SportsTeam': 7, 'Athlete': 8, 'SportsLeague': 9, 'Organisation': 10, 'Coach': 11, 'Company': 12, 'Scientist': 13, 'SportsManager': 14, 'ComicsCharacter': 15, 'MotorcycleRider': 16, 'BritishRoyalty': 17, 'Group': 18, 'Writer': 19, 'Broadcaster': 20, 'Politician': 21, 'Artist': 22, 'EducationalInstitution': 23, 'Cleric': 24, 'Wrestler': 25, 'FictionalCharacter': 26, 'VolleyballPlayer': 27, 'Presenter': 28, 'Boxer': 29}
