In [34]:
import pandas as pd
import numpy as np
import functools
import json
import matplotlib.pyplot as plt
import random
from decisiontree import DecisionTreeModel as DT, accuracy_score 
import utils as ut

In [21]:
# get the list of words surrounding the target position
def _splitFeatures(sent, posi, rang):
    sent = str(sent).split(" ")
    str_len = len(sent)
    list = [sent[int(posi)-i] if int(posi)-i >=0 else "<BEGIN_SENTENCE>" for i in range(rang, 0, -1)] \
            + [sent[int(posi)+1+i] if int(posi)+1+i < str_len else "<END_SENTENCE>" for i in range(rang)]
    return list

In [22]:
# get one hot encoding dict
def _onehot_map(vocabs):
    df = pd.get_dummies(pd.Series(vocabs))
    df['one_hot'] = df.apply(lambda x: ','.join(x.astype(str)), axis=1)
    r = dict(zip(vocabs, df.one_hot))
    return r

In [23]:
# clean up raw data
def _check_position(sent, posi):
    sent = str(sent).split(" ")
    index = -1
    if '_______' in sent:
        index = sent.index('_______')
    return posi == index

In [29]:
# Archived draft --- create dataset (redefined by the `_create_dataset` cell below)
def _create_dataset(file, rang, vocabs_oh=None):
    """Archived draft of the dataset builder.

    NOTE: a later cell of this notebook defines `_create_dataset` again; on a
    top-to-bottom run that later definition replaces this one.

    Each line of `file` is expected to look like "<target> <position> <sentence>".
    Rows whose position does not point at the blank marker are dropped, the
    `rang` words on each side of the blank are collected, and every word is
    replaced by its one-hot string.

    Parameters
    ----------
    file : str
        Path of the text file to read.
    rang : int
        Number of context words taken on each side of the blank.
    vocabs_oh : dict, optional
        Existing word -> one-hot-string mapping to reuse. When None, a new
        mapping is built from this file, with an extra '<UNKNOWN_WORD>' entry.

    Returns
    -------
    (Xs, y, vocabs_oh)
        `Xs` is a DataFrame with one string cell per one-hot digit, `y` is the
        'Target' column of the kept rows, `vocabs_oh` is the mapping used.
    """
    # Read through a context manager so the handle is closed, skip blank lines,
    # and strip the line terminator: otherwise the last token of every sentence
    # carries a trailing "\n" and never matches the same word elsewhere (or the
    # blank marker when the blank ends the sentence).
    with open(file, "r") as fh:
        lines = [line.rstrip("\r\n") for line in fh if line.strip()]
    df = pd.DataFrame(lines, columns = ['ORG_DATA'])

    # split into 3 columns
    df[['Target', 'Position', 'Sentence']] = df.ORG_DATA.apply(lambda x: pd.Series(str(x).split(" ", 2)))

    # clean up; .copy() so the column added below is not written onto a slice
    df['check'] = df.apply(lambda x : _check_position(x['Sentence'], int(x['Position'])), axis = 1)
    df = df[df['check']].copy()

    # get the words surrounding the target for each sample
    df['Words_List'] = df.apply(lambda x : _splitFeatures(x['Sentence'], x['Position'], rang), axis = 1)

    if vocabs_oh is None:
        # get vocabulary
        vocabs = df['Words_List'].explode().unique()
        vocabs = np.append(vocabs, '<UNKNOWN_WORD>')
        # one hot mapping based on vocabs
        vocabs_oh = _onehot_map(vocabs)

    X = pd.DataFrame(df.Words_List.tolist(), index = df.index)
    X.columns = range(rang*2)
    # Column-wise Series.map replaces DataFrame.applymap, which newer pandas deprecates.
    X = X.apply(lambda col: col.map(
        lambda i: vocabs_oh[i] if i in vocabs_oh else vocabs_oh['<UNKNOWN_WORD>']))

    # Concatenate the per-word one-hot strings of a row, then split into one cell per digit.
    temp = X.apply(lambda x: ','.join(x.astype(str)), axis=1)
    Xs = temp.apply(lambda x: pd.Series(str(x).split(",")))

    y = df['Target']

    return Xs, y, vocabs_oh

In [5]:
# create dataset
def _create_dataset(file, rang, vocabs_oh=None):
    """Build one-hot context features and targets from a column-format file.

    Each line of `file` is expected to look like "<target> <position> <sentence>".
    Rows whose position does not point at the blank marker (as judged by
    `_check_position`) are dropped. For the remaining rows the `rang` words on
    each side of the blank are collected and each word is replaced by its
    one-hot string; words missing from the mapping use the '<UNKNOWN_WORD>'
    encoding.

    Parameters
    ----------
    file : str
        Path of the text file to read.
    rang : int
        Number of context words taken on each side of the blank.
    vocabs_oh : dict, optional
        Word -> one-hot-string mapping to reuse (pass the training mapping for
        dev/test data). When None, a new mapping is built from this file.

    Returns
    -------
    (Xs, y, vocabs_oh)
        `Xs` is a DataFrame holding one string cell per one-hot digit, indexed
        like the kept rows; `y` is their 'Target' column; `vocabs_oh` is the
        mapping that was used.
    """
    # Read through a context manager so the handle is closed, skip blank lines,
    # and strip the line terminator: otherwise the last token of every sentence
    # carries a trailing "\n" and never matches the same word elsewhere (or the
    # blank marker when the blank ends the sentence).
    with open(file, "r") as handle:
        records = [line.rstrip("\r\n") for line in handle if line.strip()]
    df = pd.DataFrame(records, columns=['ORG_DATA'])

    # split into 3 columns
    df[['Target', 'Position', 'Sentence']] = df.ORG_DATA.apply(
        lambda x: pd.Series(str(x).split(" ", 2)))

    # clean up: keep only rows whose position matches the blank marker
    keep = df.apply(lambda x: _check_position(x['Sentence'], int(x['Position'])), axis=1)
    df = df[keep].copy()

    # words around the target, one list per kept row (same order as df)
    contexts = [
        _splitFeatures(sentence, position, rang)
        for sentence, position in zip(df['Sentence'], df['Position'])
    ]

    # Create the vocabulary from the training data; dev/test calls pass it back in.
    if vocabs_oh is None:
        # unique words in order of first appearance, plus the fallback entry
        vocabs = list(dict.fromkeys(word for ctx in contexts for word in ctx))
        vocabs.append('<UNKNOWN_WORD>')
        vocabs_oh = _onehot_map(vocabs)

    # Replace each word by its one-hot string, concatenate per row and split
    # into one cell per digit.
    rows = [
        ','.join(
            str(vocabs_oh[word] if word in vocabs_oh else vocabs_oh['<UNKNOWN_WORD>'])
            for word in ctx
        ).split(',')
        for ctx in contexts
    ]
    Xs = pd.DataFrame(rows, index=df.index)

    y = df['Target']

    return Xs, y, vocabs_oh

In [6]:
def _write_dict_to_file(dic, file_name):
    with open(file_name, 'w') as f: f.write(json.dumps(dic))

In [71]:
# Debug sample: build features from a small file with a one-word window on each side.
# NOTE(review): this calls utils.create_dataset, whereas the other cells use the
# notebook-local _create_dataset -- confirm the two behave the same.
X, y, vocabs_oh = ut.create_dataset("one_sentence.csv", 1)
print(X)

   0  1  2  3  4  5
0  0  1  0  0  0  1
1  0  1  0  0  0  1


In [72]:
# Inspect the word -> one-hot-string mapping returned by the call above.
vocabs_oh

{'abc': '0,1,0', 'the': '0,0,1', '<UNKNOWN_WORD>': '1,0,0'}

In [73]:
# Inspect the target labels returned by the call above.
y

0    weather
1    whether
Name: Target, dtype: object

In [74]:
# Fresh decision-tree model (project class DecisionTreeModel) for the debug sample.
model2 = DT()

In [75]:
# NOTE(review): the recorded output of this cell is
# "ValueError: attempt to get argmax of an empty sequence" -- the fit does not complete.
model2.fit(X,y)

Start fitting


ValueError: attempt to get argmax of an empty sequence

In [8]:
# Persist the feature matrix without the index column.
# NOTE(review): in top-to-bottom order the only visible assignment to X above is the
# one_sentence.csv sample, not the training file -- confirm which X is meant here.
X.to_csv('X_train_range1.csv', index=False)  

In [9]:
# Persist the labels that belong to the X saved above.
y.to_csv('y_train_range1.csv', index=False)  

In [10]:
# Save the one-hot mapping as JSON text.
_write_dict_to_file(vocabs_oh, 'vocabs_oh_range1.txt')

In [11]:
# Encode the test split with the existing mapping (passed in as vocabs_oh) instead of
# building a new one; the returned mapping is discarded.
X_test, y_test, _ = _create_dataset("hw1_dataset/hw1.test.col", 1, vocabs_oh)
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7482,7483,7484,7485,7486,7487,7488,7489,7490,7491
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Persist the test features without the index column.
X_test.to_csv('X_test_range1.csv', index=False)  

In [13]:
# Persist the test labels without the index column.
y_test.to_csv('y_test_range1.csv', index=False)  

In [14]:
# Encode the dev split with the existing mapping (passed in as vocabs_oh); the
# returned mapping is discarded.
X_dev, y_dev, _ = _create_dataset("hw1_dataset/hw1.dev.col", 1, vocabs_oh)

In [15]:
# Persist the dev features without the index column.
X_dev.to_csv('X_dev_range1.csv', index=False)  

In [16]:
# Persist the dev labels without the index column.
y_dev.to_csv('y_dev_range1.csv', index=False)  

## Train-Dev-Test

In [18]:
# Fresh decision-tree model (project class DecisionTreeModel) for the train/dev/test run.
model1 = DT()

In [19]:
# NOTE(review): the recorded output of this cell is
# "ValueError: attempt to get argmax of an empty sequence" -- the fit does not complete,
# so the prediction and accuracy cells below have no recorded results.
model1.fit(X,y)

Start fitting


ValueError: attempt to get argmax of an empty sequence

In [None]:
# Predict labels for the dev features with model1 (this cell has no execution count).
y_dev_pred = model1.predict(X_dev)

In [None]:
# Score the dev predictions with the project's accuracy_score helper; the bare
# `acc` on the last line displays the value.
acc = accuracy_score(y_dev, y_dev_pred)
acc

In [None]:
# Sanity check: (rows, columns) of the test features.
X_test.shape

In [None]:
# Sanity check: (rows, columns) of X, for comparison with the test and dev column counts.
X.shape

In [None]:
# Sanity check: (rows, columns) of the dev features.
X_dev.shape

In [None]:
# Rebuild the training features with rang=2 (two context words on each side).
# NOTE(review): this rebinds X, y and vocabs_oh, so earlier cells re-run afterwards
# would see the range-2 data and mapping rather than the range-1 ones.
X, y, vocabs_oh = _create_dataset("hw1_dataset/hw1.train.col", 2)