# Imports

In [1]:
import numpy as np
import pandas as pd
import sys
libraries = (('Numpy', np), ('Pandas', pd))

# Record the interpreter and library versions this notebook was executed with.
version_lines = ['{0} Version: {1}'.format(lib_name, lib_module.__version__)
                 for lib_name, lib_module in libraries]
print("Python Version:", sys.version, '\n')
print('\n'.join(version_lines))

Python Version: 3.6.5 |Anaconda, Inc.| (default, Apr 29 2018, 16:14:56) 
[GCC 7.2.0] 

Numpy Version: 1.16.4
Pandas Version: 0.23.0


In [2]:
import txtFiles as tf
import ftCommands as ftc
import evaluatePredictions as ep

# Extend the import search path before importing kleptoFunctions
# (the module presumably lives in ../myModules/ -- TODO confirm).
# NOTE(review): the path is relative, so it is resolved against the kernel's
# current working directory.
sys.path.append('../myModules/')
import kleptoFunctions as kf

# Data

In [3]:
# Relative directory used as the prefix for every file path built in this notebook:
# the source dataset, the generated .txt files and the model / prediction paths.
data_path = 'data/'

In [4]:
# Load the cleaned newsgroups dataset through the project helper.
# In the recorded run the helper reports a pandas DataFrame with 18752 rows.
data = kf.puking_file('2019.06.25_cleaned_newsgroups', data_path)

Filename: 2019.06.25_cleaned_newsgroups 
# of Folders: 5 
Type: <class 'pandas.core.frame.DataFrame'> 
Len: 18752


In [5]:
# Map every tier-1 label name to its numeric target.
label_target_pairs = zip(data['tier1_label'], data['tier1_targets'])
tier1_targets = dict(label_target_pairs)
# 'misc' is removed from the mapping; rows predicted as misc are handled
# separately further down (they receive a fixed tier-2 label instead of a model).
tier1_targets.pop('misc')
print(tier1_targets)

{'rec': 3, 'comp': 1, 'talk': 6, 'sci': 4, 'soc': 5}


# FastText Txt files

## split ids

In [6]:
# Seed handed to the split helpers below; it also shows up as the suffix of the
# generated file names (e.g. 'holdout_19.txt').
random_state = 19

In [7]:
# Split the document ids into holdout / training / validation groups.
# The result is keyed by split name; later cells reuse those keys as file-name stems.
t1_ids = tf.training_validation_holdout_split(data['_id'], random_state)

holdout	 4688
training	 10548
validation	 3516


In [8]:
# Tier-2 splits are drawn only from the tier-1 training and validation ids,
# so the holdout ids stay untouched.
training_key = [key for key in t1_ids.keys() if 'training' in key][0]
validation_key = [key for key in t1_ids.keys() if 'validation' in key][0]
experiment_ids = list(t1_ids[training_key]) + list(t1_ids[validation_key])

# One training/validation split per remaining tier-1 label.
t2_ids = tf.tier2_training_validation(
    data,
    experiment_ids,
    list(tier1_targets.keys()),
    random_state,
)

*** comp ***
training	 2751
validation	 917
*** rec ***
training	 2199
validation	 734
*** sci ***
training	 2221
validation	 741
*** soc ***
training	 1350
validation	 451
*** talk ***
training	 1495
validation	 499


## txt Files

In [9]:
# Write one text file per tier-1 split, using the 'tier1_targets' column.
for split_name, split_ids in t1_ids.items():
    split_rows = data[data['_id'].isin(split_ids)]
    filepath = data_path+'%s.txt' % split_name
    tf.make_txtfile(filepath, split_rows, 'tier1_targets')
    print(filepath)

data/holdout_19.txt
data/T1_training_19.txt
data/T1_validation_19.txt


In [10]:
# Write one text file per tier-2 split, using the 'tier2_targets' column.
for split_name, split_ids in t2_ids.items():
    split_rows = data[data['_id'].isin(split_ids)]
    filepath = data_path+'%s.txt' % split_name
    tf.make_txtfile(filepath, split_rows, 'tier2_targets')
    print(filepath)

data/comp_training_19.txt
data/comp_validation_19.txt
data/rec_training_19.txt
data/rec_validation_19.txt
data/sci_training_19.txt
data/sci_validation_19.txt
data/soc_training_19.txt
data/soc_validation_19.txt
data/talk_training_19.txt
data/talk_validation_19.txt


# Training Model

In [11]:
# FastText Parameters
# Every value is kept as a string and handed to ftc.train_test_predict(...)
# in the training cells below.
# NOTE(review): the names look like fastText's -wordNgrams / -lr / -dim / -ws /
# -epoch / -loss options ('ns' presumably = negative sampling) -- confirm in ftCommands.
ngram = '2'
lr = '0.5'
dim = '200'
ws = '5'
epoch = '25'
loss = 'ns'

## Tier 1

In [12]:
# For each split, take the first t1_ids key containing the split's name and
# append '.txt'. Note that the "test" names refer to the validation split.
train_filename, test_filename, holdout_filename = (
    [key for key in t1_ids.keys() if split in key][0] + '.txt'
    for split in ('training', 'validation', 'holdout')
)
# Prefix each file name with the data directory.
train_address, test_address, holdout_address = (
    data_path + filename
    for filename in (train_filename, test_filename, holdout_filename)
)

In [13]:
# Run the project helper on the tier-1 training and validation files; it returns
# the model path and the prediction-file path, which are printed one per line.
fasttext_params = (ngram, lr, dim, ws, epoch, loss)
model_address, predict_address = ftc.train_test_predict(
    train_address, test_address, *fasttext_params)
print(model_address, predict_address, sep='\n')

N	3516
P@1	0.902
R@1	0.902

data/T1_model
data/T1_prediction_19.txt


In [14]:
# Score the tier-1 validation run from the labelled validation file and the
# prediction file produced by the training cell.
# NOTE(review): how precision / recall / fscore are averaged is decided inside
# evaluatePredictions and is not visible here -- confirm before comparing with P@1.
precision, recall, fscore, support = ep.score_txtfiles(test_address, predict_address)

precision: 0.8905375812624793
recall: 0.8758823788944
fscore: 0.8826725725846437
support: [913 166 751 739 460 487]


## Tier 2

In [15]:
# Train one tier-2 model per remaining tier-1 label, in alphabetical order.
# train_address / test_address / model_address / predict_address are overwritten
# on every pass, so after the loop they refer to the last label processed.
seed_suffix = str(random_state)
for t1_label in sorted(tier1_targets):
    train_address = data_path + '%s_training_%s.txt' % (t1_label, seed_suffix)
    test_address = data_path + '%s_validation_%s.txt' % (t1_label, seed_suffix)

    print('\n***', t1_label, '***')
    model_address, predict_address = ftc.train_test_predict(
        train_address, test_address, ngram, lr, dim, ws, epoch, loss)
    print(model_address)


*** comp ***
N	917
P@1	0.833
R@1	0.833

data/comp_model

*** rec ***
N	734
P@1	0.931
R@1	0.931

data/rec_model

*** sci ***
N	741
P@1	0.947
R@1	0.947

data/sci_model

*** soc ***
N	451
P@1	0.825
R@1	0.825

data/soc_model

*** talk ***
N	499
P@1	0.902
R@1	0.902

data/talk_model


# Holdout set

## Tier 1

### Using Model

In [16]:
# model_address and test_address were last overwritten by the tier-2 training loop,
# so point them back at the tier-1 model and at the holdout text file.
holdout_txt = 'holdout_%s.txt' % str(random_state)
model_address = data_path + 'T1_model'
test_address = data_path + holdout_txt

In [17]:
# Apply the already-trained tier-1 model to the holdout file (no retraining);
# the helper returns the path of the prediction file it wrote.
predict_address = ftc.test_predict(test_address, model_address)
print(predict_address)

N	4688
P@1	0.917
R@1	0.917

data/holdout_prediction_19.txt


In [18]:
# Score the tier-1 holdout predictions against the labelled holdout file.
# NOTE(review): the averaging used for precision / recall / fscore is defined in
# evaluatePredictions and is not visible here.
precision, recall, fscore, support = ep.score_txtfiles(test_address, predict_address)

precision: 0.9062264886158423
recall: 0.8951481608099091
fscore: 0.9004165917597077
support: [1181  244 1034  980  619  630]


### Update df with Predictions

In [19]:
# Keep only the rows whose _id belongs to the holdout split.
holdout_key = 'holdout_' + str(random_state)
is_holdout = data['_id'].isin(t1_ids[holdout_key])
holdout_data = data[is_holdout]
print(holdout_data.shape)

(4688, 7)


In [20]:
# The first 9 characters of each collected label are dropped before the int
# conversion (9 == len('__label__'), presumably fastText's label prefix -- confirm).
label_prefix_len = 9
predict_labels = []
for raw_label in ep.collect_labels(predict_address):
    predict_labels.append(int(raw_label[label_prefix_len:]))

# The list is inserted positionally, so the predictions must be in the same
# row order as holdout_data.
holdout_data.insert(loc=0, column='tier1_predictions', value=predict_labels)
print(holdout_data.shape)

(4688, 8)


## Tier 2

### Tier2 Txt Files

In [21]:
# Route each holdout row by its *predicted* tier-1 target and write one
# text file per tier-1 label, using the 'tier2_targets' column.
for t1_name, t1_target in tier1_targets.items():
    routed_rows = holdout_data[holdout_data['tier1_predictions'] == t1_target]
    filename = data_path + 'holdout_%s.txt' % t1_name
    tf.make_txtfile(filename, routed_rows, 'tier2_targets')

### Use Models

In [22]:
# Apply each tier-2 model to the holdout rows routed to its tier-1 label.
# NOTE(review): in the recorded run the N printed here (e.g. 986 for rec) is smaller
# than the number of rows routed to that label (1039) -- presumably rows whose true
# tier-2 label is unknown to the model are not scored; confirm in ftCommands.
for t1_name in tier1_targets.keys():
    print('\n***', t1_name, '***')
    model_address = data_path + '%s_model' % t1_name
    test_address = data_path + 'holdout_%s.txt' % t1_name
    predict_address = ftc.test_predict(test_address, model_address)
    print(predict_address)


*** rec ***
N	986
P@1	0.953
R@1	0.953

data/holdout_prediction_rec.txt

*** comp ***
N	1105
P@1	0.86
R@1	0.86

data/holdout_prediction_comp.txt

*** talk ***
N	577
P@1	0.939
R@1	0.939

data/holdout_prediction_talk.txt

*** sci ***
N	884
P@1	0.974
R@1	0.974

data/holdout_prediction_sci.txt

*** soc ***
N	558
P@1	0.855
R@1	0.855

data/holdout_prediction_soc.txt


### Update df with Predictions

In [23]:
# Collect, per tier-1 label, the holdout rows routed to that label together with
# the tier-2 predictions read back from the matching prediction file.
tier2_pred_dfs = {}
for t1_name, t1_target in tier1_targets.items():
    print('***', t1_name, '***')
    predict_address = data_path + 'holdout_prediction_%s.txt' % t1_name
    # Drop the 9-character label prefix before converting to int.
    predict_labels = [int(raw_label[9:])
                      for raw_label in ep.collect_labels(predict_address)]

    routed_rows = holdout_data[holdout_data['tier1_predictions'] == t1_target]
    # Positional insert: the prediction file must be in the same row order as routed_rows.
    routed_rows.insert(loc=0, column='tier2_predictions', value=predict_labels)
    print(routed_rows.shape)
    tier2_pred_dfs[t1_name] = routed_rows

*** rec ***
(1039, 9)
*** comp ***
(1216, 9)
*** talk ***
(616, 9)
*** sci ***
(991, 9)
*** soc ***
(599, 9)


In [24]:
# Rows predicted as tier-1 target 2 are stored under the 'misc' key and all receive
# the same fixed tier-2 prediction (21); no tier-2 model is applied to them.
misc_tier1_target = 2
misc_tier2_label = 21
misc_rows = holdout_data[holdout_data['tier1_predictions'] == misc_tier1_target]
misc_rows.insert(loc=0, column='tier2_predictions', value=misc_tier2_label)
tier2_pred_dfs['misc'] = misc_rows

In [25]:
# Stack the per-label frames (including the 'misc' frame) back into a single frame.
combined_holdout = pd.concat(list(tier2_pred_dfs.values()))

# Every holdout row must land in exactly one per-label frame. If a frame is missing
# (e.g. the 'misc' cell was skipped) or a tier-1 prediction has no matching frame,
# rows would silently disappear from the evaluation below -- fail loudly instead.
assert len(combined_holdout) == len(holdout_data), (
    'row count changed while merging tier-2 predictions: %d -> %d'
    % (len(holdout_data), len(combined_holdout)))

holdout_data = combined_holdout
holdout_data.shape

(4688, 9)

## Holdout Set Hierarchical Evaluation

In [26]:
# End-to-end (tier 1 -> tier 2) evaluation on the holdout set: score the
# 'tier2_predictions' column against the 'tier2_targets' column.
# NOTE(review): the averaging used for precision / recall / fscore is defined in
# evaluatePredictions and is not visible here.
precision, recall, fscore, support = ep.score_columns(holdout_data, 'tier2_targets', 'tier2_predictions')

precision: 0.8427593764773548
recall: 0.8374776844288027
fscore: 0.8383926824765956
support: [228 274 230 209 240 244 249 258 268 259 245 236 239 260 204 256 159 237
 211 182]
