In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model

In [None]:
# Label-set ("vocabulary") size for each treebank id (tbid).
# NOTE(review): the Estonian key here is 'et_edt', whereas training_dict
# uses 'et_ewt' -- confirm which treebank id is intended.
vocab_dict = dict(
    ar_padt=1123,
    bg_btb=178,
    cs_cac=646,
    cs_fictree=301,
    cs_pdt=782,
    en_ewt=389,
    et_edt=178,
    fr_sequoia=72,
    fi_tdt=438,
    it_isdt=353,
    nl_alpino=424,
    nl_lassysmall=304,
    lt_alksnis=205,
    lv_lvtb=134,
    pl_lfg=180,
    pl_pdb=890,
    ru_syntagrus=657,
    sk_snk=282,
    sv_talbanken=326,
    ta_ttb=123,
    uk_iu=362,
)

In [None]:
# Training-set size for each treebank id (tbid).
# NOTE(review): the Estonian key here is 'et_ewt', whereas vocab_dict and
# TBIDS_TO_SKIP use 'et_edt' -- confirm which treebank id is intended.
training_dict = dict(
    ar_padt=6075,
    bg_btb=8907,
    cs_cac=23478,
    cs_fictree=10160,
    cs_pdt=68495,
    en_ewt=12543,
    et_ewt=1116,
    fr_sequoia=2231,
    fi_tdt=12217,
    it_isdt=13121,
    nl_alpino=12264,
    nl_lassysmall=5787,
    lt_alksnis=2341,
    lv_lvtb=10156,
    pl_lfg=13774,
    pl_pdb=17722,
    ru_syntagrus=48814,
    sk_snk=8483,
    sv_talbanken=4303,
    ta_ttb=400,
    uk_iu=5496,
)

In [None]:
# Treebanks excluded from the analysis:
#   - pl_lfg: pl_pdb was used for the test set
#   - et_ewt: rule-based was used
#   - fr_sequoia: does not apply the rule
# NOTE(review): the rationale above names et_ewt, but the list below skips
# 'et_edt' -- confirm which treebank id is intended.
TBIDS_TO_SKIP = [
    'et_edt',
    'fr_sequoia',
    'pl_lfg',
]
# Alternative skip list that additionally drops ar_padt and ta_ttb:
#TBIDS_TO_SKIP = ['et_edt', 'fr_sequoia', 'pl_lfg', 'ar_padt', 'ta_ttb']

In [None]:
# Parallel per-treebank series: position k in every list refers to tbids[k].
tbids = []

# gold and system success rates
gold_success = []
system_success = []

# Response/Dependent Variable
# y values: differences between success rate of rule on gold vs. dev
diffs = []

# Predictor/Independent Variable(s)
# x values: vocabulary (label set) size
# x values: training set size
vocab_sizes = []
training_sizes = []


def _scaled_field(field):
    """Parse one CSV cell into a float.

    Everything from the first '%' onwards is dropped, the remainder is
    converted with float() and the result is multiplied by 100.

    NOTE(review): the x100 scaling presumably means the CSV stores fractions
    (e.g. 0.85) -- confirm against ../case.csv.

    Raises:
        ValueError: if the remaining text is not a valid float.
    """
    return float(field.split("%")[0]) * 100


with open("../case.csv") as f:
    # The first line is the header; the default keeps an empty file harmless.
    next(f, None)
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at EOF) hold no record;
            # indexing their fields would raise IndexError.
            continue

        items = line.split(",")
        tbid = items[0]
        if tbid in TBIDS_TO_SKIP:
            continue

        # Convert every numeric column before touching the result lists so a
        # malformed row cannot leave them partially updated.
        gold = _scaled_field(items[1])
        system = _scaled_field(items[2])
        diff = _scaled_field(items[3])

        if diff > 10:
            print("warning: considerable difference for {}".format(tbid))

        tbids.append(tbid)
        gold_success.append(gold)
        system_success.append(system)
        diffs.append(diff)

        # A missing size is only reported, not appended: the affected list
        # then ends up shorter than the others, which a length comparison
        # of the series will expose.
        if tbid in vocab_dict:
            vocab_sizes.append(vocab_dict[tbid])
        else:
            print("warning: no vocab size found for {}".format(tbid))

        if tbid in training_dict:
            training_sizes.append(training_dict[tbid])
        else:
            print("warning: no training size found for {}".format(tbid))

In [None]:
# Dump every collected series so the parsed values can be inspected.
_series = (tbids, gold_success, system_success, diffs, vocab_sizes, training_sizes)
for _values in _series:
    print(_values)

# Every series must hold exactly one entry per treebank, i.e. all lengths agree.
assert len({len(_values) for _values in _series}) == 1

### Bar plot

In [None]:
# Notebook-wide matplotlib defaults: 20x10 figure size and base font size 10.
# Both settings go through plt.rcParams (pyplot is already imported at the
# top of the notebook), so no extra mid-notebook pylab import is needed.
plt.rcParams.update({
    'figure.figsize': (20, 10),
    'font.size': 10.,
})

In [None]:
# Grouped bar chart: one GOLD bar and one SYSTEM bar per treebank.
N = len(tbids)
width = 0.35
ind = np.arange(N)

plt.title('Case Rule Success')
plt.ylabel('Num case dependent lemmas successfully attached')

# The SYSTEM bars are shifted right by one bar width so each pair sits side by side.
plt.bar(ind, gold_success, width=width, label='GOLD')
plt.bar(ind + width, system_success, width=width, label='SYSTEM')

# Put each tick halfway between the two bar positions of its treebank.
plt.xticks(ind + width / 2, tbids)
plt.legend(loc='best')

plt.savefig('case_success.png')
plt.show()

### Regression

In [None]:
# Predictor: label-set size per treebank, reshaped into an (n, 1) column.
X = np.array(vocab_sizes).reshape(-1, 1)

# Response: per-treebank success-rate difference, reshaped into an (n, 1) column.
Y = np.array(diffs).reshape(-1, 1)

In [None]:
# Fit a linear regression of Y on X; the fit call stays the last expression
# so the fitted estimator is shown as the cell output.
model = linear_model.LinearRegression()
model.fit(X=X, y=Y)

In [None]:
# Coefficient(s) of the fitted model, shown as the cell output.
model.coef_

In [None]:
# Intercept of the fitted model, shown as the cell output.
model.intercept_

In [None]:
# Model predictions for the same X the model was fitted on; plotted below as the regression line.
pred_Y = model.predict(X)

In [None]:
# Observed points (one per treebank) plus the fitted regression line.
# Each artist carries its own label: a positional plt.legend([...]) pairs
# labels with artists purely by order, which can silently attach a label to
# the wrong artist (here the scatter is drawn before the line).
plt.scatter(X, Y, color='b', label='TBID')
plt.plot(X, pred_Y, 'r', label='Predicted line')
plt.legend()
plt.xlabel('Vocabulary Size')
plt.ylabel('Case Rule Success Difference between Gold and System')
plt.savefig('case_diff_vocab.png')
plt.show()

### Regression 2

In [None]:
# Predictor: training-set size per treebank, reshaped into an (n, 1) column.
X = np.array(training_sizes).reshape(-1, 1)

# Response: per-treebank success-rate difference, reshaped into an (n, 1) column.
Y = np.array(diffs).reshape(-1, 1)

In [None]:
# Fit a linear regression of Y on X; the fit call stays the last expression
# so the fitted estimator is shown as the cell output.
model = linear_model.LinearRegression()
model.fit(X=X, y=Y)

In [None]:
# Coefficient(s) of the fitted model, shown as the cell output.
model.coef_

In [None]:
# Intercept of the fitted model, shown as the cell output.
model.intercept_

In [None]:
# Model predictions for the same X the model was fitted on; plotted below as the regression line.
pred_Y = model.predict(X)

In [None]:
# Observed points (one per treebank) plus the fitted regression line.
# Each artist carries its own label: a positional plt.legend([...]) pairs
# labels with artists purely by order, which can silently attach a label to
# the wrong artist (here the scatter is drawn before the line).
plt.scatter(X, Y, color='b', label='TBID')
plt.plot(X, pred_Y, 'r', label='Predicted line')
plt.legend()
plt.xlabel('Training Set Size')
plt.ylabel('Case Rule Success Difference between Gold and System')
plt.savefig('case_diff_training.png')
plt.show()