In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from collections import Counter

In [None]:
def _first_key_word():
    """Return the first word of the global NumPy RNG's key vector."""
    return np.random.get_state()[1][0]


# Before seeding: whatever state NumPy started with.
state = _first_key_word()
print(state)

# Fix the seed so the shuffles further down are repeatable, then show the
# state again.
np.random.seed(seed=711)
state = _first_key_word()
print(state)

In [None]:
# Vocabulary (label set) size per treebank id (tbid).
# NOTE(review): 'bg_btb' and 'et_edt' carry the same value (178), and the
# training-size table keys Estonian as 'et_ewt' rather than 'et_edt' --
# confirm these entries against the source data.
vocab_dict = dict(
    ar_padt=1123,
    bg_btb=178,
    cs_cac=646,
    cs_fictree=301,
    cs_pdt=782,
    en_ewt=389,
    et_edt=178,
    fr_sequoia=72,
    fi_tdt=438,
    it_isdt=353,
    nl_alpino=424,
    nl_lassysmall=304,
    lt_alksnis=205,
    lv_lvtb=134,
    pl_lfg=180,
    pl_pdb=890,
    ru_syntagrus=657,
    sk_snk=282,
    sv_talbanken=326,
    ta_ttb=123,
    uk_iu=362,
)

In [None]:
# Training set size per treebank id (tbid).
# NOTE(review): Estonian is keyed 'et_ewt' here, while vocab_dict and
# TBIDS_TO_SKIP use 'et_edt' -- confirm which id is intended.
training_dict = dict(
    ar_padt=6075,
    bg_btb=8907,
    cs_cac=23478,
    cs_fictree=10160,
    cs_pdt=68495,
    en_ewt=12543,
    et_ewt=1116,
    fr_sequoia=2231,
    fi_tdt=12217,
    it_isdt=13121,
    nl_alpino=12264,
    nl_lassysmall=5787,
    lt_alksnis=2341,
    lv_lvtb=10156,
    pl_lfg=13774,
    pl_pdb=17722,
    ru_syntagrus=48814,
    sk_snk=8483,
    sv_talbanken=4303,
    ta_ttb=400,
    uk_iu=5496,
)

In [None]:
# Treebanks left out of the analysis. Author's notes on the first three:
#   - pl_lfg: pl_pdb was used for the test set
#   - et_ewt: rule-based was used
#   - fr_sequoia: doesn't apply the rule
# NOTE(review): the note names et_ewt but the list entry is 'et_edt' --
# confirm which id appears in the CSV.
# The remaining six ids extend an earlier three-item version of this list;
# presumably they are the outliers the "*_no_outliers.png" file names refer
# to -- confirm.
TBIDS_TO_SKIP = [
    'et_edt', 'fr_sequoia', 'pl_lfg',
    'ar_padt', 'ta_ttb', 'fi_tdt', 'lv_lvtb', 'lt_alksnis', 'ru_syntagrus',
]

In [None]:
def _parse_rate(field):
    """Return the number left of any '%' in `field`, multiplied by 100.

    NOTE(review): the x100 scaling implies the CSV holds fractions (e.g. 0.85)
    rather than whole percentages -- confirm against ../case.csv.
    """
    return float(field.split("%")[0]) * 100


tbids = []

# gold and system success rates (CSV columns 2 and 3)
gold_success = []
system_success = []

# Y / Response / Dependent Variable
# difference in rule success rate between gold and system (CSV column 4)
diffs = []

# X / Predictor / Independent Variable(s)
# vocabulary (label set) size
# training set size
vocab_sizes = []
training_sizes = []

with open("../case.csv") as f:
    for i, line in enumerate(f):
        # Line 0 is the header. Blank lines (e.g. a trailing newline at the
        # end of the file) are skipped; they would otherwise raise IndexError
        # on items[1].
        if i == 0 or not line.strip():
            continue

        items = line.split(",")
        tbid = items[0].strip()
        if tbid in TBIDS_TO_SKIP:
            continue

        gold_s = _parse_rate(items[1])
        system_s = _parse_rate(items[2])
        diff = _parse_rate(items[3])
        if diff > 10:
            print("warning: considerable difference for {}".format(tbid))

        tbids.append(tbid)
        gold_success.append(gold_s)
        system_success.append(system_s)
        diffs.append(diff)

        # A tbid missing from either lookup table is only reported here and
        # nothing is appended for it, so the list lengths diverge and the
        # length assertion in the next cell fails.
        if tbid in vocab_dict:
            vocab_sizes.append(vocab_dict[tbid])
        else:
            print("warning: no vocab size found for {}".format(tbid))

        if tbid in training_dict:
            training_sizes.append(training_dict[tbid])
        else:
            print("warning: no training size found for {}".format(tbid))

In [None]:
# Dump every collected list, one per line, for a quick visual check.
print(
    tbids,
    gold_success,
    system_success,
    diffs,
    vocab_sizes,
    training_sizes,
    sep="\n",
)

# All six lists must have the same length; a single distinct length means
# no treebank was missing from a lookup table.
assert len({
    len(tbids),
    len(diffs),
    len(vocab_sizes),
    len(gold_success),
    len(system_success),
    len(training_sizes),
}) == 1

### Bar plot

In [None]:
# Default figure size (width, height) and font size for the plots below.
# Set through pyplot's own rcParams, so no extra mid-notebook import from
# pylab is needed.
plt.rcParams.update({
    'figure.figsize': (20, 10),
    'font.size': 10.,
})

In [None]:
N = len(tbids)

ind = np.arange(N)
width = 0.35

# Two bars per treebank: gold at the tick position, system shifted right by
# one bar width.
for offset, rates, series_label in (
    (0, gold_success, 'GOLD'),
    (width, system_success, 'SYSTEM'),
):
    plt.bar(ind + offset, rates, width, label=series_label)

plt.ylabel('Num case dependent lemmas successfully attached')
plt.title('Case Rule Success')

# Centre each tick between its pair of bars and label it with the tbid.
plt.xticks(ind + width / 2, tbids)
plt.legend(loc='best')
plt.savefig('case_success.png')
plt.show()

### Regression 1: gold-system difference vs. vocabulary size

In [None]:
# Predictor: vocabulary sizes as a single-feature column (one row per kept tbid).
X = np.array(vocab_sizes).reshape(-1, 1)

# Response: gold-vs-system differences, also as a column.
Y = np.array(diffs).reshape(-1, 1)

In [None]:
# Fit a linear regression of the difference (Y) on vocabulary size (X).
# The fit call is the cell's last expression, so the fitted estimator is displayed.
model = linear_model.LinearRegression()
model.fit(X,Y)

In [None]:
# Slope fitted on the real (unshuffled) data; the shuffling loop further down
# compares every shuffled slope against this value.
true_coefficient = model.coef_
print("true coefficient: {}".format(true_coefficient))

In [None]:
# Display the intercept of the fitted line.
model.intercept_

In [None]:
# Fitted values at the observed X; used to draw the regression line in the plot.
pred_Y = model.predict(X)

In [None]:
# Attach each label to its own artist. A bare list passed to plt.legend() is
# paired with artists by matplotlib's internal handle order, so the two
# labels could end up swapped between the points and the line.
plt.scatter(X, Y, color='b', label='TBID')
plt.plot(X, pred_Y, 'r', label='Predicted line')
plt.legend()
plt.xlabel('Vocabulary Size')
plt.ylabel('Case Rule Success Difference between Gold and System')
plt.savefig('case_diff_vocab_no_outliers.png')
plt.show()

In [None]:
# Randomly shuffle Y: refit the regression on shuffled responses many times
# and tally how the shuffled slopes compare with zero and with
# true_coefficient.
N_SHUFFLES = 10000

coefficients = []
intercepts = []
counts = Counter()

for i in range(N_SHUFFLES):
    # Shuffle a fresh copy under its own name so the notebook-level Y (used
    # by the plot cell) is not replaced by permuted data.
    shuffled_Y = np.array(diffs)
    np.random.shuffle(shuffled_Y)
    shuffled_Y = shuffled_Y.reshape(-1, 1)

    # Separate name for the same reason: `model` keeps the fit on the real
    # data, so re-running the coefficient/plot cells stays correct.
    shuffled_model = linear_model.LinearRegression()
    shuffled_model.fit(X, shuffled_Y)

    # With one feature column and a column-vector target, coef_ holds a
    # single value, so the comparisons below give one-element arrays that
    # `if` can evaluate.
    coefficient = shuffled_model.coef_
    intercept = shuffled_model.intercept_

    if coefficient > 0:
        counts["coefficient is positive"] += 1
    elif coefficient < 0:
        counts["coefficient is negative"] += 1

    if coefficient > true_coefficient:
        counts["shuffled coefficient is more positive than true coefficient"] += 1
    elif coefficient < true_coefficient:
        counts["shuffled coefficient is more negative than true coefficient"] += 1

    coefficients.append(coefficient)
    intercepts.append(intercept)

In [None]:
# Show how often the shuffled slopes were positive/negative and above/below the true slope.
print(counts)

### Regression 2: gold-system difference vs. training set size

In [None]:
# Predictor: training set sizes as a single-feature column (one row per kept tbid).
X = np.array(training_sizes).reshape(-1, 1)

# Response: rebuilt from diffs, as a column.
Y = np.array(diffs).reshape(-1, 1)

In [None]:
# Fit a linear regression of the difference (Y) on training set size (X).
# The fit call is the cell's last expression, so the fitted estimator is displayed.
model = linear_model.LinearRegression()
model.fit(X,Y)

In [None]:
# Slope fitted on the real (unshuffled) data; the shuffling loop further down
# compares every shuffled slope against this value.
true_coefficient = model.coef_
print("true coefficient: {}".format(true_coefficient))

In [None]:
# Display the intercept of the fitted line.
model.intercept_

In [None]:
# Fitted values at the observed X; used to draw the regression line in the plot.
pred_Y = model.predict(X)

In [None]:
# Attach each label to its own artist. A bare list passed to plt.legend() is
# paired with artists by matplotlib's internal handle order, so the two
# labels could end up swapped between the points and the line.
plt.scatter(X, Y, color='b', label='TBID')
plt.plot(X, pred_Y, 'r', label='Predicted line')
plt.legend()
plt.xlabel('Training Set Size')
plt.ylabel('Case Rule Success Difference between Gold and System')
plt.savefig('case_diff_training_no_outliers.png')
plt.show()

In [None]:
# Randomly shuffle Y: refit the regression on shuffled responses many times
# and tally how the shuffled slopes compare with zero and with
# true_coefficient.
N_SHUFFLES = 10000

coefficients = []
intercepts = []
counts = Counter()

for i in range(N_SHUFFLES):
    # Shuffle a fresh copy under its own name so the notebook-level Y (used
    # by the plot cell) is not replaced by permuted data.
    shuffled_Y = np.array(diffs)
    np.random.shuffle(shuffled_Y)
    shuffled_Y = shuffled_Y.reshape(-1, 1)

    # Separate name for the same reason: `model` keeps the fit on the real
    # data, so re-running the coefficient/plot cells stays correct.
    shuffled_model = linear_model.LinearRegression()
    shuffled_model.fit(X, shuffled_Y)

    # With one feature column and a column-vector target, coef_ holds a
    # single value, so the comparisons below give one-element arrays that
    # `if` can evaluate.
    coefficient = shuffled_model.coef_
    intercept = shuffled_model.intercept_

    if coefficient > 0:
        counts["coefficient is positive"] += 1
    elif coefficient < 0:
        counts["coefficient is negative"] += 1

    if coefficient > true_coefficient:
        counts["shuffled coefficient is more positive than true coefficient"] += 1
    elif coefficient < true_coefficient:
        counts["shuffled coefficient is more negative than true coefficient"] += 1

    coefficients.append(coefficient)
    intercepts.append(intercept)

In [None]:
# Show how often the shuffled slopes were positive/negative and above/below the true slope.
print(counts)