## Load model

### Input parameters

In [103]:
dataset_file = './trainingdata_stepwise_turkish.tsv'
model_load_file = 'none'

### Load the packages and functions

In [104]:
from dev import *
import re
import pandas as pd
import numpy as np
import statsmodels.api as sm

data_stepwise = Dataset(dataset_file)
model = Seq2Seq(training_data=data_stepwise)



#### Create a new model or load a previous one

In [105]:
# load a previous model
if model_load_file != 'none':
    model = Seq2Seq(load=model_load_file)

In [106]:
# create a new model
if model_load_file == 'none':
    model.train_model(training_data=data_stepwise, n_epochs=200)

 10%|█         | 21/200 [00:25<03:37,  1.21s/it]

## Evaluate model

### Run the model on all 2 syllable words

In [None]:
# load dataframe
data = pd.read_csv(dataset_file, sep='\t')
data = data[data['syllables']==2]

# helper function to get decoder outputs
def get_decoder(input: torch.Tensor, target: torch.Tensor) -> np.ndarray:
    with torch.no_grad():
        _, attn_map_seq = model(input, target)
    return attn_map_seq.numpy()[:,0] # attention paid to the first letter

# helper functions to get correct inputs
def get_trial(training_data, word):
    trial = training_data.make_trial(word)
    return trial[0], torch.cat((trial[1], trial[2]), axis=1)

# get the decoder outputs for each word
get_out = lambda x, y : pd.DataFrame(get_decoder(*get_trial(x, y)))
df = get_out(data_stepwise, data['underlying'].values[0]).T

for i in range(1, data['underlying'].shape[0]):
    df = pd.concat(
        (df, get_out(data_stepwise, data['underlying'].values[i]).T),
        axis=0
    )

df = df.reset_index().drop('index', axis=1)

# add columns
for c in ['underlying', 'consonant', 'vowel']:
    col = data[c]
    col = col.reset_index().drop('index', axis=1)
    df[c] = col

df = df.rename({'vowel': "V2"}, axis=1)
df = df.assign(
    V1 = lambda d: d['underlying'].astype(str).str[0]
)

print(df)

           0         1         2         3         4         5         6  \
0   0.159603  0.159574  0.159588  0.159598  0.159599  0.159599  0.159599   
1   0.158703  0.158714  0.158722  0.158729  0.158729  0.158729  0.158729   
2   0.159603  0.159574  0.159588  0.159598  0.159599  0.159599  0.159599   
3   0.155098  0.155115  0.155114  0.155122  0.155123  0.155123  0.155123   
4   0.155098  0.155115  0.155114  0.155122  0.155123  0.155123  0.155123   
5   0.157877  0.157833  0.157838  0.157852  0.157851  0.157852  0.157852   
6   0.157877  0.157833  0.157838  0.157852  0.157851  0.157852  0.157852   
7   0.158703  0.158714  0.158722  0.158729  0.158729  0.158729  0.158729   
8   0.169476  0.169465  0.169462  0.169469  0.169469  0.169468  0.169468   
9   0.168214  0.168218  0.168216  0.168229  0.168230  0.168228  0.168229   
10  0.169476  0.169465  0.169462  0.169469  0.169469  0.169468  0.169468   
11  0.168214  0.168218  0.168216  0.168229  0.168230  0.168228  0.168229   
12  0.168214

### Prep dataframe for analysis

In [None]:
# create additional categorical values
df_a = df.assign(
    rounded = lambda d: d["V1"].apply(lambda y: 1 if y in ["ø", "u", "y", "o"] else 0)
)
df_b = df_a.assign(
    fronted = lambda d: d["V1"].apply(lambda y: 1 if y in ["ø", "e", "y", "i"] else 0)
)
df_c = df_b.assign(
    high = lambda d: d["V1"].apply(lambda y: 1 if y in["ø", "u", "y", "o", "i", "ɯ"] else 0)
)

In [None]:
df_melt = pd.melt(
    frame=df_c,
    id_vars=["V1", "V2", "consonant", "underlying", "fronted", "rounded", "high"],
    value_name="Attention",
    value_vars=[5, 6, 7, 8, 9],
    var_name="Time"
)

# set the categories as well
df_mle = df_melt.astype(
    {
        "Time": 'int64', 
        "V1": 'category', 
        "V2": 'category', 
        "consonant": 'category', 
        "fronted": 'category', 
        "rounded": 'category', 
        "high": 'category', 
        "underlying": 'category'
    }
)
print(df_mle)

    V1 V2 consonant underlying fronted rounded high  Time  Attention
0    i  H         b       ib-H       1       0    1     5   0.159599
1    a  H         b       ab-H       0       0    0     5   0.158729
2    e  H         b       eb-H       1       0    0     5   0.159599
3    o  H         b       ob-H       0       1    1     5   0.155123
4    u  H         b       ub-H       0       1    1     5   0.155123
..  .. ..       ...        ...     ...     ...  ...   ...        ...
235  o  L         d       od-L       0       1    1     9   0.158606
236  u  L         d       ud-L       0       1    1     9   0.158606
237  y  L         d       yd-L       1       1    1     9   0.159755
238  ø  L         d       ød-L       1       1    1     9   0.159755
239  ɯ  L         d       ɯd-L       0       0    1     9   0.158606

[240 rows x 9 columns]


### Run the analysis on the data

In [None]:
# data = sm.datasets.get_rdataset("dietox", "geepack").data
# print(data.dtypes)

Pig         int64
Evit       object
Cu         object
Litter      int64
Start     float64
Weight    float64
Feed      float64
Time        int64
dtype: object


In [None]:
import statsmodels.formula.api as smf

md = smf.ols("Attention ~ Time + rounded + fronted + high + V2", df_mle, groups=df_mle["underlying"])
mdf = md.fit()
print(mdf.summary())

                            OLS Regression Results                            
Dep. Variable:              Attention   R-squared:                       0.540
Model:                            OLS   Adj. R-squared:                  0.530
Method:                 Least Squares   F-statistic:                     54.94
Date:                Tue, 31 May 2022   Prob (F-statistic):           1.35e-37
Time:                        14:31:39   Log-Likelihood:                 959.15
No. Observations:                 240   AIC:                            -1906.
Df Residuals:                     234   BIC:                            -1885.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        0.1552      0.002     96.057   



In [None]:
from statsmodels.stats.multicomp import MultiComparison

mc = MultiComparison(df_mle['Attention'], groups=df_mle['fronted'])
mc.tukeyhsd().summary()

group1,group2,meandiff,p-adj,lower,upper,reject
0,1,0.0015,0.0737,-0.0001,0.0032,False
