<center>
<img src="https://laelgelcpublic.s3.sa-east-1.amazonaws.com/lael_50_years_narrow_white.png.no_years.400px_96dpi.png" width="300" alt="LAEL 50 years logo">
<h3>APPLIED LINGUISTICS GRADUATE PROGRAMME (LAEL)</h3>
</center>
<hr>

# Corpus Linguistics - Study 2 - Phase 1 - Eduardo

This phase aims to add Eduardo's target corpus to the Brazilian Portuguese Multi-Dimensional Analysis.

## Required Python packages

- pandas

## Import the required libraries

In [1]:
import pandas as pd
import os
import sys

## Define input variables

In [2]:
# Folder the tag-count tables are read from (normed.tsv is loaded from here)
input_directory = "tagcount/sas"
# Folder the exported .xlsx files are written to
output_directory = "cl_st2_ph1_eduardo"

## Create output directory

In [3]:
# Make sure the output directory exists before anything is exported to it.
# os.path.isdir is used instead of os.path.exists so that a regular file that
# happens to carry the same name is not mistaken for the directory (the later
# exports would then fail with a less obvious error).
if os.path.isdir(output_directory):
    print('Output directory already exists.')
else:
    try:
        os.makedirs(output_directory)
        print('Output directory successfully created.')
    except OSError as e:
        # Reached for e.g. permission problems or a non-directory occupying the path;
        # stop here because every export below needs this directory.
        print('Failed to create the directory:', e)
        sys.exit(1)

Output directory already exists.


## Import the data into DataFrames

### Normed frequency counts for the linguistic features

In [4]:
# Load the tab-separated table of normed frequency counts from the input directory
df_normed = pd.read_csv(f"{input_directory}/normed.tsv", sep='\t')

In [5]:
df_normed

Unnamed: 0,file,v000182,v000184,v000181,v000175,v000185,v000172,v000186,v000174,v000171,...,v000126,v000115,v000146,v000114,v000027,v000038,v000103,v000086,v000034,v000078
0,t000001,15.873016,10.582011,26.455026,21.164021,84.656085,15.873016,169.312169,15.873016,37.037037,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,t000002,8.219178,0.0,30.136986,57.534247,115.068493,8.219178,180.821918,5.479452,43.835616,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,t000003,12.658228,0.0,21.097046,54.852321,97.046414,0.0,189.873418,4.219409,50.632911,...,4.219409,8.438819,4.219409,8.438819,12.658228,4.219409,0.0,0.0,0.0,0.0
3,t000004,11.278195,11.278195,37.593985,30.075188,97.744361,11.278195,191.729323,3.759398,37.593985,...,0.0,0.0,3.759398,0.0,0.0,0.0,3.759398,7.518797,0.0,0.0
4,t000005,6.711409,0.0,16.778523,36.912752,124.161074,0.0,197.986577,13.422819,67.114094,...,6.711409,0.0,0.0,0.0,10.067114,0.0,0.0,0.0,3.355705,0.0
5,t000006,14.577259,2.915452,11.661808,26.239067,72.886297,0.0,139.941691,8.746356,23.323615,...,0.0,0.0,0.0,0.0,0.0,0.0,2.915452,0.0,0.0,5.830904


### Variable index

In [6]:
# 'var_index.txt' is read as space-delimited with no header row; column names are assigned in the next cell
df_var_index = pd.read_csv('var_index.txt', sep=' ', header=None)

In [7]:
# Name the three columns of the variable index (ID, short label, dotted descriptor)
df_var_index.columns = ['Variable ID', 'Variable Label', 'Variable Descriptor']

In [8]:
df_var_index

Unnamed: 0,Variable ID,Variable Label,Variable Descriptor
0,v000001,adjaffi,adj.adjective_affiliative.adjaffi
1,v000002,adjall,adj.adjective_all.adjall
2,v000003,adjattr,adj.adjective_attributive.adjattr
3,v000004,adjcolr,adj.adjective_color.adjcolr
4,v000005,adjeval,adj.adjective_evaluative.adjeval
...,...,...,...
185,v000186,vball,vb.verbs_all.vball
186,v000187,clauselgth,vd.clause_length.clauselgth
187,v000188,ttr,vd.type-token_ratio.ttr
188,v000189,wrcount,vd.word_count.wrcount


In [9]:
df_var_index.dtypes

Variable ID            object
Variable Label         object
Variable Descriptor    object
dtype: object

### CBVR dimension variables

In [10]:
# 'var_dim.tsv' is read as tab-separated with no header row; column names are assigned in a later cell
df_var_dim = pd.read_csv('var_dim.tsv', sep='\t', header=None)

Note: `var_dim.tsv` was created by enriching `var_index.txt` with the `Mean Score` and `Standard Deviation` of the variables that load on the dimensions of the previous study.

In [11]:
# Name the five columns: the three index columns plus the mean score (MS) and standard deviation (SD)
df_var_dim.columns = ['Variable ID', 'Variable Label', 'Variable Descriptor', 'Variable MS', 'Variable SD']

In [12]:
df_var_dim

Unnamed: 0,Variable ID,Variable Label,Variable Descriptor,Variable MS,Variable SD
0,v000001,adjaffi,adj.adjective_affiliative.adjaffi,1.46,2.51
1,v000003,adjattr,adj.adjective_attributive.adjattr,37.66,18.66
2,v000005,adjeval,adj.adjective_evaluative.adjeval,5.57,4.27
3,v000008,adjpred,adj.adjective_predicative.adjpred,8.82,5.27
4,v000010,adjrela,adj.adjective_relational.adjrela,10.98,6.23
...,...,...,...,...,...
87,v000181,vbpriv,vb.verb_private.vbpriv,12.41,8.11
88,v000182,vbpubl,vb.verb_public.vbpubl,9.76,6.46
89,v000183,vb2,vb.verb_secondperson.vb2,1.36,3.91
90,v000188,ttr,vd.type-token_ratio.ttr,0.53,0.07


In [13]:
df_var_dim.dtypes

Variable ID             object
Variable Label          object
Variable Descriptor     object
Variable MS            float64
Variable SD            float64
dtype: object

## Replace `Variable ID` by `Variable Label`

In [14]:
# Inputs:
#   'df_var_index' - pairs each 'Variable ID' with its 'Variable Label'
#   'df_normed'    - feature columns are named by 'Variable ID'

# Lookup table: Variable ID -> Variable Label
var_mapping = {
    var_id: var_label
    for var_id, var_label in zip(df_var_index['Variable ID'], df_var_index['Variable Label'])
}

# Relabelled copy of 'df_normed'; columns without an entry in the lookup
# (such as 'file') keep their original name
df_normed_labeled = df_normed.rename(columns=var_mapping)

# Show the relabelled table
df_normed_labeled

Unnamed: 0,file,vbpubl,vbsua,vbpriv,vbexist,vb3,vbaspct,vball,vbcomm,vbact,...,prn3sngsubj,nqueinfcl,vbinf,nnonfcque,advemph,advpl,nplac,mdconseguir,advmanner,clqueeprp
0,t000001,15.873016,10.582011,26.455026,21.164021,84.656085,15.873016,169.312169,15.873016,37.037037,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,t000002,8.219178,0.0,30.136986,57.534247,115.068493,8.219178,180.821918,5.479452,43.835616,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,t000003,12.658228,0.0,21.097046,54.852321,97.046414,0.0,189.873418,4.219409,50.632911,...,4.219409,8.438819,4.219409,8.438819,12.658228,4.219409,0.0,0.0,0.0,0.0
3,t000004,11.278195,11.278195,37.593985,30.075188,97.744361,11.278195,191.729323,3.759398,37.593985,...,0.0,0.0,3.759398,0.0,0.0,0.0,3.759398,7.518797,0.0,0.0
4,t000005,6.711409,0.0,16.778523,36.912752,124.161074,0.0,197.986577,13.422819,67.114094,...,6.711409,0.0,0.0,0.0,10.067114,0.0,0.0,0.0,3.355705,0.0
5,t000006,14.577259,2.915452,11.661808,26.239067,72.886297,0.0,139.941691,8.746356,23.323615,...,0.0,0.0,0.0,0.0,0.0,0.0,2.915452,0.0,0.0,5.830904


#### Export to a file

In [15]:
# Save the relabelled normed counts as a spreadsheet in the output directory
# (the row index is written too, since 'index' is left at its default)
df_normed_labeled.to_excel(f"{output_directory}/df_normed_labeled.xlsx")

## Dimension 1

### Filter and order `df_normed_labeled` according to `dim1_variable_list`

The negative pole starts at `Reduced progressive clause [vbprogphr]`.

In [16]:
# Dimension 1 features, positive pole first, then the negative pole.
dim1_variable_list = [
    # --- positive pole ---
    'prn2obl', 'vb1', 'vbment', 'vbfutir', 'prn2sngsubj', 'vbpriv',
    'vbact', 'advnao', 'prn1sngsubj', 'advtime', 'prn1obl', 'prnqtf',
    'adjeval', 'prnposs', 'advints', 'qsqu', 'advampl', 'advemph',
    'vbqueindic', 'adjpred', 'vbinf', 'advmanner', 'vbcomm', 'vbgerall',
    'mdprecisar', 'clqueeadv', 'vbproginf', 'advneg', 'prnnomsubj',
    # --- negative pole (begins at 'vbprogphr') ---
    'vbprogphr', 'prnqualcujo', 'clpassless', 'adjrela', 'vbpastprt',
    'nominlzsubj', 'adjtopi', 'nabst', 'wl', 'adjattr', 'ncomp',
    'artdef', 'prpall',
]

In [17]:
# Step 1 - keep only 'file' plus the Dimension 1 features found in the labelled table
dim1_kept_columns = df_normed_labeled.columns.intersection(dim1_variable_list + ['file'])
df_dim1_normed_filtered = df_normed_labeled.loc[:, dim1_kept_columns]

# Step 2 - put 'file' first, then the features in the order of 'dim1_variable_list'
# (a listed feature that is missing from the table raises a KeyError here)
dim1_column_order = ['file', *dim1_variable_list]
df_dim1_normed_ordered = df_dim1_normed_filtered.loc[:, dim1_column_order]

# Show the result
df_dim1_normed_ordered

Unnamed: 0,file,prn2obl,vb1,vbment,vbfutir,prn2sngsubj,vbpriv,vbact,advnao,prn1sngsubj,...,adjrela,vbpastprt,nominlzsubj,adjtopi,nabst,wl,adjattr,ncomp,artdef,prpall
0,t000001,10.582011,5.291005,42.328042,0.0,5.291005,26.455026,37.037037,0.0,0,...,15.873016,31.746032,15.873016,15.873016,89.94709,34.121693,84.656085,31.746032,148.148148,137.566138
1,t000002,5.479452,0.0,21.917808,10.958904,2.739726,30.136986,43.835616,8.219178,0,...,8.219178,19.178082,8.219178,24.657534,120.547945,18.342466,98.630137,73.972603,71.232877,131.506849
2,t000003,8.438819,8.438819,33.755274,0.0,4.219409,21.097046,50.632911,8.438819,0,...,21.097046,37.974684,4.219409,12.658228,80.168776,24.654008,135.021097,88.607595,88.607595,168.776371
3,t000004,7.518797,18.796992,41.353383,0.0,3.759398,37.593985,37.593985,0.0,0,...,7.518797,45.112782,15.037594,30.075188,101.503759,25.112782,90.225564,45.112782,101.503759,146.616541
4,t000005,10.067114,3.355705,26.845638,0.0,6.711409,16.778523,67.114094,10.067114,0,...,10.067114,30.201342,10.067114,23.489933,104.026846,21.922819,80.536913,40.268456,93.959732,134.228188
5,t000006,2.915452,11.661808,23.323615,0.0,2.915452,11.661808,23.323615,0.0,0,...,8.746356,26.239067,11.661808,11.661808,122.44898,18.833819,93.294461,87.463557,142.857143,177.842566


### Standardizing the normed counts

In [18]:
# Start the z-score table from the 'file' column only; the standardised
# feature columns are added to it further down.
df_dim1_standardised = df_dim1_normed_ordered.loc[:, ['file']].copy()

In [19]:
# Reference statistics used for standardisation:
# Variable Label -> (Variable MS, Variable SD), one entry per row of 'df_var_dim'
var_dim_dict = {
    label: (mean_score, std_dev)
    for label, mean_score, std_dev in zip(
        df_var_dim['Variable Label'],
        df_var_dim['Variable MS'],
        df_var_dim['Variable SD'],
    )
}

In [20]:
var_dim_dict

{'adjaffi': (1.46, 2.51),
 'adjattr': (37.66, 18.66),
 'adjeval': (5.57, 4.27),
 'adjpred': (8.82, 5.27),
 'adjrela': (10.98, 6.23),
 'adjtopi': (6.31, 6.19),
 'advampl': (7.04, 5.05),
 'advemph': (5.06, 3.95),
 'advhedg': (0.88, 1.3),
 'advints': (6.59, 4.91),
 'advlikl': (1.51, 1.67),
 'advmanner': (2.06, 2.64),
 'advnao': (7.97, 6.73),
 'advneg': (0.42, 0.84),
 'advpl': (4.08, 5.01),
 'advtime': (5.58, 4.49),
 'advcomp': (9.38, 5.15),
 'discmrkr': (5.41, 5.97),
 'artdef': (113.62, 28.08),
 'artindef': (14.02, 7.84),
 'cjadv': (2.34, 2.32),
 'cjcoorcls': (11.37, 6.04),
 'cjcncl': (1.64, 2.05),
 'cjou': (3.39, 4.22),
 'cjcoorphr': (6.3, 5.36),
 'cjcond': (1.63, 1.98),
 'cjfinal': (0.01, 0.08),
 'clpassless': (1.95, 2.38),
 'clinfadj': (0.58, 1.06),
 'clinfadjease': (0.23, 0.62),
 'clinfprp': (9.1, 4.66),
 'adjque': (8.96, 4.97),
 'clqueeadv': (0.53, 1.25),
 'clqueeprp': (1.05, 1.28),
 'nounque': (5.85, 3.63),
 'vbqueindic': (3.76, 3.87),
 'mddever': (1.21, 1.94),
 'mdhaver': (0.05, 0.

In [21]:
# Turn every feature into a z-score against the reference statistics:
# (normed count - reference mean) / reference SD.
# Features with no entry in 'var_dim_dict' are skipped and therefore do not
# appear in 'df_dim1_standardised'.
for feature in df_dim1_normed_ordered.columns[1:]:  # position 0 is 'file'
    reference_stats = var_dim_dict.get(feature)
    if reference_stats is None:
        continue
    reference_mean, reference_sd = reference_stats
    df_dim1_standardised[feature] = (
        df_dim1_normed_ordered[feature].sub(reference_mean).div(reference_sd)
    )

# NOTE(review): in the displayed output the 'wl' z-scores are roughly 31-68 while
# the other features stay within a few units, so 'wl' dominates any sum over these
# columns - confirm that 'wl' in normed.tsv is on the same scale as its reference
# mean/SD in var_dim.tsv.
df_dim1_standardised

Unnamed: 0,file,prn2obl,vb1,vbment,vbfutir,prn2sngsubj,vbpriv,vbact,advnao,prn1sngsubj,...,adjrela,vbpastprt,nominlzsubj,adjtopi,nabst,wl,adjattr,ncomp,artdef,prpall
0,t000001,0.626789,-0.582259,2.283232,-0.689394,0.742751,1.731816,0.003239,-1.18425,-0.520202,...,0.785396,1.228093,1.729267,1.544914,1.390926,68.236496,2.518547,1.497709,1.229635,-0.285998
1,t000002,0.047611,-0.864145,0.452718,2.078006,0.104932,2.185818,0.471461,0.037025,-0.520202,...,-0.44315,0.051319,0.245965,2.96406,2.515544,31.540618,3.267424,5.534666,-1.509513,-0.46532
2,t000003,0.383521,-0.414554,1.514374,-0.689394,0.474852,1.071152,0.939594,0.069661,-0.520202,...,1.623924,1.8113,-0.529184,1.025562,1.031561,46.218624,5.217636,6.933804,-0.890755,0.637655
3,t000004,0.279092,0.137293,2.195819,-0.689394,0.35985,3.1053,0.041597,-1.18425,-0.520202,...,-0.55557,2.479661,1.567363,3.839287,1.815647,47.285539,2.817018,2.775601,-0.43149,-0.018155
4,t000005,0.568344,-0.685365,0.894676,-0.689394,1.097852,0.538659,2.074662,0.311607,-0.520202,...,-0.146531,1.083459,0.604092,2.775433,1.908374,39.86702,2.297798,2.312472,-0.700152,-0.384783
5,t000006,-0.243422,-0.242845,0.5788,-0.689394,0.148863,-0.092256,-0.941211,-1.18425,-0.520202,...,-0.35853,0.712459,0.913141,0.864589,2.585409,32.683301,2.981482,6.824432,1.041209,0.905965


### Computing the dimension scores

In [22]:
# Dimension 1, positive pole: these z-scores are added to the dimension score.
dim1_variable_list_pos = [
    'prn2obl', 'vb1', 'vbment', 'vbfutir', 'prn2sngsubj', 'vbpriv',
    'vbact', 'advnao', 'prn1sngsubj', 'advtime', 'prn1obl', 'prnqtf',
    'adjeval', 'prnposs', 'advints', 'qsqu', 'advampl', 'advemph',
    'vbqueindic', 'adjpred', 'vbinf', 'advmanner', 'vbcomm', 'vbgerall',
    'mdprecisar', 'clqueeadv', 'vbproginf', 'advneg', 'prnnomsubj',
]

In [23]:
# Dimension 1, negative pole: these z-scores are subtracted from the dimension score.
dim1_variable_list_neg = [
    'vbprogphr', 'prnqualcujo', 'clpassless', 'adjrela', 'vbpastprt',
    'nominlzsubj', 'adjtopi', 'nabst', 'wl', 'adjattr', 'ncomp',
    'artdef', 'prpall',
]

In [24]:
# Dimension 1 score per row = sum of positive-pole z-scores
#                             minus sum of negative-pole z-scores
dim1_pos_total = df_dim1_standardised[dim1_variable_list_pos].sum(axis=1)
dim1_neg_total = df_dim1_standardised[dim1_variable_list_neg].sum(axis=1)
df_dim1_standardised['dim_scores'] = dim1_pos_total - dim1_neg_total

# Show the table including the new 'dim_scores' column
df_dim1_standardised

Unnamed: 0,file,prn2obl,vb1,vbment,vbfutir,prn2sngsubj,vbpriv,vbact,advnao,prn1sngsubj,...,vbpastprt,nominlzsubj,adjtopi,nabst,wl,adjattr,ncomp,artdef,prpall,dim_scores
0,t000001,0.626789,-0.582259,2.283232,-0.689394,0.742751,1.731816,0.003239,-1.18425,-0.520202,...,1.228093,1.729267,1.544914,1.390926,68.236496,2.518547,1.497709,1.229635,-0.285998,-99.301663
1,t000002,0.047611,-0.864145,0.452718,2.078006,0.104932,2.185818,0.471461,0.037025,-0.520202,...,0.051319,0.245965,2.96406,2.515544,31.540618,3.267424,5.534666,-1.509513,-0.46532,-42.022774
2,t000003,0.383521,-0.414554,1.514374,-0.689394,0.474852,1.071152,0.939594,0.069661,-0.520202,...,1.8113,-0.529184,1.025562,1.031561,46.218624,5.217636,6.933804,-0.890755,0.637655,-73.813133
3,t000004,0.279092,0.137293,2.195819,-0.689394,0.35985,3.1053,0.041597,-1.18425,-0.520202,...,2.479661,1.567363,3.839287,1.815647,47.285539,2.817018,2.775601,-0.43149,-0.018155,-77.326101
4,t000005,0.568344,-0.685365,0.894676,-0.689394,1.097852,0.538659,2.074662,0.311607,-0.520202,...,1.083459,0.604092,2.775433,1.908374,39.86702,2.297798,2.312472,-0.700152,-0.384783,-59.211327
5,t000006,-0.243422,-0.242845,0.5788,-0.689394,0.148863,-0.092256,-0.941211,-1.18425,-0.520202,...,0.712459,0.913141,0.864589,2.585409,32.683301,2.981482,6.824432,1.041209,0.905965,-69.690676


#### Export to a file

In [25]:
# Save the Dimension 1 z-scores and 'dim_scores' as a spreadsheet in the output directory
df_dim1_standardised.to_excel(f"{output_directory}/df_dim1_standardised.xlsx")

### Computing the mean dimension scores for the new registers

In [26]:
# Average of 'dim_scores' over every row of 'df_dim1_standardised'
mean_dim1_scores = df_dim1_standardised.loc[:, 'dim_scores'].mean()
mean_dim1_scores

-70.22761216807842

## Dimension 2

### Filter and order `df_normed_labeled` according to `dim2_variable_list`

This dimension has no negative pole.

In [27]:
# Dimension 2 features (all on the positive pole; this dimension has no negative pole).
dim2_variable_list = [
    'nounque', 'prnque', 'advcomp', 'ncogn', 'nqueinfcl',
    'clinfadj', 'clqueeprp', 'prndem', 'clinfprp', 'adjque',
    'clinfadjease', 'advhedg', 'artindef', 'vbfutpret', 'cjadv',
]

In [28]:
# Step 1 - keep only 'file' plus the Dimension 2 features found in the labelled table
dim2_kept_columns = df_normed_labeled.columns.intersection(dim2_variable_list + ['file'])
df_dim2_normed_filtered = df_normed_labeled.loc[:, dim2_kept_columns]

# Step 2 - put 'file' first, then the features in the order of 'dim2_variable_list'
# (a listed feature that is missing from the table raises a KeyError here)
dim2_column_order = ['file', *dim2_variable_list]
df_dim2_normed_ordered = df_dim2_normed_filtered.loc[:, dim2_column_order]

# Show the result
df_dim2_normed_ordered

Unnamed: 0,file,nounque,prnque,advcomp,ncogn,nqueinfcl,clinfadj,clqueeprp,prndem,clinfprp,adjque,clinfadjease,advhedg,artindef,vbfutpret,cjadv
0,t000001,10.582011,10.582011,5.291005,21.164021,0.0,0,0.0,31.746032,42.328042,10.582011,0,0.0,0.0,0,0.0
1,t000002,21.917808,16.438356,5.479452,5.479452,0.0,0,0.0,30.136986,27.39726,0.0,0,5.479452,19.178082,0,5.479452
2,t000003,8.438819,25.316456,12.658228,0.0,8.438819,0,0.0,33.755274,42.194093,8.438819,0,0.0,12.658228,0,4.219409
3,t000004,30.075188,18.796992,3.759398,7.518797,0.0,0,0.0,30.075188,30.075188,0.0,0,0.0,15.037594,0,0.0
4,t000005,6.711409,6.711409,10.067114,6.711409,0.0,0,0.0,36.912752,40.268456,0.0,0,0.0,3.355705,0,6.711409
5,t000006,26.239067,8.746356,2.915452,14.577259,0.0,0,5.830904,23.323615,17.492711,0.0,0,0.0,20.408163,0,0.0


### Standardizing the normed counts

In [29]:
# Start the z-score table from the 'file' column only; the standardised
# feature columns are added to it further down.
df_dim2_standardised = df_dim2_normed_ordered.loc[:, ['file']].copy()

In [30]:
# Reference statistics used for standardisation:
# Variable Label -> (Variable MS, Variable SD), one entry per row of 'df_var_dim'
var_dim_dict = {
    label: (mean_score, std_dev)
    for label, mean_score, std_dev in zip(
        df_var_dim['Variable Label'],
        df_var_dim['Variable MS'],
        df_var_dim['Variable SD'],
    )
}

In [31]:
# Turn every feature into a z-score against the reference statistics:
# (normed count - reference mean) / reference SD.
# Features with no entry in 'var_dim_dict' are skipped and therefore do not
# appear in 'df_dim2_standardised'.
for feature in df_dim2_normed_ordered.columns[1:]:  # position 0 is 'file'
    reference_stats = var_dim_dict.get(feature)
    if reference_stats is None:
        continue
    reference_mean, reference_sd = reference_stats
    df_dim2_standardised[feature] = (
        df_dim2_normed_ordered[feature].sub(reference_mean).div(reference_sd)
    )

# Show the z-score table
df_dim2_standardised

Unnamed: 0,file,nounque,prnque,advcomp,ncogn,nqueinfcl,clinfadj,clqueeprp,prndem,clinfprp,adjque,clinfadjease,advhedg,artindef,vbfutpret,cjadv
0,t000001,1.303584,0.067638,-0.79398,2.485073,-0.715686,-0.54717,-0.820312,2.915855,7.130481,0.32636,-0.370968,-0.676923,-1.788265,-0.693487,-1.008621
1,t000002,4.426393,1.132428,-0.757388,-0.237942,-0.715686,-0.54717,-0.820312,2.684004,3.926451,-1.802817,-0.370968,3.53804,0.657919,-0.693487,1.353212
2,t000003,0.713173,2.746628,0.636549,-1.189236,7.557665,-0.54717,-0.820312,3.205371,7.101737,-0.104865,-0.370968,-0.676923,-0.173695,-0.693487,0.81009
3,t000004,6.673606,1.561271,-1.091379,0.116111,-0.715686,-0.54717,-0.820312,2.675099,4.501113,-1.802817,-0.370968,-0.676923,0.129795,-0.693487,-1.008621
4,t000005,0.237303,-0.636107,0.13342,-0.024061,-0.715686,-0.54717,-0.820312,3.660339,6.68851,-1.802817,-0.370968,-0.676923,-1.360242,-0.693487,1.884228
5,t000006,5.616823,-0.266117,-1.255252,1.341538,-0.715686,-0.54717,3.735081,1.70225,1.801011,-1.802817,-0.370968,-0.676923,0.814817,-0.693487,-1.008621


### Computing the dimension scores

In [32]:
# Dimension 2, positive pole: these z-scores are added to the dimension score.
dim2_variable_list_pos = [
    'nounque', 'prnque', 'advcomp', 'ncogn', 'nqueinfcl',
    'clinfadj', 'clqueeprp', 'prndem', 'clinfprp', 'adjque',
    'clinfadjease', 'advhedg', 'artindef', 'vbfutpret', 'cjadv',
]

In [33]:
# Dimension 2 has no negative pole, so nothing is subtracted from its score.
dim2_variable_list_neg = []

In [34]:
# Dimension 2 score per row = sum of positive-pole z-scores
#                             minus sum of negative-pole z-scores
dim2_pos_total = df_dim2_standardised[dim2_variable_list_pos].sum(axis=1)
dim2_neg_total = df_dim2_standardised[dim2_variable_list_neg].sum(axis=1)
df_dim2_standardised['dim_scores'] = dim2_pos_total - dim2_neg_total

# Show the table including the new 'dim_scores' column
df_dim2_standardised

Unnamed: 0,file,nounque,prnque,advcomp,ncogn,nqueinfcl,clinfadj,clqueeprp,prndem,clinfprp,adjque,clinfadjease,advhedg,artindef,vbfutpret,cjadv,dim_scores
0,t000001,1.303584,0.067638,-0.79398,2.485073,-0.715686,-0.54717,-0.820312,2.915855,7.130481,0.32636,-0.370968,-0.676923,-1.788265,-0.693487,-1.008621,6.81358
1,t000002,4.426393,1.132428,-0.757388,-0.237942,-0.715686,-0.54717,-0.820312,2.684004,3.926451,-1.802817,-0.370968,3.53804,0.657919,-0.693487,1.353212,11.772677
2,t000003,0.713173,2.746628,0.636549,-1.189236,7.557665,-0.54717,-0.820312,3.205371,7.101737,-0.104865,-0.370968,-0.676923,-0.173695,-0.693487,0.81009,18.194557
3,t000004,6.673606,1.561271,-1.091379,0.116111,-0.715686,-0.54717,-0.820312,2.675099,4.501113,-1.802817,-0.370968,-0.676923,0.129795,-0.693487,-1.008621,7.929633
4,t000005,0.237303,-0.636107,0.13342,-0.024061,-0.715686,-0.54717,-0.820312,3.660339,6.68851,-1.802817,-0.370968,-0.676923,-1.360242,-0.693487,1.884228,4.956027
5,t000006,5.616823,-0.266117,-1.255252,1.341538,-0.715686,-0.54717,3.735081,1.70225,1.801011,-1.802817,-0.370968,-0.676923,0.814817,-0.693487,-1.008621,7.67448


#### Export to a file

In [35]:
# Save the Dimension 2 z-scores and 'dim_scores' as a spreadsheet in the output directory
df_dim2_standardised.to_excel(f"{output_directory}/df_dim2_standardised.xlsx")

### Computing the mean dimension scores for the new registers

In [36]:
# Average of 'dim_scores' over every row of 'df_dim2_standardised'
mean_dim2_scores = df_dim2_standardised.loc[:, 'dim_scores'].mean()
mean_dim2_scores

9.55682556020905

## Dimension 3

### Filter and order `df_normed_labeled` according to `dim3_variable_list`

The negative pole starts at `Type–token ratio [ttr]`.

In [37]:
# Dimension 3 features, positive pole first, then the negative pole.
dim3_variable_list = [
    # --- positive pole ---
    'qsttag', 'contrac', 'discmrkr', 'qsyn', 'prn3sngsubj',
    'prn3plusubj', 'cjcncl', 'advpl', 'mdter',
    # --- negative pole (begins at 'ttr') ---
    'ttr',
]

In [38]:
# Step 1 - keep only 'file' plus the Dimension 3 features found in the labelled table
dim3_kept_columns = df_normed_labeled.columns.intersection(dim3_variable_list + ['file'])
df_dim3_normed_filtered = df_normed_labeled.loc[:, dim3_kept_columns]

# Step 2 - put 'file' first, then the features in the order of 'dim3_variable_list'
# (a listed feature that is missing from the table raises a KeyError here)
dim3_column_order = ['file', *dim3_variable_list]
df_dim3_normed_ordered = df_dim3_normed_filtered.loc[:, dim3_column_order]

# Show the result
df_dim3_normed_ordered

Unnamed: 0,file,qsttag,contrac,discmrkr,qsyn,prn3sngsubj,prn3plusubj,cjcncl,advpl,mdter,ttr
0,t000001,0,0,0,0,0.0,0.0,0,0.0,0,2.444444
1,t000002,0,0,0,0,0.0,5.479452,0,0.0,0,1.210959
2,t000003,0,0,0,0,4.219409,0.0,0,4.219409,0,1.983122
3,t000004,0,0,0,0,0.0,0.0,0,0.0,0,1.729323
4,t000005,0,0,0,0,6.711409,0.0,0,0.0,0,1.526846
5,t000006,0,0,0,0,0.0,2.915452,0,0.0,0,1.230321


### Standardizing the normed counts

In [39]:
# Start the z-score table from the 'file' column only; the standardised
# feature columns are added to it further down.
df_dim3_standardised = df_dim3_normed_ordered.loc[:, ['file']].copy()

In [40]:
# Reference statistics used for standardisation:
# Variable Label -> (Variable MS, Variable SD), one entry per row of 'df_var_dim'
var_dim_dict = {
    label: (mean_score, std_dev)
    for label, mean_score, std_dev in zip(
        df_var_dim['Variable Label'],
        df_var_dim['Variable MS'],
        df_var_dim['Variable SD'],
    )
}

In [41]:
# Turn every feature into a z-score against the reference statistics:
# (normed count - reference mean) / reference SD.
# Features with no entry in 'var_dim_dict' are skipped and therefore do not
# appear in 'df_dim3_standardised'.
for feature in df_dim3_normed_ordered.columns[1:]:  # position 0 is 'file'
    reference_stats = var_dim_dict.get(feature)
    if reference_stats is None:
        continue
    reference_mean, reference_sd = reference_stats
    df_dim3_standardised[feature] = (
        df_dim3_normed_ordered[feature].sub(reference_mean).div(reference_sd)
    )

# NOTE(review): the displayed 'ttr' inputs are all above 1 (about 1.2-2.4) while the
# reference mean in var_dim.tsv is 0.53 (SD 0.07), which yields z-scores of roughly
# 10-27 - confirm that 'ttr' in normed.tsv is on the same scale as the reference.
df_dim3_standardised

Unnamed: 0,file,qsttag,contrac,discmrkr,qsyn,prn3sngsubj,prn3plusubj,cjcncl,advpl,mdter,ttr
0,t000001,-0.235012,-0.309302,-0.906198,-0.484928,-0.609977,-0.509091,-0.8,-0.814371,-0.611111,27.349206
1,t000002,-0.235012,-0.309302,-0.906198,-0.484928,-0.609977,4.472229,-0.8,-0.814371,-0.611111,9.727984
2,t000003,-0.235012,-0.309302,-0.906198,-0.484928,0.346805,-0.509091,-0.8,0.027826,-0.611111,20.758891
3,t000004,-0.235012,-0.309302,-0.906198,-0.484928,-0.609977,-0.509091,-0.8,-0.814371,-0.611111,17.13319
4,t000005,-0.235012,-0.309302,-0.906198,-0.484928,0.911884,-0.509091,-0.8,-0.814371,-0.611111,14.240652
5,t000006,-0.235012,-0.309302,-0.906198,-0.484928,-0.609977,2.14132,-0.8,-0.814371,-0.611111,10.004581


### Computing the dimension scores

In [42]:
# Dimension 3, positive pole: these z-scores are added to the dimension score.
dim3_variable_list_pos = [
    'qsttag', 'contrac', 'discmrkr', 'qsyn', 'prn3sngsubj',
    'prn3plusubj', 'cjcncl', 'advpl', 'mdter',
]

In [43]:
# Dimension 3, negative pole: this z-score is subtracted from the dimension score.
dim3_variable_list_neg = ['ttr']

In [44]:
# Dimension 3 score per row = sum of positive-pole z-scores
#                             minus sum of negative-pole z-scores
dim3_pos_total = df_dim3_standardised[dim3_variable_list_pos].sum(axis=1)
dim3_neg_total = df_dim3_standardised[dim3_variable_list_neg].sum(axis=1)
df_dim3_standardised['dim_scores'] = dim3_pos_total - dim3_neg_total

# Show the table including the new 'dim_scores' column
df_dim3_standardised

Unnamed: 0,file,qsttag,contrac,discmrkr,qsyn,prn3sngsubj,prn3plusubj,cjcncl,advpl,mdter,ttr,dim_scores
0,t000001,-0.235012,-0.309302,-0.906198,-0.484928,-0.609977,-0.509091,-0.8,-0.814371,-0.611111,27.349206,-32.629197
1,t000002,-0.235012,-0.309302,-0.906198,-0.484928,-0.609977,4.472229,-0.8,-0.814371,-0.611111,9.727984,-10.026655
2,t000003,-0.235012,-0.309302,-0.906198,-0.484928,0.346805,-0.509091,-0.8,0.027826,-0.611111,20.758891,-24.239902
3,t000004,-0.235012,-0.309302,-0.906198,-0.484928,-0.609977,-0.509091,-0.8,-0.814371,-0.611111,17.13319,-22.413181
4,t000005,-0.235012,-0.309302,-0.906198,-0.484928,0.911884,-0.509091,-0.8,-0.814371,-0.611111,14.240652,-17.998781
5,t000006,-0.235012,-0.309302,-0.906198,-0.484928,-0.609977,2.14132,-0.8,-0.814371,-0.611111,10.004581,-12.634161


#### Export to a file

In [45]:
# Save the Dimension 3 z-scores and 'dim_scores' as a spreadsheet in the output directory
df_dim3_standardised.to_excel(f"{output_directory}/df_dim3_standardised.xlsx")

### Computing the mean dimension scores for the new registers

In [46]:
# Average of 'dim_scores' over every row of 'df_dim3_standardised'
mean_dim3_scores = df_dim3_standardised.loc[:, 'dim_scores'].mean()
mean_dim3_scores

-19.99031266960335

## Dimension 4

### Filter and order `df_normed_labeled` according to `dim4_variable_list`

The dimension has no negative pole.

In [47]:
# Dimension 4 features (all on the positive pole; this dimension has no negative pole).
dim4_variable_list = [
    'vbsubpres', 'vbimp', 'nconc',
    'subjdrop', 'vbfacil', 'cjcoorcls',
]

In [48]:
# Step 1 - keep only 'file' plus the Dimension 4 features found in the labelled table
dim4_kept_columns = df_normed_labeled.columns.intersection(dim4_variable_list + ['file'])
df_dim4_normed_filtered = df_normed_labeled.loc[:, dim4_kept_columns]

# Step 2 - put 'file' first, then the features in the order of 'dim4_variable_list'
# (a listed feature that is missing from the table raises a KeyError here)
dim4_column_order = ['file', *dim4_variable_list]
df_dim4_normed_ordered = df_dim4_normed_filtered.loc[:, dim4_column_order]

# Show the result
df_dim4_normed_ordered

Unnamed: 0,file,vbsubpres,vbimp,nconc,subjdrop,vbfacil,cjcoorcls
0,t000001,5.291005,0,10.582011,0,15.873016,21.164021
1,t000002,5.479452,0,8.219178,0,8.219178,10.958904
2,t000003,8.438819,0,4.219409,0,4.219409,16.877637
3,t000004,3.759398,0,0.0,0,30.075188,45.112782
4,t000005,6.711409,0,3.355705,0,16.778523,50.33557
5,t000006,8.746356,0,2.915452,0,8.746356,14.577259


### Standardizing the normed counts

In [49]:
# Start the z-score table from the 'file' column only; the standardised
# feature columns are added to it further down.
df_dim4_standardised = df_dim4_normed_ordered.loc[:, ['file']].copy()

In [50]:
# Reference statistics used for standardisation:
# Variable Label -> (Variable MS, Variable SD), one entry per row of 'df_var_dim'
var_dim_dict = {
    label: (mean_score, std_dev)
    for label, mean_score, std_dev in zip(
        df_var_dim['Variable Label'],
        df_var_dim['Variable MS'],
        df_var_dim['Variable SD'],
    )
}

In [51]:
# Turn every feature into a z-score against the reference statistics:
# (normed count - reference mean) / reference SD.
# Features with no entry in 'var_dim_dict' are skipped and therefore do not
# appear in 'df_dim4_standardised'.
for feature in df_dim4_normed_ordered.columns[1:]:  # position 0 is 'file'
    reference_stats = var_dim_dict.get(feature)
    if reference_stats is None:
        continue
    reference_mean, reference_sd = reference_stats
    df_dim4_standardised[feature] = (
        df_dim4_normed_ordered[feature].sub(reference_mean).div(reference_sd)
    )

# Show the z-score table
df_dim4_standardised

Unnamed: 0,file,vbsubpres,vbimp,nconc,subjdrop,vbfacil,cjcoorcls
0,t000001,-0.135325,-0.498567,-0.846366,-1.225836,4.047992,1.621527
1,t000002,-0.110299,-0.498567,-1.039251,-1.225836,1.47959,-0.068062
2,t000003,0.282712,-0.498567,-1.365763,-1.225836,0.137386,0.91186
3,t000004,-0.338725,-0.498567,-1.710204,-1.225836,8.813821,5.586553
4,t000005,0.053308,-0.498567,-1.436269,-1.225836,4.351854,6.451253
5,t000006,0.323553,-0.498567,-1.472208,-1.225836,1.656495,0.531003


### Computing the dimension scores

In [52]:
# Dimension 4, positive pole: these z-scores are added to the dimension score.
dim4_variable_list_pos = [
    'vbsubpres', 'vbimp', 'nconc',
    'subjdrop', 'vbfacil', 'cjcoorcls',
]

In [53]:
# Dimension 4 has no negative pole, so nothing is subtracted from its score.
dim4_variable_list_neg = []

In [54]:
# Dimension 4 score per row = sum of positive-pole z-scores
#                             minus sum of negative-pole z-scores
dim4_pos_total = df_dim4_standardised[dim4_variable_list_pos].sum(axis=1)
dim4_neg_total = df_dim4_standardised[dim4_variable_list_neg].sum(axis=1)
df_dim4_standardised['dim_scores'] = dim4_pos_total - dim4_neg_total

# Show the table including the new 'dim_scores' column
df_dim4_standardised

Unnamed: 0,file,vbsubpres,vbimp,nconc,subjdrop,vbfacil,cjcoorcls,dim_scores
0,t000001,-0.135325,-0.498567,-0.846366,-1.225836,4.047992,1.621527,2.963425
1,t000002,-0.110299,-0.498567,-1.039251,-1.225836,1.47959,-0.068062,-1.462425
2,t000003,0.282712,-0.498567,-1.365763,-1.225836,0.137386,0.91186,-1.758208
3,t000004,-0.338725,-0.498567,-1.710204,-1.225836,8.813821,5.586553,10.627042
4,t000005,0.053308,-0.498567,-1.436269,-1.225836,4.351854,6.451253,7.695743
5,t000006,0.323553,-0.498567,-1.472208,-1.225836,1.656495,0.531003,-0.685559


#### Export to a file

In [55]:
# Save the Dimension 4 z-scores and 'dim_scores' as a spreadsheet in the output directory
df_dim4_standardised.to_excel(f"{output_directory}/df_dim4_standardised.xlsx")

### Computing the mean dimension scores for the new registers

In [56]:
# Average of 'dim_scores' over every row of 'df_dim4_standardised'
mean_dim4_scores = df_dim4_standardised.loc[:, 'dim_scores'].mean()
mean_dim4_scores

2.8966697472174783

## Dimension 5

### Filter and order `df_normed_labeled` according to `dim5_variable_list`

The negative pole starts at `Nouns: Place [nplac]`.

In [57]:
# Dimension 5 features, positive pole first, then the negative pole.
dim5_variable_list = [
    # --- positive pole ---
    'vbsubfut', 'cjou', 'vbfutpres', 'mddever',
    'mdpoder', 'cjcond', 'advlikl', 'cjcoorphr',
    # --- negative pole (begins at 'nplac') ---
    'nplac', 'vbsubpast', 'adjaffi', 'vbimpf', 'vbpast',
]

In [58]:
# Step 1 - keep only 'file' plus the Dimension 5 features found in the labelled table
dim5_kept_columns = df_normed_labeled.columns.intersection(dim5_variable_list + ['file'])
df_dim5_normed_filtered = df_normed_labeled.loc[:, dim5_kept_columns]

# Step 2 - put 'file' first, then the features in the order of 'dim5_variable_list'
# (a listed feature that is missing from the table raises a KeyError here)
dim5_column_order = ['file', *dim5_variable_list]
df_dim5_normed_ordered = df_dim5_normed_filtered.loc[:, dim5_column_order]

# Show the result
df_dim5_normed_ordered

Unnamed: 0,file,vbsubfut,cjou,vbfutpres,mddever,mdpoder,cjcond,advlikl,cjcoorphr,nplac,vbsubpast,adjaffi,vbimpf,vbpast
0,t000001,0.0,5.291005,0.0,0,0.0,0,0.0,31.746032,0.0,0,0.0,0.0,42.328042
1,t000002,8.219178,0.0,2.739726,0,16.438356,0,5.479452,16.438356,0.0,0,2.739726,5.479452,27.39726
2,t000003,0.0,12.658228,0.0,0,8.438819,0,8.438819,0.0,0.0,0,0.0,8.438819,46.413502
3,t000004,0.0,0.0,0.0,0,0.0,0,0.0,45.112782,3.759398,0,0.0,7.518797,26.315789
4,t000005,0.0,0.0,0.0,0,0.0,0,6.711409,10.067114,0.0,0,3.355705,0.0,36.912752
5,t000006,2.915452,0.0,0.0,0,0.0,0,0.0,17.492711,2.915452,0,0.0,2.915452,20.408163


### Standardizing the normed counts

In [59]:
# Start the z-score table from the 'file' column only; the standardised
# feature columns are added to it further down.
df_dim5_standardised = df_dim5_normed_ordered.loc[:, ['file']].copy()

In [60]:
# Reference statistics used for standardisation:
# Variable Label -> (Variable MS, Variable SD), one entry per row of 'df_var_dim'
var_dim_dict = {
    label: (mean_score, std_dev)
    for label, mean_score, std_dev in zip(
        df_var_dim['Variable Label'],
        df_var_dim['Variable MS'],
        df_var_dim['Variable SD'],
    )
}

In [61]:
# Turn every feature into a z-score against the reference statistics:
# (normed count - reference mean) / reference SD.
# Features with no entry in 'var_dim_dict' are skipped and therefore do not
# appear in 'df_dim5_standardised'.
for feature in df_dim5_normed_ordered.columns[1:]:  # position 0 is 'file'
    reference_stats = var_dim_dict.get(feature)
    if reference_stats is None:
        continue
    reference_mean, reference_sd = reference_stats
    df_dim5_standardised[feature] = (
        df_dim5_normed_ordered[feature].sub(reference_mean).div(reference_sd)
    )

# Show the z-score table
df_dim5_standardised

Unnamed: 0,file,vbsubfut,cjou,vbfutpres,mddever,mdpoder,cjcond,advlikl,cjcoorphr,nplac,vbsubpast,adjaffi,vbimpf,vbpast
0,t000001,-0.683333,0.450475,-0.634304,-0.623711,-1.062284,-0.823232,-0.904192,4.747394,-1.168622,-0.610778,-0.581673,-0.703892,1.314552
1,t000002,2.056393,-0.803318,-0.190983,-0.623711,4.625729,-0.823232,2.376917,1.891484,-1.168622,-0.610778,0.509851,-0.240317,0.452498
2,t000003,-0.683333,2.196263,-0.634304,-0.623711,1.857723,-0.823232,4.148993,-1.175373,-1.168622,-0.610778,-0.581673,0.010052,1.550433
3,t000004,-0.683333,-0.803318,-0.634304,-0.623711,-1.062284,-0.823232,-0.904192,7.241191,-0.61739,-0.610778,-0.581673,-0.067784,0.390057
4,t000005,-0.683333,-0.803318,-0.634304,-0.623711,-1.062284,-0.823232,3.114616,0.70282,-1.168622,-0.610778,0.755261,-0.703892,1.001891
5,t000006,0.288484,-0.803318,-0.634304,-0.623711,-1.062284,-0.823232,-0.904192,2.088192,-0.741136,-0.610778,-0.581673,-0.457238,0.04897


### Computing the dimension scores

In [62]:
# Dimension 5, positive pole: these z-scores are added to the dimension score.
dim5_variable_list_pos = [
    'vbsubfut', 'cjou', 'vbfutpres', 'mddever',
    'mdpoder', 'cjcond', 'advlikl', 'cjcoorphr',
]

In [63]:
# Dimension 5, negative pole: these z-scores are subtracted from the dimension score.
dim5_variable_list_neg = ['nplac', 'vbsubpast', 'adjaffi', 'vbimpf', 'vbpast']

In [64]:
# Dimension 5 score per row = sum of positive-pole z-scores
#                             minus sum of negative-pole z-scores
dim5_pos_total = df_dim5_standardised[dim5_variable_list_pos].sum(axis=1)
dim5_neg_total = df_dim5_standardised[dim5_variable_list_neg].sum(axis=1)
df_dim5_standardised['dim_scores'] = dim5_pos_total - dim5_neg_total

# Show the table including the new 'dim_scores' column
df_dim5_standardised

Unnamed: 0,file,vbsubfut,cjou,vbfutpres,mddever,mdpoder,cjcond,advlikl,cjcoorphr,nplac,vbsubpast,adjaffi,vbimpf,vbpast,dim_scores
0,t000001,-0.683333,0.450475,-0.634304,-0.623711,-1.062284,-0.823232,-0.904192,4.747394,-1.168622,-0.610778,-0.581673,-0.703892,1.314552,2.217226
1,t000002,2.056393,-0.803318,-0.190983,-0.623711,4.625729,-0.823232,2.376917,1.891484,-1.168622,-0.610778,0.509851,-0.240317,0.452498,9.566648
2,t000003,-0.683333,2.196263,-0.634304,-0.623711,1.857723,-0.823232,4.148993,-1.175373,-1.168622,-0.610778,-0.581673,0.010052,1.550433,5.063612
3,t000004,-0.683333,-0.803318,-0.634304,-0.623711,-1.062284,-0.823232,-0.904192,7.241191,-0.61739,-0.610778,-0.581673,-0.067784,0.390057,3.194385
4,t000005,-0.683333,-0.803318,-0.634304,-0.623711,-1.062284,-0.823232,3.114616,0.70282,-1.168622,-0.610778,0.755261,-0.703892,1.001891,-0.086606
5,t000006,0.288484,-0.803318,-0.634304,-0.623711,-1.062284,-0.823232,-0.904192,2.088192,-0.741136,-0.610778,-0.581673,-0.457238,0.04897,-0.132509


#### Export to a file

In [65]:
# Write the Dimension 5 standardised values and scores to an Excel workbook
# inside the output directory.
df_dim5_standardised.to_excel(excel_writer=f"{output_directory}/df_dim5_standardised.xlsx")

### Computing the mean dimension scores for the new registers

In [66]:
# Average the per-row Dimension 5 scores into a single value
mean_dim5_scores = df_dim5_standardised.loc[:, 'dim_scores'].mean()
mean_dim5_scores

3.3037925194906452

## Dimension 6

### Filter and order `df_normed_labeled` according to `dim6_variable_list`

The negative pole starts at `Adjectives: Evaluative [adjeval]`; however, this variable is excluded from the analysis, so `dim6_variable_list` below contains only positive-pole variables.

In [67]:
# Variables of Dimension 6, in the column order used for the filtered frame
dim6_variable_list = [
    'objprnrare', 'vb2', 'prnposs', 'cjfinal',
    'prn3obl', 'vbpubl', 'mdhaver',
]

In [68]:
# Keep only the columns of df_normed_labeled that belong to Dimension 6,
# plus the 'file' identifier column.
df_dim6_normed_filtered = df_normed_labeled.loc[
    :, df_normed_labeled.columns.intersection([*dim6_variable_list, 'file'])
]

# Put 'file' first, followed by the variables in 'dim6_variable_list' order
df_dim6_normed_ordered = df_dim6_normed_filtered.loc[:, ['file', *dim6_variable_list]]

# Show the filtered and ordered frame
df_dim6_normed_ordered

Unnamed: 0,file,objprnrare,vb2,prnposs,cjfinal,prn3obl,vbpubl,mdhaver
0,t000001,0,0,10.582011,0,0.0,15.873016,0
1,t000002,0,0,2.739726,0,2.739726,8.219178,0
2,t000003,0,0,0.0,0,21.097046,12.658228,0
3,t000004,0,0,11.278195,0,0.0,11.278195,0
4,t000005,0,0,0.0,0,20.134228,6.711409,0
5,t000006,0,0,5.830904,0,8.746356,14.577259,0


### Standardizing the normed counts

In [69]:
# Start the standardised frame as an independent copy holding only the
# 'file' column; the standardised variables are added to it afterwards.
df_dim6_standardised = df_dim6_normed_ordered.loc[:, ['file']].copy()

In [70]:
# Lookup used for standardisation:
# variable label -> ('Variable MS', 'Variable SD') pair taken from df_var_dim.
var_dim_dict = {
    label: (ms_value, sd_entry)
    for label, ms_value, sd_entry in zip(
        df_var_dim['Variable Label'],
        df_var_dim['Variable MS'],
        df_var_dim['Variable SD'],
    )
}

In [71]:
# Standardise every variable column (everything after the leading 'file'
# column) as (value - MS) / SD, using the pair stored in var_dim_dict.
for col in df_dim6_normed_ordered.columns[1:]:
    # Variables without an entry in var_dim_dict are left out of the result
    if col not in var_dim_dict:
        continue
    mean_value, sd_value = var_dim_dict[col]
    df_dim6_standardised[col] = df_dim6_normed_ordered[col].sub(mean_value).div(sd_value)

# Show the standardised frame
df_dim6_standardised

Unnamed: 0,file,objprnrare,vb2,prnposs,cjfinal,prn3obl,vbpubl,mdhaver
0,t000001,-0.53012,-0.347826,-0.008685,-0.125,-1.372439,0.946287,-0.217391
1,t000002,-0.53012,-0.347826,-0.88199,-0.125,-0.862248,-0.238517,-0.217391
2,t000003,-0.53012,-0.347826,-1.187082,-0.125,2.556247,0.448642,-0.217391
3,t000004,-0.53012,-0.347826,0.068841,-0.125,-1.372439,0.235015,-0.217391
4,t000005,-0.53012,-0.347826,-1.187082,-0.125,2.376951,-0.471918,-0.217391
5,t000006,-0.53012,-0.347826,-0.537761,-0.125,0.256305,0.745706,-0.217391


### Computing the dimension scores

In [72]:
# Positive-pole variables of Dimension 6: their standardised values are
# added when the dimension score is computed.
dim6_variable_list_pos = [
    'objprnrare', 'vb2', 'prnposs', 'cjfinal',
    'prn3obl', 'vbpubl', 'mdhaver',
]

In [73]:
# Left empty: no negative-pole variables are used for Dimension 6
dim6_variable_list_neg = []

In [74]:
# Dimension 6 score per row: sum of the positive-pole standardised values
# minus the sum of the negative-pole standardised values (the negative list
# is empty here, so that second sum contributes zero).
df_dim6_standardised['dim_scores'] = (
    df_dim6_standardised[dim6_variable_list_pos].sum(axis=1)
    .sub(df_dim6_standardised[dim6_variable_list_neg].sum(axis=1))
)

# Show the frame, now including the 'dim_scores' column
df_dim6_standardised

Unnamed: 0,file,objprnrare,vb2,prnposs,cjfinal,prn3obl,vbpubl,mdhaver,dim_scores
0,t000001,-0.53012,-0.347826,-0.008685,-0.125,-1.372439,0.946287,-0.217391,-1.655175
1,t000002,-0.53012,-0.347826,-0.88199,-0.125,-0.862248,-0.238517,-0.217391,-3.203094
2,t000003,-0.53012,-0.347826,-1.187082,-0.125,2.556247,0.448642,-0.217391,0.597469
3,t000004,-0.53012,-0.347826,0.068841,-0.125,-1.372439,0.235015,-0.217391,-2.288921
4,t000005,-0.53012,-0.347826,-1.187082,-0.125,2.376951,-0.471918,-0.217391,-0.502387
5,t000006,-0.53012,-0.347826,-0.537761,-0.125,0.256305,0.745706,-0.217391,-0.756089


#### Export to a file

In [75]:
# Write the Dimension 6 standardised values and scores to an Excel workbook
# inside the output directory.
df_dim6_standardised.to_excel(excel_writer=f"{output_directory}/df_dim6_standardised.xlsx")

### Computing the mean dimension scores for the new registers

In [76]:
# Average the per-row Dimension 6 scores into a single value
mean_dim6_scores = df_dim6_standardised.loc[:, 'dim_scores'].mean()
mean_dim6_scores

-1.3013661848024505

## Summary

In [77]:
# Mean dimension scores computed earlier in the notebook, in dimension order
mean_scores = [
    mean_dim1_scores,
    mean_dim2_scores,
    mean_dim3_scores,
    mean_dim4_scores,
    mean_dim5_scores,
    mean_dim6_scores,
]

# Matching row labels: 'Dimension 1' ... 'Dimension 6'
dimensions = ['Dimension ' + str(number) for number in range(1, 7)]

# One row per dimension: its label and its mean score
df_mean_dim_scores = pd.DataFrame(
    {
        'Dimension': dimensions,
        'Mean Dimension Score': mean_scores,
    }
)

# Show the summary table
df_mean_dim_scores

Unnamed: 0,Dimension,Mean Dimension Score
0,Dimension 1,-70.227612
1,Dimension 2,9.556826
2,Dimension 3,-19.990313
3,Dimension 4,2.89667
4,Dimension 5,3.303793
5,Dimension 6,-1.301366


### Export to a file

In [78]:
# Write the summary of mean dimension scores to an Excel workbook inside
# the output directory.
df_mean_dim_scores.to_excel(excel_writer=f"{output_directory}/df_mean_dim_scores.xlsx")