In [1]:
import pandas as pd
import numpy as np
import os
from collections import defaultdict

### Rating Sheet Generation

In [2]:
def generate_sheet(file_name, target_list, rate_str, random_state=None):
    """Build one shuffled, blank rating sheet from a word-list workbook.

    Parameters
    ----------
    file_name : str
        Path of the Excel workbook to read with ``pd.read_excel`` defaults.
    target_list : list of str
        Names of the columns to keep from the workbook.
    rate_str : str
        Name of the empty column appended for the rater to fill in.
    random_state : int, numpy random generator or None, optional
        Forwarded to ``DataFrame.sample``. ``None`` (the default) keeps the
        previous behaviour of a different shuffle on every call; pass an int
        to make a sheet reproducible.

    Returns
    -------
    pandas.DataFrame
        Every row of the selected columns in random order (the original index
        labels are kept), plus a ``rate_str`` column filled with NaN.
    """
    data = pd.read_excel(file_name)
    # sample(frac=1) returns all rows in shuffled order as a new frame, so the
    # column assignment below never touches the frame that was read.
    shuffled = data[target_list].sample(frac=1, random_state=random_state)
    shuffled[rate_str] = np.nan
    return shuffled

In [4]:
# Write 20 AoA-stage rating workbooks, each with a freshly shuffled 'nouns'
# sheet and 'verbs' sheet.
for index in range(20):
    # os.path.join keeps the path valid on non-Windows systems as well.
    sheet_path = os.path.join('rating_sheets', 'AoA_stage', 'AoA_stage_{}.xlsx'.format(index))
    noun_sheet = generate_sheet('freq_noun_merged_selected.xlsx', ['pinyin', 'word'], 'AoA stage')
    verb_sheet = generate_sheet('freq_verb_merged_selected.xlsx', ['pinyin', 'word'], 'AoA stage')
    # The context manager saves and closes the workbook even if a write fails;
    # ExcelWriter.save() no longer exists in pandas 2.x.
    with pd.ExcelWriter(sheet_path) as writer:
        noun_sheet.to_excel(writer, sheet_name='nouns')
        verb_sheet.to_excel(writer, sheet_name='verbs')

In [5]:
# Write 20 imageability rating workbooks, each with a freshly shuffled 'nouns'
# sheet and 'verbs' sheet.
for index in range(20):
    # os.path.join keeps the path valid on non-Windows systems as well.
    sheet_path = os.path.join('rating_sheets', 'imageability', 'imageability_{}.xlsx'.format(index))
    noun_sheet = generate_sheet('freq_noun_merged_selected.xlsx', ['pinyin', 'word'], 'imageability')
    verb_sheet = generate_sheet('freq_verb_merged_selected.xlsx', ['pinyin', 'word'], 'imageability')
    # The context manager saves and closes the workbook even if a write fails;
    # ExcelWriter.save() no longer exists in pandas 2.x.
    with pd.ExcelWriter(sheet_path) as writer:
        noun_sheet.to_excel(writer, sheet_name='nouns')
        verb_sheet.to_excel(writer, sheet_name='verbs')

In [6]:
# Disabled cell (kept commented out). For each of the 20 indices it would create
# rating_sheets\packages\packed_<index> and copy into it both instruction text
# files plus that index's AoA-stage and imageability workbooks, using the
# Windows `copy` shell command through os.system.
# for index in range(20):
#     dir_name = 'rating_sheets\\packages\\packed_{}'.format(index)
#     os.makedirs(dir_name)

#     instr_file = 'rating_sheets\\instruction_AoAstage.txt'
#     target_file = dir_name + '\\instruction_AoAstage.txt'
#     os.system('copy {} {}'.format(instr_file, target_file))

#     instr_file = 'rating_sheets\\instruction_imageability.txt'
#     target_file = dir_name + '\\instruction_imageability.txt'
#     os.system('copy {} {}'.format(instr_file, target_file))

#     instr_file = 'rating_sheets\\AoA_stage\\AoA_stage_{}.xlsx'.format(index)
#     target_file = dir_name + '\\AoA_stage_{}.xlsx'.format(index)
#     os.system('copy {} {}'.format(instr_file, target_file))

#     instr_file = 'rating_sheets\\imageability\\imageability_{}.xlsx'.format(index)
#     target_file = dir_name + '\\imageability_{}.xlsx'.format(index)
#     os.system('copy {} {}'.format(instr_file, target_file))

### Rating Sheet Results Processing

In [2]:
# Split the returned rating workbooks by filename prefix.
# The listing is sorted because os.listdir returns entries in arbitrary order,
# and the order of these lists decides which rater ends up in which column of
# the tables built from them.
result_files = sorted(os.listdir('rating_sheets/result'))
AoA_files = [name for name in result_files if name.startswith('AoA')]
img_files = [name for name in result_files if name.startswith('image')]
print(AoA_files, img_files)

['AoA_stage_0.xlsx', 'AoA_stage_1-吴尘(1).xlsx', 'AoA_stage_10安安(1).xlsx', 'AoA_stage_11.xlsx', 'AoA_stage_12_x(1).xlsx', 'AoA_stage_13-1.xlsx', 'AoA_stage_14.xlsx', 'AoA_stage_15.xlsx', 'AoA_stage_16(1).xlsx', 'AoA_stage_17(1).xlsx', 'AoA_stage_18.xlsx', 'AoA_stage_19.xlsx', 'AoA_stage_2(1).xlsx', 'AoA_stage_3_lc.xlsx', 'AoA_stage_4.xlsx', 'AoA_stage_5.xlsx', 'AoA_stage_6.xlsx', 'AoA_stage_7.xlsx', 'AoA_stage_8.xlsx', 'AoA_stage_9.xlsx'] ['imageability_0_zyj.xlsx', 'imageability_1(3).xlsx', 'imageability_10_lzs.xlsx', 'imageability_11.xlsx', 'imageability_12.xlsx', 'imageability_13_zya.xlsx', 'imageability_14(1).xlsx', 'imageability_15(1).xlsx', 'imageability_16.xlsx', 'imageability_17_fym.xlsx', 'imageability_18-支(1).xlsx', 'imageability_19.xlsx', 'imageability_2_weh.xlsx', 'imageability_3_czn.xlsx', 'imageability_4.xlsx', 'imageability_5.xlsx', 'imageability_6_xty.xlsx', 'imageability_7(1).xlsx', 'imageability_8_cyt.xlsx', 'imageability_9.xlsx']


In [9]:
def _summarise_votes(votes):
    """Turn ``{row id: [rating, ...]}`` into a vote table.

    Returns a DataFrame with one row per id and one integer-labelled column
    per rater (0, 1, ...), followed by ``'result'`` (the most frequent rating;
    ties go to the smallest value because ``np.argmax`` returns the first
    maximum) and ``'t_score'`` (the number of distinct ratings given).
    An empty mapping yields an empty DataFrame.
    """
    if not votes:
        return pd.DataFrame()

    rows = {}
    for key, ratings in votes.items():
        # np.bincount only accepts non-negative integer ratings.
        counts = np.bincount(ratings)
        rows[key] = list(ratings) + [np.argmax(counts), (counts != 0).sum()]

    table = pd.DataFrame(rows).transpose()
    # The two summary columns are always the last two, however many rater
    # files were actually kept.
    n_raters = table.shape[1] - 2
    return table.rename(columns={n_raters: 'result', n_raters + 1: 't_score'})


def get_vote(file_list, feature = 'AoA stage'):
    """Collect every rater's answers and compute a majority vote per row id.

    Each workbook in ``rating_sheets/result/`` is read once; sheet 0 is
    treated as the nouns and sheet 1 as the verbs. A workbook with a missing
    value in ``feature`` on either sheet is reported and skipped entirely.

    Rows are matched across workbooks by the id stored in the first column
    (``index_col=0``). This assumes the result workbooks keep the layout
    written by ``DataFrame.to_excel`` in the sheet-generation cells, where the
    first column holds the original row id -- needed because every sheet is
    shuffled independently.

    Parameters
    ----------
    file_list : list of str
        Workbook file names inside ``rating_sheets/result/``.
    feature : str
        Name of the rating column to tally.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Noun and verb tables: one column per kept workbook (0, 1, ...) plus
        ``'result'`` and ``'t_score'``. Both are empty if no workbook was kept.
    """
    noun_dict = defaultdict(list)
    verb_dict = defaultdict(list)

    for file in file_list:
        sheets = pd.read_excel('rating_sheets/result/' + file, sheet_name=[0, 1], index_col=0)
        df_nouns, df_verbs = sheets[0], sheets[1]

        if df_nouns[feature].hasnans or df_verbs[feature].hasnans:
            print(file, "has nans!")
            continue

        for idx, value in zip(df_nouns.index.values, df_nouns[feature].values):
            noun_dict[idx].append(value)

        for idx, value in zip(df_verbs.index.values, df_verbs[feature].values):
            verb_dict[idx].append(value)

    return _summarise_votes(noun_dict), _summarise_votes(verb_dict)

In [10]:
# Build the noun and verb vote tables from the AoA result workbooks, tallying the 'AoA stage' column.
aoa_noun, aoa_verb = get_vote(AoA_files, 'AoA stage')

In [11]:
# Remove pandas' column and row display limits (a global setting that stays in
# effect for later cells), then show the noun vote table ordered by index.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
aoa_noun.sort_index()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,result,t_score
1,1,2,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,2,1,3,1,3
2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,2
3,1,2,1,1,3,1,2,1,1,3,2,2,1,1,2,2,3,3,1,2,1,3
4,1,2,1,2,2,2,1,1,1,1,2,2,1,1,1,1,2,1,1,1,1,2
5,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,2
6,1,1,1,1,1,2,1,1,1,2,1,2,1,2,1,1,3,1,1,1,1,3
7,2,2,1,1,1,1,2,2,1,2,1,1,1,1,3,2,3,3,2,2,1,3
8,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2
9,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2
10,1,2,1,1,1,1,1,1,2,1,1,1,1,1,1,1,2,1,1,1,1,2


In [12]:
# Same view for the verb vote table; the column display limit is removed again here.
pd.options.display.max_columns = None
aoa_verb.sort_index()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,result,t_score
0,1,2,1,1,2,2,2,2,1,2,1,1,2,2,1,2,2,1,2,1,2,2
1,1,2,1,1,1,1,2,2,1,1,1,2,1,1,1,1,1,1,1,2,1,2
2,1,2,1,1,1,1,2,1,1,1,1,1,1,2,2,2,1,1,2,1,1,2
3,1,2,1,1,1,1,2,2,1,3,1,1,1,1,1,1,1,2,1,1,1,3
4,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2
5,1,2,1,1,2,1,2,1,1,1,1,1,1,1,2,1,1,2,1,1,1,2
6,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,2
7,1,3,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,3,1,1,1,3
8,1,2,1,1,1,1,2,1,1,1,1,1,1,1,1,2,2,1,2,1,1,2
9,1,2,1,1,1,1,1,1,1,1,2,1,1,1,2,1,1,1,1,1,1,2


In [66]:
# Count how many nouns received each value in the second-to-last column of the
# vote table (the 'result' column in the table displayed above).
noun_res = aoa_noun[aoa_noun.columns[-2]].values
print(np.bincount(noun_res))

[ 0 83 72 30]


In [67]:
# Count how many verbs received each value in the second-to-last column of the
# vote table (the 'result' column in the table displayed above).
verb_res = aoa_verb[aoa_verb.columns[-2]].values
print(np.bincount(verb_res))

[ 0 58 71 33]


In [None]:
# The classes are unbalanced when the majority vote is used as the label (see the counts printed above).

In [34]:
def _mean_by_index(series_list, name):
    """Average a list of rating Series per index label, sorted by label.

    Returns an empty float Series called ``name`` when the list is empty.
    """
    if not series_list:
        return pd.Series(dtype=float, name=name)
    # A single concat avoids re-copying the accumulated data on every file.
    return pd.concat(series_list).groupby(level=0, sort=True).mean()


def get_mean(file_list, normalize=False, feature='imageability'):
    """Average one rating column over all raters, separately for nouns and verbs.

    Each workbook in ``rating_sheets/result/`` is read once; sheet 0 is
    treated as the nouns and sheet 1 as the verbs. Missing ratings are ignored
    by the mean rather than causing the file to be skipped.

    Rows are matched across workbooks by the id stored in the first column
    (``index_col=0``). This assumes the result workbooks keep the layout
    written by ``DataFrame.to_excel`` in the sheet-generation cells, where the
    first column holds the original row id -- needed because every sheet is
    shuffled independently.

    Parameters
    ----------
    file_list : list of str
        Workbook file names inside ``rating_sheets/result/``.
    normalize : bool, optional
        If True, z-score each rater's column (per sheet, using that sheet's
        own mean and sample standard deviation) before averaging.
    feature : str, optional
        Name of the rating column to average. Defaults to ``'imageability'``,
        the column this function previously hard-coded; pass ``'AoA stage'``
        for the AoA workbooks.

    Returns
    -------
    (pandas.Series, pandas.Series)
        Mean rating per row id for nouns and for verbs, sorted by id. Both are
        empty when ``file_list`` is empty.
    """
    noun_ratings, verb_ratings = [], []

    for file in file_list:
        sheets = pd.read_excel('rating_sheets/result/' + file, sheet_name=[0, 1], index_col=0)

        for sheet_key, collected in ((0, noun_ratings), (1, verb_ratings)):
            ratings = sheets[sheet_key][feature]
            if normalize:
                # Select by name, not by column position, so the right column
                # is normalised whatever the sheet's column order is.
                ratings = (ratings - ratings.mean()) / ratings.std()
            collected.append(ratings)

    return _mean_by_index(noun_ratings, feature), _mean_by_index(verb_ratings, feature)

In [35]:
# Mean imageability per row label, without normalisation.
# NOTE(review): only the first two result files (img_files[0:2]) are averaged
# here -- confirm whether all of img_files was intended.
img_nouns_mean, img_verbs_mean = get_mean(img_files[0:2], False)
img_nouns_mean.head()

Unnamed: 0,imageability
1,6.5
2,4.0
3,5.0
4,5.0
5,4.0


In [None]:


# NOTE(review): unexecuted scratch cell. `file_list` is not defined at notebook
# level in the cells visible here (it only appears as a function parameter), so
# this would raise NameError on a fresh Run All -- confirm whether it can go.
for file in file_list:
    df_nouns = pd.read_excel('rating_sheets/result/' + file, sheet_name=0)

In [36]:
# First rows of the verb imageability means.
img_verbs_mean.head()

Unnamed: 0,imageability
0,1.5
1,1.5
2,1.0
3,2.5
4,6.0


In [8]:
# Means over all AoA result files, without normalisation.
# NOTE(review): get_mean as defined in this notebook aggregates the
# 'imageability' column by default, while the outputs below show an
# 'AoA stage' column -- this cell appears to have been run against a different
# version of get_mean; confirm before re-running.
aoa_nouns_mean, aoa_verbs_mean = get_mean(AoA_files, False)

In [24]:
# Tercile cut points (1/3 and 2/3 quantiles) of the noun means.
print(aoa_nouns_mean.quantile(q=1/3))
print(aoa_nouns_mean.quantile(q=2/3))

AoA stage    1.529412
Name: 0.3333333333333333, dtype: float64
AoA stage    2.058824
Name: 0.6666666666666666, dtype: float64


In [25]:
# Tercile cut points (1/3 and 2/3 quantiles) of the verb means.
print(aoa_verbs_mean.quantile(q=1/3))
print(aoa_verbs_mean.quantile(q=2/3))

AoA stage    1.588235
Name: 0.3333333333333333, dtype: float64
AoA stage    2.176471
Name: 0.6666666666666666, dtype: float64


In [45]:
def _tercile_class(x, means):
    """Return 1, 2 or 3 according to where ``x`` falls among the terciles of ``means``.

    The cut points are the 1/3 and 2/3 quantiles of the first column of the
    DataFrame ``means``; a value equal to a cut point falls in the lower
    class. Anything that is not <= the upper cut point, including NaN, maps
    to 3.
    """
    # Both quantiles come from one call. .iloc[:, 0] picks the first column by
    # position explicitly; plain [0] on the label-indexed quantile result
    # relies on a deprecated positional fallback.
    lower, upper = means.quantile(q=[1/3, 2/3]).iloc[:, 0]
    if x <= lower:
        return 1
    if x <= upper:
        return 2
    return 3


def aoa_noun_trans(x):
    """Tercile class (1-3) of a noun rating, using the notebook-level ``aoa_nouns_mean`` table."""
    return _tercile_class(x, aoa_nouns_mean)


def aoa_verb_trans(x):
    """Tercile class (1-3) of a verb rating, using the notebook-level ``aoa_verbs_mean`` table."""
    return _tercile_class(x, aoa_verbs_mean)

In [44]:
# Label each noun row by passing its 'AoA stage' value through aoa_noun_trans.
# Note: this adds the 'class' column to aoa_nouns_mean in place.
aoa_nouns_mean['class'] = aoa_nouns_mean['AoA stage'].apply(aoa_noun_trans)
aoa_nouns_mean.head()

Unnamed: 0,AoA stage,class
1,1.235294,1
2,1.058824,1
3,1.705882,2
4,1.352941,1
5,1.058824,1


In [48]:
# Label each verb row by passing its 'AoA stage' value through aoa_verb_trans.
# Note: this adds the 'class' column to aoa_verbs_mean in place.
aoa_verbs_mean['class'] = aoa_verbs_mean['AoA stage'].apply(aoa_verb_trans)
aoa_verbs_mean.head(50)

Unnamed: 0,AoA stage,class
0,1.529412,1
1,1.235294,1
2,1.294118,1
3,1.176471,1
4,1.0,1
5,1.235294,1
6,1.058824,1
7,1.176471,1
8,1.235294,1
9,1.117647,1
