Real data analysis

In [1]:
# Preliminaries
import pandas as pd
import numpy as np
from umc_module import main_program # import self define module

In [2]:
# Read the workbook, then swap every False/True cell for 0/1.
df_raw = pd.read_excel('dataset.xlsx').replace([False, True], [0, 1])

In [3]:
# Only the last expression of a cell is rendered, so the column index below
# is evaluated but not shown; the displayed value is the set of distinct
# labels found in the `dataset` column.
df_raw.columns
df_raw.dataset.unique()

array(['dutch3M', 'french10M', 'italian10M', 'english10M'], dtype=object)

In [4]:
def _rows_for(label):
    '''Rows of df_raw whose `dataset` value equals `label`, re-indexed from 0.'''
    return df_raw[df_raw['dataset'] == label].reset_index(drop=True)


# One frame per corpus label.
df_eng = _rows_for('english10M')
df_fre = _rows_for('french10M')
df_dut = _rows_for('dutch3M')
df_ita = _rows_for('italian10M')

# (rows, columns) of each subset, in the order French, Dutch, Italian, English.
tuple(frame.shape for frame in (df_fre, df_dut, df_ita, df_eng))

((7126, 36), (5072, 36), (9638, 36), (4178, 36))

In [5]:
# Column names passed as the feature list to the multiple-testing routine.
featurelist = [
    'negations',
    'refs_to_other',
    'refs_to_group',
    'refs_to_self',
    'conjunctions',
    'has_initial_conjunction',
    'basic_conjunctions',
    'other_conjunctions',
    'levelers',
    'sense_words',
    'causal_words',
    'cognitive_words',
    'has_question_mark',
    'subj_verb_inversion',
    'structure',
    'use',
    'qwords_literal',
    'qwords_functional',
    'sentiment',
    'is_question',
    'tweet_hashtags',
    'tweet_num_questions',
    'tweet_has_question',
]

# Candidate grouping columns; the analysis cells pick one of them by position.
groupvariable = [
    'tweet_has_disinfo_hashtags',
    'tweet_has_disinfo_text',
    'tweet_has_disinfo_text_or_hashtags',
]

In [16]:
def holm(pvals, alpha, return_decisions=False):
    '''
    Holm (step-down Bonferroni) adjustment of a family of p-values.

    pvals is the collection of raw p-values (list, tuple, NumPy array or
    pandas Series); it is converted to a float array and never modified //
    alpha is FWER, family-wise error rate, e.g. 0.1; it is only consulted
    when return_decisions is True //
    return_decisions selects what is returned (default False).

    Returns
    -------
    numpy.ndarray
        When return_decisions is False: the Holm-adjusted p-values sorted
        in ascending order of the raw p-values (NOT in input order),
        each capped at 1.
    list of str
        When return_decisions is True: one entry per input p-value, in
        input order, 'Reject null hypothesis' if its adjusted p-value is
        <= alpha and 'Accept null hypothesis' otherwise.

    An empty input yields an empty array / empty list.
    '''
    # Coerce up front: a plain list cannot be indexed with an index array,
    # and a Series would be indexed by label instead of by position.
    raw = np.asarray(pvals, dtype=float)
    m = len(raw)
    index = np.argsort(raw)

    # Step-down multipliers m, m-1, ..., 1 applied to the ascending p-values.
    scaled = (m - np.arange(m)) * raw[index]
    # The running maximum keeps the adjusted values non-decreasing; the cap
    # at 1 keeps them valid probabilities.
    pval_ordered = np.minimum(np.maximum.accumulate(scaled), 1.0)

    if not return_decisions:
        return pval_ordered

    # Undo the sort so that decision i refers to pvals[i].
    new_pvals = pval_ordered[np.argsort(index)]
    # The adjusted p-values already account for multiplicity, so they are
    # compared with alpha itself rather than with a per-test threshold.
    return ['Reject null hypothesis' if p <= alpha else 'Accept null hypothesis'
            for p in new_pvals]

In [30]:
# English corpus; groups come from groupvariable[2],
# i.e. 'tweet_has_disinfo_text_or_hashtags'.
p_value = main_program.multiple_test(
    featurelist,
    groupvariable[2],
    df_eng,
    [0.75, 0.75],
)
result = main_program.holm(p_value, alpha=0.1)
# Distinct values present in the result.
np.unique(np.asarray(result))

array(['Accept null hypothesis'], dtype='<U22')

In [40]:
# French corpus; groups come from groupvariable[2],
# i.e. 'tweet_has_disinfo_text_or_hashtags'.
p_value = main_program.multiple_test(
    featurelist,
    groupvariable[2],
    df_fre,
    [0.75, 0.75],
)
result = main_program.holm(p_value, alpha=0.1)
# Distinct values present in the result.
np.unique(np.asarray(result))

array(['Accept null hypothesis', 'Reject null hypothesis'], dtype='<U22')

In [43]:
# Dutch corpus; groups come from groupvariable[2],
# i.e. 'tweet_has_disinfo_text_or_hashtags'.
# NOTE(review): the recorded run of this cell ended in
# `ValueError: math domain error`, and it passes [0.80, 0.80] where the other
# corpora pass [0.75, 0.75] -- confirm the intended value and where the
# error is raised.
p_value = main_program.multiple_test(
    featurelist,
    groupvariable[2],
    df_dut,
    [0.80, 0.80],
)
result = main_program.holm(p_value, alpha=0.1)
# Distinct values present in the result.
np.unique(np.asarray(result))

ValueError: math domain error

In [45]:
# Italian corpus; groups come from groupvariable[2],
# i.e. 'tweet_has_disinfo_text_or_hashtags'.
p_value = main_program.multiple_test(
    featurelist,
    groupvariable[2],
    df_ita,
    [0.75, 0.75],
)
result = main_program.holm(p_value, alpha=0.1)
# Distinct values present in the result.
np.unique(np.asarray(result))

array(['Accept null hypothesis', 'Reject null hypothesis'], dtype='<U22')