# Experiment 2: audience effect
In which we determine whether intended audience (using @-reply as "audience") affects Catalan usage in control data.

In [1]:
import pandas as pd

## Load data

In [2]:
all_tweet_data = pd.read_csv('../../data/tweets/extra_user_tweets/Jan-01-17_Oct-31-17_user_tweets.tsv', sep='\t', index_col=False)
# fillna
all_tweet_data.fillna('', inplace=True)
print(all_tweet_data.head())

                   id             user  \
0  850626678380535808     CampsCliment   
1  850628981040848896         Dom70Bcn   
2  850631111747276800  Estrellas_Siete   
3  850632361649860608    pasionxespana   
4  850632420365873152        MH17files   

                                                text  \
0  RT @_Gafas_y_reloj_: El nulo interés de PP y C...   
1  RT @MTudela: ‘Té gràcia estudiar a una univers...   
2  Las empresas valencianas pueden solicitar #sub...   
3  #españaesuna #stopUE #stopOTAN #stopLGTB #stop...   
4  RT @pthefigg: @EliotHiggins @benimmo @DFRLab @...   

                                            hashtags  contains_ref_hashtag  \
0                                                                        0   
1                                                                        0   
2         subvenciones,contratacion,jovenes,Valencia                     0   
3  españaesuna,stopUE,stopOTAN,stopLGTB,stopgloba...                     0   
4                   

In [3]:
# tag at-replies
import re
at_matcher = re.compile('@\w+')
all_tweet_data.loc[:, 'reply'] = all_tweet_data.apply(lambda r: int(len(at_matcher.findall(r.loc['text'])) > 0 and r.loc['retweeted']==0), axis=1)
# and hashtag counts
all_tweet_data.loc[:, 'hashtag_count'] = all_tweet_data.loc[:, 'hashtags'].apply(lambda x: len(x.split(',')) if x != '' else 0)

## Test for difference in reply vs. non-reply

In [41]:
print(all_tweet_data.loc[:, 'reply'].value_counts() / all_tweet_data.shape[0])

0    0.812205
1    0.187795
Name: reply, dtype: float64


Only 18% of tweets are replies!

In [4]:
# filter for retweets
tweet_data_original = all_tweet_data[all_tweet_data.loc[:, 'retweeted'] == 0]

In [44]:
# count replies
print(tweet_data_original.loc[:, 'reply'].value_counts() / tweet_data_original.shape[0])

0    0.510152
1    0.489848
Name: reply, dtype: float64


Much better ratio!

Hypothesis:

$H_{0}$: Tweeters who use @-replies are just as likely to use Catalan in @-reply as in non-reply (controlling for hashtag usage).

In [97]:
tweet_data_original_with_ref = tweet_data_original[tweet_data_original.loc[:, 'contains_ref_hashtag']==1]
print(tweet_data_original_with_ref.loc[:, 'reply'].value_counts())

0    1267
1     386
Name: reply, dtype: int64


Small sample size...but we push ahead!

In [98]:
relevant_users = tweet_data_original_with_ref.groupby('user').apply(lambda x: (x.loc[:, 'reply'].min()==0 and
                                                                               x.loc[:, 'reply'].max()==1))
relevant_users = relevant_users[relevant_users].index.tolist()
print('%d relevant users'%(len(relevant_users)))
tweet_data_original_with_ref_relevant = tweet_data_original_with_ref[tweet_data_original_with_ref.loc[:, 'user'].isin(relevant_users)]
print('%d relevant tweets'%(tweet_data_original_with_ref_relevant.shape[0]))

51 relevant users
128 relevant tweets


In [99]:
# compute differences in CA use
from __future__ import division
from scipy.stats import ttest_1samp
tweet_data_original_with_ref_relevant_reply = tweet_data_original_with_ref_relevant[tweet_data_original_with_ref_relevant.loc[:, 'reply']==1]
tweet_data_original_with_ref_relevant_non_reply = tweet_data_original_with_ref_relevant[tweet_data_original_with_ref_relevant.loc[:, 'reply']==0]
ca_reply_use = tweet_data_original_with_ref_relevant_reply.groupby('user').apply(lambda x: (x.loc[:, 'lang']=='ca').astype(int).sum() / x.shape[0])
ca_non_reply_use = tweet_data_original_with_ref_relevant_non_reply.groupby('user').apply(lambda x: (x.loc[:, 'lang']=='ca').astype(int).sum() / x.shape[0])
ca_use_diff = ca_reply_use - ca_non_reply_use
d_u = ca_use_diff.mean()
d_u_err = ca_use_diff.std()
h_0 = 0.
t_stat, p_val = ttest_1samp(ca_use_diff, h_0)
print('d_u = %.5f, err = %.5f, t=%.3f (p=%.3E)'%(d_u, d_u_err, t_stat, p_val))

d_u = 0.17974, err = 0.42275, t=3.036 (p=3.798E-03)


**Conclusion 1**:

When only considering tweets with a referendum hashtag, slightly more use of Catalan in replies ($d_{u}=0.157, p=0.00380$).

Weaker condition: use all tweets that contain at least one hashtag.

In [100]:
tweet_data_original_with_hashtag = tweet_data_original[tweet_data_original.loc[:, 'hashtag_count'] > 0]
print('%d tweets with at least one hashtag'%(tweet_data_original_with_hashtag.shape[0]))

24057 tweets with at least one hashtag


In [89]:
from __future__ import division
def run_test(tweet_data):
    relevant_users = tweet_data.groupby('user').apply(lambda x: (x.loc[:, 'reply'].min()==0 and
                                                                 x.loc[:, 'reply'].max()==1))
    relevant_users = relevant_users[relevant_users].index.tolist()
    tweet_data_relevant = tweet_data[tweet_data.loc[:, 'user'].isin(relevant_users)]
    print('%d relevant users'%(len(relevant_users)))
    print('%d relevant tweets'%(tweet_data.shape[0]))
    tweet_data_relevant_reply = tweet_data_relevant[tweet_data_relevant.loc[:, 'reply']==1]
    tweet_data_relevant_non_reply = tweet_data_relevant[tweet_data_relevant.loc[:, 'reply']==0]
    ca_reply_use = tweet_data_relevant_reply.groupby('user').apply(lambda x: (x.loc[:, 'lang']=='ca').astype(int).sum() / x.shape[0])
    ca_non_reply_use = tweet_data_relevant_non_reply.groupby('user').apply(lambda x: (x.loc[:, 'lang']=='ca').astype(int).sum() / x.shape[0])
    ca_use_diff = ca_reply_use - ca_non_reply_use
    d_u = ca_use_diff.mean()
    N = len(ca_use_diff)
    d_u_err = ca_use_diff.std() / N**.5
    h_0 = 0.
    t_stat, p_val = ttest_1samp(ca_use_diff, h_0)
    print('d_u = %.5f, err = %.5f, t=%.3f (p=%.3E)'%(d_u, d_u_err, t_stat, p_val))
    return d_u, d_u_err, t_stat, p_val

In [101]:
test_results = run_test(tweet_data_original_with_hashtag)

476 relevant users
24057 relevant tweets
d_u = -0.00784, err = 0.01426, t=-0.550 (p=5.825E-01)


**Conclusion 2**:

Insignificant tendency toward less Catalan in replies.

Last test: consider only tweets that have no hashtags.

In [91]:
tweet_data_original_no_hashtag = tweet_data_original[tweet_data_original.loc[:, 'hashtag_count'] == 0]
print('%d tweets with no hashtags'%(tweet_data_original_no_hashtag.shape[0]))

57401 tweets with no hashtags


In [92]:
test_results = run_test(tweet_data_original_no_hashtag)

945 relevant users
57401 relevant tweets
d_u = -0.01135, err = 0.00890, t=-1.275 (p=2.026E-01)


**Conclusion 3**:

Insignificant tendency toward less Catalan in replies.

Most basic test: @-reply, no hashtag versus no @-reply, hashtag.

In [8]:
from scipy.stats import ttest_1samp
def run_compare_test(tweet_data_1, tweet_data_2):
    relevant_users = set(tweet_data_1.loc[:, 'user'].unique()) & set(tweet_data_2.loc[:, 'user'].unique())
    tweet_data_relevant_1 = tweet_data_1[tweet_data_1.loc[:, 'user'].isin(relevant_users)]
    tweet_data_relevant_2 = tweet_data_2[tweet_data_2.loc[:, 'user'].isin(relevant_users)]
    print('%d tweets in data 1'%(tweet_data_relevant_1.shape[0]))
    print('%d tweets in data 2'%(tweet_data_relevant_2.shape[0]))
    print('%d relevant users'%(len(relevant_users)))
    ca_1 = tweet_data_relevant_1.groupby('user').apply(lambda x: (x.loc[:, 'lang']=='ca').astype(int).sum() / x.shape[0])
    ca_2 = tweet_data_relevant_2.groupby('user').apply(lambda x: (x.loc[:, 'lang']=='ca').astype(int).sum() / x.shape[0])
    ca_use_diff = ca_1 - ca_2
    d_u = ca_use_diff.mean()
    N = len(ca_use_diff)
    d_u_err = ca_use_diff.std() / N**.5
    h_0 = 0.
    t_stat, p_val = ttest_1samp(ca_use_diff, h_0)
    print('d_u = %.5f, err = %.5f, t=%.3f (p=%.3E)'%(d_u, d_u_err, t_stat, p_val))
    return d_u, d_u_err, t_stat, p_val

In [9]:
tweet_data_1 = tweet_data_original[(tweet_data_original.loc[:, 'reply']==1) & (tweet_data_original.loc[:, 'hashtag_count']==0)]
tweet_data_2 = tweet_data_original[(tweet_data_original.loc[:, 'reply']==0) & (tweet_data_original.loc[:, 'hashtag_count']>0)]

In [10]:
test_results = run_compare_test(tweet_data_1, tweet_data_2)

10370 tweets in data 1
13650 tweets in data 2
921 relevant users
d_u = -0.06189, err = 0.01061, t=-5.830 (p=7.647E-09)


**Conclusion 4**:

Weakly significant effect! **Less** Catalan use in replies.

# Explain experiment 2: explore non-referendum topics

## Control hashtags

What are the most popular control-data hashtags when we take out the referendum-specific hashtags?

In [3]:
ref_hashtags = pd.read_csv('../../data/expanded_fixed_hashtags.csv', index_col=False).loc[:, 'expanded'].values.tolist()

In [20]:
import codecs
pd.set_option('display.max_rows', 100)
hashtag_counts = [l.strip().split(' ') for l in codecs.open('../../data/tweets/extra_user_tweets/hashtags_counts.txt')]
hashtag_counts = filter(lambda x: len(x)>1, hashtag_counts)
hashtag_counts = pd.Series(dict([(h[1], int(h[0])) for h in hashtag_counts])).sort_values(inplace=False, ascending=False)
# remove ref hashtags
ref_hashtags_intersect = set(ref_hashtags) & set(hashtag_counts.index)
hashtag_counts.drop(ref_hashtags_intersect, axis=0, inplace=True)
print(hashtag_counts.head(100))

starmetre                  1324
Venezuela                   806
Barcelona                   743
España                      581
buscandoapacomolina         578
Catalunya                   459
StarMetre:                  440
WorldWide                   440
Trends                      439
stopLGTB                    403
españaesuna                 394
LoMásVisto                  375
ÚltimaHora                  368
Resistencia                 358
URGENTE                     352
beBee                       348
LoMásLeído                  346
…                           333
Caracas                     331
PP                          320
SOSVenezuela                309
ResistenciaVzla             302
LaTorturaNoEsCultura        301
HijoPutismo                 298
AHORA                       297
StarMetre                   292
Noticias                    291
profesReligión              280
stopUE                      273
NoTeLoPierdas               273
PSOE                        257
EstafaBa

Why so many Venezuela tweets?

In [37]:
test_hashtag = 'Venezuela'
tweet_data_with_hashtag = tweet_data_original[tweet_data_original.loc[:, 'hashtags'].apply(lambda x: test_hashtag in x)]
print('--------------\nall data\n--------------\n')
print('\n'.join(tweet_data_with_hashtag.loc[:, 'text'].head(50).values.tolist()))
print('--------------\ncontains referendum hashtag\n--------------\n')
print('\n'.join(tweet_data_with_hashtag[tweet_data_with_hashtag.loc[:, 'contains_ref_hashtag']==1].loc[:, 'text'].head(50).values.tolist()))

--------------
all data
--------------

Hablamos con el economista, José Manuel Puente, para analizar la situación de #Venezuela https://t.co/L0ElvfF4dC
No van a tener nada para saquear si seguimos desabasteciendo las tiendas. #Venezuela https://t.co/v8OEwgkzO5
Pray for Venezuela 🆘🇻🇪🆘 Reza por Venezuela 🆘🇻🇪 🆘 #SOSvenezuela #Venezuela #PrayForVenezuela @Pontifex_es 🇻🇪… https://t.co/ZHQQUZCm4y
@SextaNocheTV Que vergüenza de entrevista @Pablo_Iglesias_ emergencia nacional es la que esta sufriendo #VenezuelaEnDictadura
Periodistas agredidos y robados durante la marcha opositora en Caracas https://t.co/KgAJhWXFTX vía @EFEnoticias #derechagolpista #Venezuela
teleSURtv: #Venezuela: 'Amenaza de EE.UU. contra el país es un acto de locura' https://t.co/yrqGLIS7WH https://t.co/gL3oyFZwnB teleSURtv
¿Cómo se puede definir un Estado? https://t.co/53Fv3X7WYT 601I AA OS #Venezuela
Cuando el chiste se cuenta SOLITO: #TrumpNoPodrasConVenezuela https://t.co/onD35iPTfj HZ #Venezuela UD #LegítimaDefensa
#A

Is this driven by a small group of users?

In [35]:
print(tweet_data_with_hashtag.loc[:, 'user'].value_counts())

hola8400           150
jhenderm6          123
CanariasMcrc        68
mcrc_valladolid     61
McrcJaen            60
MCRC_Vizcaya        55
McrcMurcia          54
McrcGuipuzcoa       53
mcrc_bcn            51
GranadaRC           50
Coruna_MCRC         47
JapreriaVZ          47
BariVenezuela       46
betxi_castellon     37
IndioMapoyo         29
LaRioja_DRC         29
grubell_d3v          3
mywordsworlds1       3
Bank_Practices       2
homequevai           2
manelmarquez         2
ivonnediaz2203       2
social_article       2
allsetic             2
PostreDulce          2
adrifadi             1
Britovius            1
danicorderom         1
ThamarManrique       1
mostrenco2           1
pirataneko           1
Municipal_Bank       1
mcrc_almeria         1
jeff_puente          1
PartidoPodrido       1
PuesPioPio           1
kattolikamente       1
libro_lector         1
StarMetre            1
DonKshu              1
IsaTorres78          1
socialistahn         1
laura_castro3        1
SoyCibelino