# Check code-switching
This notebook checks whether the tweets labelled as Catalan or Spanish contain any code-switching (switching between languages within a single tweet). A legitimate concern!

In [4]:
import codecs
import json

import numpy as np
import pandas as pd

## Load data
We'll use the data from the independence referendum dataset (IT) rather than the controls (XT).

In [13]:
# Read the tweet archive: one JSON object per line.
# The `with` block closes the file handle once every line has been parsed,
# and blank lines (e.g. a trailing newline) are skipped instead of crashing json.loads.
with codecs.open('../../data/tweets/archive_Jan-01-17_Oct-31-17_ref_hashtags_filtered.json', 'r', encoding='utf-8') as tweet_file:
    ref_tweets = [json.loads(line.strip()) for line in tweet_file if line.strip()]
# Build the frame from the list of records in a single call rather than one Series per tweet;
# dtype=object keeps every value as parsed (no int -> float coercion in columns with missing entries).
ref_tweet_data = pd.DataFrame(ref_tweets, dtype=object)
lang_id_data = pd.read_csv('../../data/tweets/archive_Jan-01-17_Oct-31-17_ref_hashtags_filtered_langid.csv',
                           index_col=False, encoding='utf-8')
# join on tweet ID; the inner join keeps only tweets present in both tables
ref_tweet_data = pd.merge(ref_tweet_data, lang_id_data, on='id', how='inner')

In [None]:
# preview the first rows of the merged tweet / language-ID table
ref_tweet_data.head(n=5)

In [15]:
# boolean masks over the 'lang' column, one per language of interest
is_catalan = ref_tweet_data['lang'] == 'ca'
is_spanish = ref_tweet_data['lang'] == 'es'
ca_tweets = ref_tweet_data[is_catalan]
es_tweets = ref_tweet_data[is_spanish]
# report how many tweets fall under each label
print('%d Catalan tweets'%(len(ca_tweets)))
print('%d Spanish tweets'%(len(es_tweets)))

2549 Catalan tweets
4320 Spanish tweets


Let's get a sample of 50 Catalan and 50 Spanish tweets.

In [17]:
# Seed NumPy's global RNG so the same tweets are drawn on every run.
# (`pd.np` was deprecated in pandas 1.0 and removed in 2.0, so NumPy is used directly.)
np.random.seed(123)
N = 100
# Half of the sample per language. Floor division keeps the size an int:
# `N / 2` is a float on Python 3, which np.random.choice rejects as a size.
sample_size = N // 2
# Draw row labels without replacement, then pull the matching rows.
ca_sample = ca_tweets.loc[np.random.choice(ca_tweets.index, sample_size, replace=False), :]
es_sample = es_tweets.loc[np.random.choice(es_tweets.index, sample_size, replace=False), :]
# Catalan rows first, then Spanish rows.
combined_sample = pd.concat([ca_sample, es_sample])

In [20]:
# drop embedded newlines so that each tweet prints on a single line
text_one_line = combined_sample.loc[:, 'text'].map(lambda tweet_text: tweet_text.replace('\n', ''))
combined_sample.loc[:, 'text'] = text_one_line

In [None]:
# one line per sampled tweet: its language label, a tab, then its text
print('\n'.join(
    'lang=%s:\t%s'%(tweet_row.loc['lang'], tweet_row.loc['text'])
    for tweet_idx, tweet_row in combined_sample.iterrows()
))

Dump the sample to a file for annotation; the annotated file will be re-uploaded later.

In [27]:
# write the sample as a tab-separated file, without the row index
sample_out_file = '../../data/tweets/CA_ES_sample_tweets.tsv'
combined_sample.to_csv(sample_out_file, sep='\t', index=False, encoding='utf-8')