# Test social variable collinearity
Some of the social variables may be related, e.g. language use and media sharing. Let's assess this!

In [2]:
import pandas as pd
# Load the tab-separated social data table. Missing values are replaced with
# empty strings, which the later cells use as the "no data" marker.
author_social_data = pd.read_csv(
    '../../data/mined_tweets/loanword_verbs_post_social_data.tsv',
    sep='\t',
).fillna('')
display(author_social_data.head())

Unnamed: 0,loanword,loanword_verb,loanword_type,screen_name,user_id,text,date,id,user_description,user_location,...,us_american_artist_video_count,latin_american_artist_video_pct,total_music_count,latin_american_music_genre_pct,latin_american_music_genre_count,us_american_music_genre_count,latin_american_media_count,us_american_media_count,latin_american_media_pct,integrated_verb_pct
0,audit,auditamos,integrated_loanword,garrachavista,779318307585396736,@Sheiladarsy @anticuarta4 Y dime tu cuales 7.6...,,892506833197424640,100% a la izquierda,Venezuela,...,,,,,,,,,,1.0
1,ban,baneamos,integrated_loanword,emmanuelkiller5,4273648032,@adameamiro Eres tan hipocrita que antes nos h...,,882375684823203841,"Felicidad?Que es eso, lo que todos siempre me ...","Chihuahua, Chihuahua",...,,,,,,,,,,
2,ban,baneamos,integrated_loanword,danitolocirio13,482233447,@LVPibai Ibai te hace una normal de reclu? Así...,,893999585953185793,#21. Me gusta Love Live. Reborn. Baraggan y Es...,"Valladolid, España",...,,,,,,,,,,1.0
3,flip,flipas,integrated_loanword,danitolocirio13,482233447,Me echaba unas rankeds ahora que flipas,,949837604362612736,#22. Vago...,"Valladolid, España",...,,,,,,,,,,1.0
4,ban,banear,integrated_loanword,vcf973,883037197754093569,@MiiKeLMsT MIKEEL! CASI ME BANEAN POR COMPRAR ...,,899277329012334593,.,Tenerife,...,,,,,,,,,,


### Test variable correlations
We'll test the association between categorical variables with a chi-squared test of independence. We will first convert the scalar values to categorical values (bins) to make things easier.

In [15]:
import numpy as np
# Discretise each scalar variable into labelled bins and store the labels in
# a new `<variable>_bin` column.
scalar_vars = ['integrated_verb_pct', 'latin_american_media_pct']
bins = 4
# np.digitize (right=False) with these edges yields:
#   value < 0 -> 0, [0, 0.25) -> 1, [0.25, 0.75) -> 2, value >= 0.75 -> 3
bin_range = [0., 0.25, 0.75]
# add null category for anything below range (default null val is -1)
bin_names = ['', 'low', 'mid', 'high']
NULL_VAL = -1
for scalar_var in scalar_vars:
    bin_scalar_var = f'{scalar_var}_bin'
    # bin index -> bin label
    bin_range_names = {bin_idx: bin_name for bin_idx, bin_name in zip(range(bins), bin_names)}
    # Empty strings (missing values) are swapped for NULL_VAL so that they
    # fall below the first edge and receive the '' label.
    filled_scalar_vals = author_social_data.loc[:, scalar_var].apply(
        lambda val: NULL_VAL if val == '' else val
    )
    scalar_var_bins = np.digitize(filled_scalar_vals, bins=bin_range)
    scalar_var_cat_bins = [bin_range_names.get(bin_idx) for bin_idx in scalar_var_bins]
    author_social_data = author_social_data.assign(**{bin_scalar_var: scalar_var_cat_bins})

In [16]:
# Start with all of the social variables: keep only the rows in which every
# one of them is non-empty.
valid_data = author_social_data.copy()
social_vars = ['es_bin', 'integrated_verb_pct_bin', 'description_location_region', 'latin_american_media_pct_bin']
for social_var in social_vars:
    has_social_var = valid_data.loc[:, social_var].ne('')
    valid_data = valid_data.loc[has_social_var]
# rows kept / rows available
print('%d/%d data'%(len(valid_data), len(author_social_data)))

3275/87610 data


In [26]:
from scipy.stats import chi2_contingency
# Chi-squared test of independence for every unordered pair of social variables.
N = len(social_vars)
var_combos = [
    (first_var, second_var)
    for first_idx, first_var in enumerate(social_vars)
    for second_var in social_vars[first_idx + 1:]
]
for var_1, var_2 in var_combos:
    # Row count per observed (var_1, var_2) value pair; after reset_index()
    # the count column is named 0.
    var_combo_counts = valid_data.groupby([var_1, var_2]).size().reset_index()
    # Reshape into a var_1 x var_2 contingency table; pairs that never occur
    # get a count of 0.
    var_contingency_table = var_combo_counts.pivot(index=var_1, columns=var_2, values=0).fillna(0)
    test_stat, p_val, dof, expected_table = chi2_contingency(var_contingency_table)
    print('vars {%s} and {%s} have difference: test-stat=%.3f (p=%.3E)'%(var_1, var_2, test_stat, p_val))

vars {es_bin} and {integrated_verb_pct_bin} have difference: test-stat=26.399 (p=2.629E-05)
vars {es_bin} and {description_location_region} have difference: test-stat=11.490 (p=7.436E-02)
vars {es_bin} and {latin_american_media_pct_bin} have difference: test-stat=63.340 (p=5.755E-13)
vars {integrated_verb_pct_bin} and {description_location_region} have difference: test-stat=16.596 (p=1.089E-02)
vars {integrated_verb_pct_bin} and {latin_american_media_pct_bin} have difference: test-stat=2.527 (p=6.399E-01)
vars {description_location_region} and {latin_american_media_pct_bin} have difference: test-stat=14.843 (p=2.151E-02)


OK! Significant correlations (p < 0.001) include:
- Language x integrated verb use
- Language x media sharing

Let's confirm the scalar vars too with Spearman correlation.

In [35]:
from scipy.stats import spearmanr
# Add a smoothed log-transformed copy of each scalar variable to valid_data,
# then report the Spearman correlation for every pair of transformed columns.
# NOTE: Spearman's coefficient is rank-based, so the (strictly increasing) log
# transform does not change the reported coefficients.
scalar_vars = ['es', 'integrated_verb_pct', 'latin_american_media_pct']
smooth_val = 1e-2  # additive offset so that zero values stay finite under log
for scalar_var in scalar_vars:
    smoothed_vals = valid_data.loc[:, scalar_var].astype(float) + smooth_val
    valid_data = valid_data.assign(**{f'{scalar_var}_log': np.log(smoothed_vals)})
log_scalar_vars = [f'{x}_log' for x in scalar_vars]
N = len(scalar_vars)
var_combos = [
    (first_var, second_var)
    for first_idx, first_var in enumerate(log_scalar_vars)
    for second_var in log_scalar_vars[first_idx + 1:]
]
for var_1, var_2 in var_combos:
    correl, p_val = spearmanr(valid_data.loc[:, var_1], valid_data.loc[:, var_2])
    print('vars {%s} and {%s} have correl=%.3f (p=%.3E)'%(var_1, var_2, correl, p_val))

vars {es_log} and {integrated_verb_pct_log} have correl=-0.146 (p=5.619E-17)
vars {es_log} and {latin_american_media_pct_log} have correl=0.185 (p=1.219E-26)
vars {integrated_verb_pct_log} and {latin_american_media_pct_log} have correl=0.000 (p=9.810E-01)


OK! Language has statistically significant, though weak (|ρ| < 0.2), correlations with the other variables.

### Correlations with variable subsets

Does this hold up when we consider just a subset of the variables?

In [36]:
# Now restrict to a subset of the social variables (media sharing is left
# out): keep only the rows in which every remaining variable is non-empty.
valid_data = author_social_data.copy()
social_vars = ['es_bin', 'integrated_verb_pct_bin', 'description_location_region']
for social_var in social_vars:
    has_social_var = valid_data.loc[:, social_var].ne('')
    valid_data = valid_data.loc[has_social_var]
# rows kept / rows available
print('%d/%d data'%(len(valid_data), len(author_social_data)))

28775/87610 data


In [37]:
from scipy.stats import chi2_contingency
# Chi-squared test of independence for every unordered pair of the social
# variables in the current subset.
N = len(social_vars)
var_combos = [
    (first_var, second_var)
    for first_idx, first_var in enumerate(social_vars)
    for second_var in social_vars[first_idx + 1:]
]
for var_1, var_2 in var_combos:
    # Row count per observed (var_1, var_2) value pair; after reset_index()
    # the count column is named 0.
    var_combo_counts = valid_data.groupby([var_1, var_2]).size().reset_index()
    # Reshape into a var_1 x var_2 contingency table; pairs that never occur
    # get a count of 0.
    var_contingency_table = var_combo_counts.pivot(index=var_1, columns=var_2, values=0).fillna(0)
    test_stat, p_val, dof, expected_table = chi2_contingency(var_contingency_table)
    print('vars {%s} and {%s} have difference: test-stat=%.3f (p=%.3E)'%(var_1, var_2, test_stat, p_val))

vars {es_bin} and {integrated_verb_pct_bin} have difference: test-stat=8.447 (p=7.652E-02)
vars {es_bin} and {description_location_region} have difference: test-stat=34.696 (p=4.935E-06)
vars {integrated_verb_pct_bin} and {description_location_region} have difference: test-stat=7.592 (p=2.696E-01)


Even with just the "clean" variables, one significant correlation remains:
- Language x location

This all suggests that we should add interaction terms for language and the other variables, or try a residual regression where we first predict using language and then model the residuals with the other variables.