In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the two CSV exports used by this notebook: one for games, one for users.
# Paths are relative to the notebook's working directory.
df_games = pd.read_csv('data/games_xaire.csv')
df_users = pd.read_csv('data/users_xaire.csv')

In [3]:
# Display the column labels of the users table.
df_users.columns

Index(['Unnamed: 0', 'id', 'is_robot', 'nickname', 'consent', 'status',
       'pollution_id', 'num_jugador', 'partida_id', 'num_seleccions', 'bots',
       'acabat', 'endowment_initial', 'endowment_current',
       'contributed_public_goods', 'winnings_public_goods',
       'savings_public_goods', 'coins_total', 'tickets', 'gender', 'age_range',
       'educational_level', 'economic_status', 'working_status', 'residence',
       'frame_pr1', 'frame_pr2', 'frame_pr3', 'verification_pr1',
       'verification_pr2', 'verification_pr3', 'verification_pr4',
       'enquesta_final_pr1', 'enquesta_final_pr2', 'enquesta_final_pr3',
       'enquesta_final_pr4', 'enquesta_final_pr5', 'enquesta_final_pr6',
       'enquesta_final_pr7', 'enquesta_final_pr8', 'enquesta_final_pr9',
       'enquesta_final_pr10', 'enquesta_final_pr11', 'date_tutorial',
       'date_register', 'date_creation', 'date_end', 'comment',
       'contributions', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9',
       'R

#### Frame Questions

pr1	How would you rate the quality of the air you breathe where you live?
**pr1_r1**	Bad
**pr1_r2**	Normal
**pr1_r3**	Good

pr2 Are you concerned about the air quality where you live?
**pr2_r1**	Little
**pr2_r2**	Somewhat
**pr2_r3**	A lot

pr3	Would you perform a collective action to improve air quality in Barcelona?
**pr3_r1**	Yes
**pr3_r2**	No

In [4]:
# Work on a copy so the raw users table stays untouched.
df = df_users.copy()

# Replace each frame-question answer with an integer code. A fresh LabelEncoder
# is fitted per column, so the codes of different questions are independent.
for frame_col in ('frame_pr1', 'frame_pr2', 'frame_pr3'):
    df[frame_col] = LabelEncoder().fit_transform(df[frame_col])

#### Correlation between frame questions

In [15]:
# Correlation between the encoded answers to frame questions 1 and 2.
# NOTE(review): the saved output of this cell is a TypeError about unpacking a
# numpy.float64, which this code cannot raise (nothing is unpacked here), and the
# execution count is out of sequence -- the output looks stale; re-run the cell.
corr = df['frame_pr1'].corr(df['frame_pr2'])
print(f'Correlation frame_pr1 vs. frame_pr2: {corr}')

TypeError: cannot unpack non-iterable numpy.float64 object

#### Correlation between contributions and frame questions

In [6]:

# Correlate the public-goods contribution with each encoded frame question.
for frame_col in ('frame_pr1', 'frame_pr2', 'frame_pr3'):
    corr = df['contributed_public_goods'].corr(df[frame_col])
    print(f'Correlation contribution vs. {frame_col}: {corr}')

Correlation contribution vs. frame_pr1: 0.015707581696213176
Correlation contribution vs. frame_pr2: 0.07722507263938848
Correlation contribution vs. frame_pr3: -0.013314597361641919


In [7]:
def correlation_ratio(categories, measurements):
    '''
    Correlation ratio (eta) between a categorical and a numeric variable.

    Eta is the square root of the between-category sum of squares (each
    category mean's squared distance from the overall mean, weighted by the
    category size) divided by the total sum of squares of the measurements.
    It lies in [0, 1]: 0 means the category means are all equal, 1 means the
    category fully determines the measurement.

    Parameters
    ----------
    categories : 1-D array-like (Series, Index, ndarray, list)
        Category label of each observation. Missing labels are ignored.
    measurements : 1-D array-like of numbers
        Numeric value of each observation, aligned *by position* with
        ``categories``. NaN values are ignored.

    Returns
    -------
    float
        The correlation ratio; 0.0 when there is no usable observation or
        when the measurements have no variance.

    Raises
    ------
    ValueError
        If ``categories`` and ``measurements`` differ in length.
    '''
    # Wrap in a Series so lists are accepted; factorize marks missing labels with -1.
    codes, _ = pd.factorize(pd.Series(categories))
    # Plain ndarray => strictly positional indexing, independent of any pandas index.
    values = np.asarray(measurements, dtype=float)
    if codes.shape != values.shape:
        raise ValueError('categories and measurements must have the same length')

    # Keep only complete pairs so numerator and denominator use the same samples.
    keep = (codes >= 0) & ~np.isnan(values)
    codes = codes[keep]
    values = values[keep]
    if values.size == 0:
        return 0.0

    counts = np.bincount(codes)
    sums = np.bincount(codes, weights=values)
    # A category can end up empty once NaN measurements are dropped.
    present = counts > 0
    group_sizes = counts[present]
    group_means = sums[present] / group_sizes

    grand_mean = values.mean()
    between = np.sum(group_sizes * (group_means - grand_mean) ** 2)
    total = np.sum((values - grand_mean) ** 2)
    if between == 0 or total == 0:
        return 0.0
    # Rounding can push the ratio marginally above 1; clamp to the valid range.
    return np.sqrt(min(between / total, 1.0))

In [8]:
# Correlation ratio between each raw (unencoded) frame question and the
# public-goods contribution, taken from the original users table.
for frame_col in ('frame_pr1', 'frame_pr2', 'frame_pr3'):
    corr = correlation_ratio(df_users[frame_col], df_users['contributed_public_goods'])
    print(f'Correlation contribution vs. {frame_col}: {corr}')


Correlation contribution vs. frame_pr1: 0.027644349759101607
Correlation contribution vs. frame_pr2: 0.11940111571015627
Correlation contribution vs. frame_pr3: 0.013314597361641818


#### Correlation between no2_level and frame questions

In [9]:
# Correlate the no2_level column with each encoded frame question.
for frame_col in ('frame_pr1', 'frame_pr2', 'frame_pr3'):
    corr = df['no2_level'].corr(df[frame_col])
    print(f'Correlation no2_level vs. {frame_col}: {corr}')

Correlation no2_level vs. frame_pr1: -0.21220633570254463
Correlation no2_level vs. frame_pr2: -0.04723996975342811
Correlation no2_level vs. frame_pr3: 0.026141737326561124
