# Research Data for paper "How accurate are developers' emotions inferred from social media?"

## Statistics - Psychologists versus Participants

This notebook contains the source code for the statistical comparison between the manual analyses made by psychologists and by participants on the dataset used in the paper "How accurate are developers' emotions inferred from social media?".

To protect participants' privacy, we do not present any identifiable data here, such as participants' Twitter accounts or the text of their posts.

In [128]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import random
import statistics
from collections import Counter
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

## DATA READING

The JSON file 'alldata.json' contains the data used in this Notebook.

In [129]:
# Load 'alldata.json' straight from the paper's GitHub repository (needs network access).
data = pd.read_json(r'https://raw.githubusercontent.com/leosilva/jss_2021_paper/master/alldata.json')

In [130]:
# Preview the loaded frame; each column is one record and the rows are its fields.
data.head()

Unnamed: 0,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44
participant_id,1,2,3,,4,5,6,,7,8,,9,10,11,12,13,14,15,16
created_at,2009-03-11 12:51:34,2008-06-17 14:52:40,2017-11-05 01:21:27,2009-09-27 00:22:22,2015-08-18 09:44:42,2007-12-02 18:59:26,2018-08-28 15:53:30,2012-03-19 11:54:15,2008-01-09 20:56:00,2009-08-02 15:11:05,2008-12-21 15:24:53,2010-07-30 18:14:56,2009-01-04 16:46:32,2013-10-30 12:34:50,2018-08-07 07:54:08,2010-01-29 13:40:07,2018-09-15 09:25:06,2009-08-28 21:11:31,2010-03-04 19:05:54
tweets,"[[102566, 2021-03-31T19:13:28, 26, -0.45880000...","[[108111, 2021-03-31T20:30:21, 27, -0.0258, ne...","[[113189, 2021-03-31T21:52:37, 28, 0.0, neu, n...","[[119977, 2021-03-31T21:31:58, 29, 0.5859, pos...","[[123576, 2021-03-31T09:36:24, 30, 0.610300000...","[[126748, 2021-03-31T22:18:50, 31, 0.248100000...","[[139947, 2021-03-31T23:34:40, 32, 0.198400000...","[[142861, 2021-03-31T23:12:55, 33, -0.34, neg,...","[[154027, 2021-03-31T23:30:22, 34, 0.7088, pos...","[[157011, 2021-03-31T19:13:22, 35, -0.40190000...","[[160069, 2021-03-30T12:47:49, 36, 0.1027, pos...","[[163576, 2021-03-31T18:48:36, 37, 0.296, pos,...","[[171161, 2021-03-31T23:11:12, 38, 0.7096, pos...","[[176593, 2021-03-31T16:06:49, 39, -0.37160000...","[[179579, 2021-03-31T18:09:10, 40, 0.0, neu, n...","[[184846, 2021-03-31T17:53:40, 41, -0.2023, ne...","[[191406, 2021-03-31T07:12:32, 42, 0.0, neu, n...","[[196544, 2021-03-31T23:47:03, 43, 0.6369, pos...","[[224369, 2021-03-31T23:57:17, 44, -0.70890000..."
bigfive,"[[4, 37, 43, 36, 24, 26, 26]]","[[3, 41, 43, 34, 22, 23, 27]]","[[2, 43, 46, 31, 24, 15, 28]]",[],"[[9, 17, 33, 33, 22, 39, 30]]","[[1, 22, 33, 31, 20, 8, 31]]","[[14, 39, 49, 39, 28, 15, 32]]",[],"[[6, 35, 35, 32, 26, 16, 34]]","[[5, 29, 45, 34, 35, 29, 35]]",[],"[[7, 40, 46, 38, 22, 21, 37]]","[[8, 32, 46, 19, 18, 28, 38]]","[[10, 31, 36, 25, 33, 21, 39]]","[[11, 36, 45, 31, 31, 19, 40]]","[[12, 29, 41, 28, 22, 28, 41]]","[[13, 29, 43, 27, 23, 35, 42]]","[[15, 29, 45, 36, 27, 32, 43]]","[[16, 30, 35, 24, 20, 32, 44]]"


In [131]:
# First-round labels for dataset 1. The columns are renamed to the tweet id
# followed by one 5-category label column per psychologist (PSY1..PSY3).
data_psi = pd.read_csv(r'https://raw.githubusercontent.com/leosilva/jss_2021_paper/master/analyzed_tweets_dataset1_first_round.csv')
data_psi.columns = ['id',
                    'Classification_PSY1_5_cat',
                    'Classification_PSY2_5_cat',
                    'Classification_PSY3_5_cat']
data_psi.head()

Unnamed: 0,id,Classification_PSY1_5_cat,Classification_PSY2_5_cat,Classification_PSY3_5_cat
0,102600,Weak Negative,Weak Positive,Neutral
1,102895,Weak Negative,Strong Negative,Strong Negative
2,103085,Strong Negative,Strong Negative,Strong Negative
3,103092,Weak Positive,Weak Positive,Weak Positive
4,103200,Weak Positive,Weak Negative,Strong Negative


In [132]:
# Second-round labels: a single 5-category label per tweet id.
# NOTE(review): the variable name ("divergentes resolvidos") suggests these are
# the tweets whose first-round labels diverged -- confirm against the paper.
data_psi_divergentes_resolvidos = pd.read_csv(r'https://raw.githubusercontent.com/leosilva/jss_2021_paper/master/analyzed_tweets_second_round.csv')
data_psi_divergentes_resolvidos.columns = ['id', 'Classification_PSY_5_cat']
data_psi_divergentes_resolvidos.head()

Unnamed: 0,id,Classification_PSY_5_cat
0,102600,Neutral
1,103241,Weak Negative
2,103875,Weak Positive
3,104229,Neutral
4,104337,Weak Negative


In [133]:
# First-round labels for dataset 2, renamed to the same column layout as
# data_psi so the two frames can be stacked later.
data_psi_restantes = pd.read_csv(r'https://raw.githubusercontent.com/leosilva/jss_2021_paper/master/analyzed_tweets_dataset2_first_round.csv')
data_psi_restantes.columns = ['id',
                    'Classification_PSY1_5_cat',
                    'Classification_PSY2_5_cat',
                    'Classification_PSY3_5_cat']
data_psi_restantes.head()

Unnamed: 0,id,Classification_PSY1_5_cat,Classification_PSY2_5_cat,Classification_PSY3_5_cat
0,140009,Weak Negative,Weak Negative,Weak Positive
1,140363,Neutral,Neutral,Neutral
2,140622,Weak Positive,Weak Positive,Weak Positive
3,140710,Neutral,Neutral,Weak Positive
4,141162,Neutral,Weak Positive,Weak Positive


In [134]:
# Labels assigned by the participants: tweet id, 5-category label
# ('classification') and the participant's id ('id_usuario').
data_participants = pd.read_csv(r'https://raw.githubusercontent.com/leosilva/jss_2021_paper/master/analyzed_tweets_by_participants.csv')
data_participants.head()

Unnamed: 0,id,classification,id_usuario
0,197052,Weak Positive,43
1,197097,Neutral,43
2,198955,Neutral,43
3,200154,Weak Positive,43
4,200645,Weak Positive,43


## Dataframe preparation - Psychologists

In [135]:
def reduce_polatiries_to_3_categories(df, column):
    """Collapse a 5-category polarity column into 3 categories, in place.

    Reads ``'<column>_5_cat'`` and writes ``'<column>_3_cat'`` on ``df``:
    'Weak Positive' / 'Strong Positive' become 'pos', 'Neutral' becomes 'neu',
    and 'Weak Negative' / 'Strong Negative' become 'neg'. Any other value
    (including missing values) becomes the string '0'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``'<column>_5_cat'`` column. It is modified in place.
    column : str
        Column name prefix, without the ``_5_cat`` / ``_3_cat`` suffix.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``'<column>_3_cat'`` column added or
        overwritten.
    """
    source = df['{}_5_cat'.format(column)]

    reduced_label = {
        'Weak Positive': 'pos',
        'Strong Positive': 'pos',
        'Neutral': 'neu',
        'Weak Negative': 'neg',
        'Strong Negative': 'neg',
    }

    conditions = [source == label for label in reduced_label]
    values = list(reduced_label.values())

    # The default must be a string. Relying on np.select's implicit integer
    # default (0) next to string choices raises a TypeError on recent NumPy
    # releases; '0' is what older releases produced for unmatched rows.
    df['{}_3_cat'.format(column)] = np.select(conditions, values, default='0')

    return df

In [136]:
# Add a 3-category column next to each psychologist's 5-category column.
for psy_column in ('Classification_PSY1', 'Classification_PSY2', 'Classification_PSY3'):
    data_psi = reduce_polatiries_to_3_categories(data_psi, psy_column)

data_psi.head()

Unnamed: 0,id,Classification_PSY1_5_cat,Classification_PSY2_5_cat,Classification_PSY3_5_cat,Classification_PSY1_3_cat,Classification_PSY2_3_cat,Classification_PSY3_3_cat
0,102600,Weak Negative,Weak Positive,Neutral,neg,pos,neu
1,102895,Weak Negative,Strong Negative,Strong Negative,neg,neg,neg
2,103085,Strong Negative,Strong Negative,Strong Negative,neg,neg,neg
3,103092,Weak Positive,Weak Positive,Weak Positive,pos,pos,pos
4,103200,Weak Positive,Weak Negative,Strong Negative,pos,neg,neg


In [137]:
# Derive 'Classification_PSY_3_cat' from the second-round 5-category label.
data_psi_divergentes_resolvidos = reduce_polatiries_to_3_categories(data_psi_divergentes_resolvidos, 'Classification_PSY')
data_psi_divergentes_resolvidos.head()

Unnamed: 0,id,Classification_PSY_5_cat,Classification_PSY_3_cat
0,102600,Neutral,neu
1,103241,Weak Negative,neg
2,103875,Weak Positive,pos
3,104229,Neutral,neu
4,104337,Weak Negative,neg


In [138]:
# Same 5 -> 3 category reduction, applied to the dataset-2 labels.
for psy_column in ('Classification_PSY1', 'Classification_PSY2', 'Classification_PSY3'):
    data_psi_restantes = reduce_polatiries_to_3_categories(data_psi_restantes, psy_column)

data_psi_restantes.head()

Unnamed: 0,id,Classification_PSY1_5_cat,Classification_PSY2_5_cat,Classification_PSY3_5_cat,Classification_PSY1_3_cat,Classification_PSY2_3_cat,Classification_PSY3_3_cat
0,140009,Weak Negative,Weak Negative,Weak Positive,neg,neg,pos
1,140363,Neutral,Neutral,Neutral,neu,neu,neu
2,140622,Weak Positive,Weak Positive,Weak Positive,pos,pos,pos
3,140710,Neutral,Neutral,Weak Positive,neu,neu,pos
4,141162,Neutral,Weak Positive,Weak Positive,neu,pos,pos


In [139]:
# Stack the dataset-2 labels under the dataset-1 labels.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; like
# append, it keeps each frame's original row index.
# Caution: re-running this cell without re-running the cells above stacks
# data_psi_restantes a second time.
print(len(data_psi))
data_psi = pd.concat([data_psi, data_psi_restantes])
print(len(data_psi))

455
560


In [140]:
# Repeats the reduction already applied to this frame in an earlier cell. The
# function recomputes the 3-category column from the 5-category one, so running
# it again leaves the data unchanged.
data_psi_divergentes_resolvidos = reduce_polatiries_to_3_categories(data_psi_divergentes_resolvidos, 'Classification_PSY')
data_psi_divergentes_resolvidos.head()

Unnamed: 0,id,Classification_PSY_5_cat,Classification_PSY_3_cat
0,102600,Neutral,neu
1,103241,Weak Negative,neg
2,103875,Weak Positive,pos
3,104229,Neutral,neu
4,104337,Weak Negative,neg


In [141]:
# Row count of the combined psychologists' frame.
print(len(data_psi))

560


## Dataframe preparation - Participants

In [142]:
# Collapse the participants' 5-category labels into 'pos' / 'neu' / 'neg'.
conditions = [
    (data_participants['classification'] == 'Weak Positive'),
    (data_participants['classification'] == 'Strong Positive'),
    (data_participants['classification'] == 'Neutral'),
    (data_participants['classification'] == 'Weak Negative'),
    (data_participants['classification'] == 'Strong Negative')
    ]

values = ['pos', 'pos', 'neu', 'neg', 'neg']

# The default must be a string. Relying on np.select's implicit integer default
# (0) next to string choices raises a TypeError on recent NumPy releases; '0'
# is what older releases produced for unmatched rows.
data_participants['classification_3_cat'] = np.select(conditions, values, default='0')

print(len(data_participants))

data_participants.head()

455


Unnamed: 0,id,classification,id_usuario,classification_3_cat
0,197052,Weak Positive,43,pos
1,197097,Neutral,43,neu
2,198955,Neutral,43,neu
3,200154,Weak Positive,43,pos
4,200645,Weak Positive,43,pos


In [143]:
# Keep the tweet lists of the records whose 'bigfive' entry is non-empty ...
tweets = [data[record]['tweets'] for record in data if data[record]['bigfive']]

# ... and flatten them into one list of tweets.
final_tweets = [tweet for record_tweets in tweets for tweet in record_tweets]

In [144]:
# Total number of tweets kept after flattening.
print(len(final_tweets))

91632


In [145]:
def fill_manual_polarities_3_cat(df):
    """Write the psychologists' majority label into 'manual_polarity_3_cat'.

    For every row, the three labels in 'Classification_PSY1_3_cat',
    'Classification_PSY2_3_cat' and 'Classification_PSY3_3_cat' are counted.
    A label given by at least two of the three becomes the row's
    'manual_polarity_3_cat'. Rows without a majority keep their current value
    in that column, or NaN when the column does not exist yet.

    Each row is decided from its own labels in a single pass, so rows that
    share an id no longer inherit the first row's decision. The output column
    is always created, even when no row has a majority.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the three ``Classification_PSY<n>_3_cat`` columns. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with 'manual_polarity_3_cat' filled in.
    """
    rater_columns = [
        'Classification_PSY1_3_cat',
        'Classification_PSY2_3_cat',
        'Classification_PSY3_3_cat',
    ]
    target = 'manual_polarity_3_cat'

    # Value used when a row has no majority: whatever is already stored.
    if target in df.columns:
        fallback = df[target].tolist()
    else:
        fallback = [np.nan] * len(df)

    consensus = []
    row_labels = zip(*(df[name] for name in rater_columns))
    for current, labels in zip(fallback, row_labels):
        # With three votes, at most one label can reach two.
        label, votes = Counter(labels).most_common(1)[0]
        consensus.append(label if votes >= 2 else current)

    # Assign positionally (plain array, not a Series) so repeated index labels
    # in df cannot trigger index alignment.
    df[target] = np.array(consensus, dtype=object)

    return df

In [146]:
def fill_nan_values_by_consolidated_polarities(df, df_consolidated):
    """Copy consolidated labels into ``df['manual_polarity_3_cat']`` by tweet id.

    Every id listed in ``df_consolidated`` has its 'Classification_PSY_3_cat'
    value written to the matching rows of ``df``, whether or not those rows
    already hold a value. When an id appears more than once in
    ``df_consolidated``, its first label is used. ``df`` is modified in place
    and returned.
    """
    # First label seen for each consolidated id.
    first_label = {}
    for tweet_id, label in zip(df_consolidated['id'], df_consolidated['Classification_PSY_3_cat']):
        first_label.setdefault(tweet_id, label)

    for tweet_id, label in first_label.items():
        matching_rows = df['id'] == tweet_id
        df.loc[matching_rows, 'manual_polarity_3_cat'] = label

    return df

In [147]:
def conf_matrix(x, col1, col2):
    """Name the confusion-matrix cell for one row of +1 / -1 codes.

    ``x[col1]`` is treated as the reference value and ``x[col2]`` as the value
    compared against it:

    * (1, 1) -> 'TP'
    * (1, -1) -> 'FN'
    * (-1, 1) -> 'FP'
    * (-1, -1) -> 'TN'

    Any other combination returns the integer 0.
    """
    reference = x[col1]
    if reference == 1:
        on_positive, on_negative = 'TP', 'FN'
    elif reference == -1:
        on_positive, on_negative = 'FP', 'TN'
    else:
        return 0

    # The second column is only read once the first holds a +1 / -1 code.
    compared = x[col2]
    if compared == 1:
        return on_positive
    if compared == -1:
        return on_negative
    return 0

In [148]:
# Empty results table: an identifier column, the accuracy, then one
# precision / recall / F1 group per polarity.
rdf = pd.DataFrame(
    columns=(
        ['Sentiment Lexicon', 'Accuracy']
        + ['Precision - Positive', 'Recall - Positive', 'F1-Score - Positive']
        + ['Precision - Negative', 'Recall - Negative', 'F1-Score - Negative']
    )
)

In [149]:
# Preview the combined psychologists' frame before the consensus column is added.
data_psi.head()

Unnamed: 0,id,Classification_PSY1_5_cat,Classification_PSY2_5_cat,Classification_PSY3_5_cat,Classification_PSY1_3_cat,Classification_PSY2_3_cat,Classification_PSY3_3_cat
0,102600,Weak Negative,Weak Positive,Neutral,neg,pos,neu
1,102895,Weak Negative,Strong Negative,Strong Negative,neg,neg,neg
2,103085,Strong Negative,Strong Negative,Strong Negative,neg,neg,neg
3,103092,Weak Positive,Weak Positive,Weak Positive,pos,pos,pos
4,103200,Weak Positive,Weak Negative,Strong Negative,pos,neg,neg


In [150]:
# Row count, followed by each psychologist's 3-category label distribution.
print(len(data_psi))
for label_column in ('Classification_PSY1_3_cat',
                     'Classification_PSY2_3_cat',
                     'Classification_PSY3_3_cat'):
    print(data_psi[label_column].value_counts())

560
pos    302
neg    132
neu    126
Name: Classification_PSY1_3_cat, dtype: int64
pos    256
neg    206
neu     98
Name: Classification_PSY2_3_cat, dtype: int64
neu    328
pos    150
neg     82
Name: Classification_PSY3_3_cat, dtype: int64


In [151]:
# Build the psychologists' consensus label: first the majority vote of the
# three first-round labels, then the second-round label for every id listed in
# data_psi_divergentes_resolvidos.
# Both helpers modify and return the frame they receive, so df_data_psi_3_cat
# is the same object as data_psi.
df_data_psi_3_cat = fill_manual_polarities_3_cat(data_psi)
df_data_psi_3_cat = fill_nan_values_by_consolidated_polarities(df_data_psi_3_cat, data_psi_divergentes_resolvidos)
df_data_psi_3_cat.head()

Unnamed: 0,id,Classification_PSY1_5_cat,Classification_PSY2_5_cat,Classification_PSY3_5_cat,Classification_PSY1_3_cat,Classification_PSY2_3_cat,Classification_PSY3_3_cat,manual_polarity_3_cat
0,102600,Weak Negative,Weak Positive,Neutral,neg,pos,neu,neu
1,102895,Weak Negative,Strong Negative,Strong Negative,neg,neg,neg,neg
2,103085,Strong Negative,Strong Negative,Strong Negative,neg,neg,neg,neg
3,103092,Weak Positive,Weak Positive,Weak Positive,pos,pos,pos,pos
4,103200,Weak Positive,Weak Negative,Strong Negative,pos,neg,neg,neg


In [152]:
def create_psycho_versus_participants_df(df, category, participants=None):
    """Pair each participant label with the psychologists' label for the same tweet.

    Parameters
    ----------
    df : pandas.DataFrame
        Psychologists' frame with an 'id' column and the consensus column
        'manual_polarity_3_cat' (``category == 3``) or 'manual_polarity_5_cat'
        (``category == 5``).
    category : int
        Number of polarity categories to compare: 3 or 5.
    participants : pandas.DataFrame, optional
        Participants' frame with an 'id' column and 'classification_3_cat'
        (3 categories) or 'classification' (5 categories). Defaults to the
        notebook-level ``data_participants`` frame.

    Returns
    -------
    pandas.DataFrame
        Columns 'id', 'manual_polarity_<n>_cat' and
        'participants_polarity_<n>_cat', in the participants' row order.
        Participant ids that do not occur in ``df`` are left out. When an id
        occurs several times in one frame, the first label stored for it is
        used, and one output row is produced per matching row of ``df``.

    Raises
    ------
    ValueError
        If ``category`` is neither 3 nor 5.
    """
    if category == 3:
        manual_column = 'manual_polarity_3_cat'
        participant_source = 'classification_3_cat'
        participant_column = 'participants_polarity_3_cat'
    elif category == 5:
        manual_column = 'manual_polarity_5_cat'
        # The participants' 5-category labels live in 'classification';
        # 'classification_3_cat' holds the reduced pos/neu/neg labels and
        # cannot be compared with 5-category psychologist labels.
        participant_source = 'classification'
        participant_column = 'participants_polarity_5_cat'
    else:
        raise ValueError('category must be 3 or 5, got {!r}'.format(category))

    if participants is None:
        participants = data_participants

    # One pass over each frame replaces the nested scan and per-match query.
    manual_by_id = {}
    matches_by_id = {}
    for tweet_id, label in zip(df['id'], df[manual_column]):
        manual_by_id.setdefault(tweet_id, label)
        matches_by_id[tweet_id] = matches_by_id.get(tweet_id, 0) + 1

    participant_by_id = {}
    for tweet_id, label in zip(participants['id'], participants[participant_source]):
        participant_by_id.setdefault(tweet_id, label)

    paired = {
        'id': [],
        manual_column: [],
        participant_column: []
    }

    for tweet_id in participants['id']:
        for _ in range(matches_by_id.get(tweet_id, 0)):
            paired['id'].append(tweet_id)
            paired[manual_column].append(manual_by_id[tweet_id])
            paired[participant_column].append(participant_by_id[tweet_id])

    return pd.DataFrame(paired)

In [153]:
def perform_statistics_between_psycho_participants(df, cat, prefix):
    """Compute accuracy, precision, recall and F1 between the two label sets.

    The psychologists' consensus ('manual_polarity_3_cat') is the reference
    and the participants' label ('participants_polarity_3_cat') is compared
    against it. Labels are coded 'pos' -> 1, 'neg' -> -1, anything else -> 0.
    Rows where either side is coded 0 fall outside the TP/FN/FP/TN cells and
    are ignored by every metric.

    "Positive" metrics treat 'pos' as the target class and "Negative" metrics
    treat 'neg' as the target class. A ratio whose denominator is zero is
    reported as 0.0 instead of raising ZeroDivisionError.

    Side effect: adds the columns 'participants_prediction',
    'psychologists_prediction', 'accuracy' and 'conf_matrix' to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'manual_polarity_3_cat' and 'participants_polarity_3_cat'.
    cat, prefix
        Unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row with the columns 'Accuracy', 'Precision - Positive',
        'Recall - Positive', 'F1-Score - Positive', 'Precision - Negative',
        'Recall - Negative', 'F1-Score - Negative' and 'F1-Score - Average'.
    """
    def _ratio(numerator, denominator):
        # An undefined ratio is reported as 0.0 rather than crashing the cell.
        return numerator / denominator if denominator else 0.0

    def _f1(precision, recall):
        return _ratio(2 * precision * recall, precision + recall)

    def _polarity_code(label):
        if label == 'neg':
            return -1
        return 1 if label == 'pos' else 0

    df['participants_prediction'] = df['participants_polarity_3_cat'].apply(_polarity_code)
    df['psychologists_prediction'] = df['manual_polarity_3_cat'].apply(_polarity_code)

    # Per-row agreement flag; kept as a column on df, not used by the metrics.
    df['accuracy'] = (df['participants_prediction'] == df['psychologists_prediction']).astype(int)

    df['conf_matrix'] = df.apply(
        lambda row: conf_matrix(row, 'psychologists_prediction', 'participants_prediction'),
        axis=1)

    cell_counts = df['conf_matrix'].value_counts().to_dict()
    tp = cell_counts.get('TP', 0)
    tn = cell_counts.get('TN', 0)
    fp = cell_counts.get('FP', 0)
    fn = cell_counts.get('FN', 0)

    accuracy = _ratio(tp + tn, tp + tn + fp + fn)

    precision_pos = _ratio(tp, tp + fp)
    recall_pos = _ratio(tp, tp + fn)
    f1_score_pos = _f1(precision_pos, recall_pos)

    # With 'neg' as the target class, TN plays the role of the true positives.
    precision_neg = _ratio(tn, tn + fn)
    recall_neg = _ratio(tn, tn + fp)
    f1_score_neg = _f1(precision_neg, recall_neg)

    d = {
        'Accuracy': accuracy,
        'Precision - Positive': precision_pos,
        'Recall - Positive': recall_pos,
        'F1-Score - Positive': f1_score_pos,
        'Precision - Negative': precision_neg,
        'Recall - Negative': recall_neg,
        'F1-Score - Negative': f1_score_neg,
        'F1-Score - Average': (f1_score_pos + f1_score_neg) / 2
    }

    return pd.DataFrame.from_records([d])

In [154]:
# Pair the psychologists' consensus with the participants' labels, then compute
# the agreement metrics on the 3-category labels.
# Note: perform_statistics_between_psycho_participants also adds helper columns
# to df_for_analysis.
df_for_analysis = create_psycho_versus_participants_df(df_data_psi_3_cat, 3)
print(len(df_for_analysis))
pd_stats_psy_part = perform_statistics_between_psycho_participants(df_for_analysis, 3, '')
pd_stats_psy_part

455


Unnamed: 0,Accuracy,Precision - Positive,Recall - Positive,F1-Score - Positive,Precision - Negative,Recall - Negative,F1-Score - Negative,F1-Score - Average
0,0.865169,0.862857,0.92638,0.893491,0.869565,0.769231,0.816327,0.854909


In [155]:
# Label distribution on each side of the comparison.
for label_column in ('manual_polarity_3_cat', 'participants_polarity_3_cat'):
    print(df_for_analysis[label_column].value_counts())

pos    206
neg    127
neu    122
Name: manual_polarity_3_cat, dtype: int64
pos    210
neu    139
neg    106
Name: participants_polarity_3_cat, dtype: int64


In [156]:
# Cross-check the hand-computed metrics with scikit-learn.
# The paired frame is rebuilt so it only holds the id and the two label columns.
df_for_analysis = create_psycho_versus_participants_df(df_data_psi_3_cat, 3)

# Drop every row where either side is neutral, leaving a binary pos/neg comparison.
df_for_analysis.drop(df_for_analysis.loc[df_for_analysis['manual_polarity_3_cat']=='neu'].index, inplace=True)
df_for_analysis.drop(df_for_analysis.loc[df_for_analysis['participants_polarity_3_cat']=='neu'].index, inplace=True)

# NOTE(review): the participants' labels are passed first to the scikit-learn
# functions (as y_true) and the psychologists' labels second (as y_pred).
# perform_statistics_between_psycho_participants uses the opposite orientation
# (psychologists as the reference), which is why the per-class precision and
# recall printed below are swapped relative to that function's table, while
# accuracy and F1 agree. Confirm which orientation the paper reports.
test = df_for_analysis['participants_polarity_3_cat']
pred = df_for_analysis['manual_polarity_3_cat']
confusion = confusion_matrix(test, pred)

print('Confusion Matrix\n')
print(confusion)

print('\nAccuracy: {:.6f}\n'.format(accuracy_score(test, pred)))

print('Micro Precision: {:.6f}'.format(precision_score(test, pred, average='micro')))
print('Micro Recall: {:.6f}'.format(recall_score(test, pred, average='micro')))
print('Micro F1-score: {:.6f}\n'.format(f1_score(test, pred, average='micro')))

print('Macro Precision: {:.6f}'.format(precision_score(test, pred, average='macro')))
print('Macro Recall: {:.6f}'.format(recall_score(test, pred, average='macro')))
print('Macro F1-score: {:.6f}\n'.format(f1_score(test, pred, average='macro')))

print('Weighted Precision: {:.6f}'.format(precision_score(test, pred, average='weighted')))
print('Weighted Recall: {:.6f}'.format(recall_score(test, pred, average='weighted')))
print('Weighted F1-score: {:.6f}'.format(f1_score(test, pred, average='weighted')))

print('\nClassification Report\n')
print(classification_report(test, pred, digits=6))

Confusion Matrix

[[ 80  12]
 [ 24 151]]

Accuracy: 0.865169

Micro Precision: 0.865169
Micro Recall: 0.865169
Micro F1-score: 0.865169

Macro Precision: 0.847806
Macro Recall: 0.866211
Macro F1-score: 0.854909

Weighted Precision: 0.872231
Weighted Recall: 0.865169
Weighted F1-score: 0.866903

Classification Report

              precision    recall  f1-score   support

         neg   0.769231  0.869565  0.816327        92
         pos   0.926380  0.862857  0.893491       175

    accuracy                       0.865169       267
   macro avg   0.847806  0.866211  0.854909       267
weighted avg   0.872231  0.865169  0.866903       267

