In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency

In [2]:
# Directory holding the dataset CSV files (resolved against the working directory).
data_path = '../data/full_dataset/'
# Read goemotions_1.csv and preview its first rows.
train_df = pd.read_csv(f"{data_path}goemotions_1.csv")
train_df.head()

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


## Univariate Analysis

In [3]:
# Emotion columns grouped by sentiment. 'neutral' is deliberately in none of
# these lists; it is handled as its own group by the code that uses them.
pos_labels = [
    'admiration', 'approval', 'amusement', 'caring',
    'desire', 'excitement', 'gratitude', 'joy',
    'love', 'optimism', 'pride', 'relief',
]
neg_labels = [
    'anger', 'annoyance', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'fear', 'grief',
    'nervousness', 'remorse', 'sadness',
]
ambi_labels = [
    'confusion', 'curiosity', 'realization', 'surprise',
]

In [4]:
# Number of rows flagged (== 1) for every emotion column. Keys are inserted in
# the order positive -> negative -> ambiguous -> neutral.
emotion_count = {
    label: int((train_df[label] == 1).sum())
    for label in pos_labels + neg_labels + ambi_labels + ['neutral']
}

# Totals per sentiment group, obtained by adding up the per-emotion counts, so a
# row flagged for several emotions of one group contributes once per emotion.
# Key order (positive, negative, neutral, ambiguous) is kept because the pie
# chart further down passes its labels and colours positionally.
sentiment_count = {
    'positive': sum(emotion_count[label] for label in pos_labels),
    'negative': sum(emotion_count[label] for label in neg_labels),
    'neutral': emotion_count['neutral'],
    'ambiguous': sum(emotion_count[label] for label in ambi_labels),
}

In [5]:
# Show the per-emotion row counts.
emotion_count

{'admiration': 5647,
 'approval': 5928,
 'amusement': 3081,
 'caring': 1988,
 'desire': 1248,
 'excitement': 1900,
 'gratitude': 3863,
 'joy': 2607,
 'love': 2745,
 'optimism': 2887,
 'pride': 452,
 'relief': 452,
 'anger': 2589,
 'annoyance': 4443,
 'disappointment': 2771,
 'disapproval': 3774,
 'disgust': 1704,
 'embarrassment': 817,
 'fear': 1048,
 'grief': 227,
 'nervousness': 598,
 'remorse': 849,
 'sadness': 2193,
 'confusion': 2471,
 'curiosity': 3267,
 'realization': 2867,
 'surprise': 1806,
 'neutral': 18423}

In [6]:
# Show the per-sentiment totals.
sentiment_count

{'positive': 32798, 'negative': 21013, 'neutral': 18423, 'ambiguous': 10411}

### Bar Plot - All emotions

In [7]:
# Univariate distribution

# Two-column frame (Emotion, Count), most frequent emotion first.
rdf = (
    pd.DataFrame.from_dict(emotion_count, orient="index")
    .reset_index()
    .rename(columns={"index": "Emotion", 0: "Count"})
    .sort_values(by=["Count"], ascending=False)
)

In [8]:
%matplotlib tk
chart = sns.barplot(x = "Emotion", y = "Count", data = rdf)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.gcf().subplots_adjust(bottom=0.25)

### Grouped by Sentiment

In [9]:
def label_sentiment(row):
    """Map the row's 'Emotion' value to its sentiment group.

    Returns "Positive", "Negative", "Ambiguous" or "Neutral"; an emotion that
    is in none of the label lists and is not 'neutral' yields None.
    """
    emotion = row['Emotion']
    groups = (
        ("Positive", pos_labels),
        ("Negative", neg_labels),
        ("Ambiguous", ambi_labels),
    )
    for sentiment, members in groups:
        if emotion in members:
            return sentiment
    return "Neutral" if emotion == 'neutral' else None
# df is an alias of rdf (not a copy), so the new column is added to rdf too.
df = rdf
df["Sentiment"] = df.apply(label_sentiment, axis=1)

In [10]:
# Group by sentiment

# Custom colours, installed as the global seaborn palette.
colors = ["#FAEBD7", "#7080E2", "#EF4135", "#0DC9B6"]
sns.set_palette(colors)

fig, ax = plt.subplots()
# dodge=False keeps one full-width bar per emotion, coloured by its sentiment.
chart = sns.barplot(data=df, x="Emotion", y="Count", hue="Sentiment", dodge=False)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, ha='right')
fig.subplots_adjust(bottom=0.2)

### Facet Grid - Sentiment

In [10]:
# Rows for every emotion except the neutral one.
df.loc[df["Sentiment"] != "Neutral"]

Unnamed: 0,Emotion,Count,Sentiment
1,approval,5928,Positive
0,admiration,5647,Positive
13,annoyance,4443,Negative
6,gratitude,3863,Positive
15,disapproval,3774,Negative
24,curiosity,3267,Ambiguous
2,amusement,3081,Positive
9,optimism,2887,Positive
25,realization,2867,Ambiguous
14,disappointment,2771,Negative


In [18]:
# Data preprocessing for Amcharts (disabled: this whole cell is commented out and does not run)
# pos = []
# pos_cnt = 0 
# neg = []
# neg_cnt = 0
# neut = []
# neut_cnt = 0
# for idx,row in df.iterrows():
#     if row["Sentiment"] == "Positive":
#         pos.append({"name": row["Emotion"], "value": row["Count"]})
#         pos_cnt += row["Count"]
#     elif row["Sentiment"] == "Negative":
#         neg.append({ "name": row["Emotion"], "value": row["Count"]})
#         neg_cnt += row["Count"]
#     else:
#         neut.append({"name": row["Emotion"], "value": row["Count"]})
#         neut_cnt += row["Count"]

In [12]:
# Facet grid: one bar chart per sentiment, with the Neutral row filtered out.
sns.set(style="whitegrid")
g = sns.catplot(data=df[df.Sentiment != "Neutral"], x='Emotion', y='Count', hue="Sentiment",
                col='Sentiment', kind='bar', dodge=False, sharex=False,
                palette=sns.color_palette(["#7080E2", "#EF4135", "#0DC9B6"]))
# Rotate the tick labels of every facet.
for facet_ax in g.axes.flat:
    plt.setp(facet_ax.get_xticklabels(), rotation=45)
# catplot draws on a figure of its own, which is the current figure at this
# point. The earlier `fig` variable still points at the previous bar plot, so
# only the current figure is adjusted here.
plt.gcf().subplots_adjust(bottom=0.25)

### Pie chart

In [13]:
df = pd.DataFrame.from_dict(sentiment_count, orient = "index")
df.rename(columns = {"index" : "Sentiment", 0 : "Count"}, inplace = True)
%matplotlib tk
df.plot.pie(y = "Count",autopct='%1.1f%%', 
            colors = ["#7080E2", "#EF4135", "#FAEBD7","#0DC9B6"], explode = (0, 0, 0.1, 0), fontsize = 16,
            labels = ["Positive", "Negative", "Neutral", "Ambiguous"],
           legend = False, ylabel = "",)

<matplotlib.axes._subplots.AxesSubplot at 0x1a1b0a6c18>

### Grouped by 6 emotions

In [14]:
# Emotion columns grouped into five broader emotions. 'neutral' is in none of
# these lists.
anger_list = ["anger", "annoyance", "disapproval", "disgust"]
fear_list = ["fear", "nervousness"]
joy_list = [
    "joy", "amusement", "approval", "excitement",
    "gratitude", "love", "optimism", "relief",
    "pride", "admiration", "desire", "caring",
]
sadness_list = ["sadness", "disappointment", "embarrassment", "grief", "remorse"]
surprise_list = ["surprise", "realization", "confusion", "curiosity"]

In [15]:
def label_emotion_boraded(row):
    """Map the row's 'Emotion' value to its broader emotion group.

    Returns "Anger", "Fear", "Joy", "Sadness", "Surprise" or "Neutral"; an
    emotion that is in none of the group lists and is not 'neutral' yields None.
    """
    emotion = row['Emotion']
    groups = (
        ("Anger", anger_list),
        ("Fear", fear_list),
        ("Joy", joy_list),
        ("Sadness", sadness_list),
        ("Surprise", surprise_list),
    )
    for broader, members in groups:
        if emotion in members:
            return broader
    return "Neutral" if emotion == 'neutral' else None
# df is an alias of rdf (not a copy), so the new column is added to rdf too.
df = rdf
df["Broader Emotions"] = df.apply(label_emotion_boraded, axis=1)
df

Unnamed: 0,Emotion,Count,Sentiment,Broader Emotions
27,neutral,18423,Neutral,Neutral
1,approval,5928,Positive,Joy
0,admiration,5647,Positive,Joy
13,annoyance,4443,Negative,Anger
6,gratitude,3863,Positive,Joy
15,disapproval,3774,Negative,Anger
24,curiosity,3267,Ambiguous,Surprise
2,amusement,3081,Positive,Joy
9,optimism,2887,Positive,Joy
25,realization,2867,Ambiguous,Surprise


In [16]:
broader_emotion_count = df.groupby("Broader Emotions").sum().reset_index()
broader_emotion_count
%matplotlib tk
broader_emotion_count.plot.pie(y = "Count",autopct='%1.1f%%', colors = sns.color_palette("Set2"), labels = broader_emotion_count["Broader Emotions"],
                              legend = False,  ylabel = "Count of data points", fontsize = 16)
# texts[0].set_fontsize(4)

<matplotlib.axes._subplots.AxesSubplot at 0x1a1b0ccc50>

## Bivariate Analysis

In [17]:
# Re-read the CSV (this rebinds rdf, which held the emotion counts until now)
# and keep only the columns at positions 9..36.
# NOTE(review): presumably these are the 28 emotion indicator columns -- confirm against the CSV header.
rdf = pd.read_csv(f"{data_path}goemotions_1.csv")
df_subset = rdf.iloc[:, np.arange(9, 37)]
df_subset

Unnamed: 0,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
69996,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69997,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
69998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


### Pearson

In [18]:
corr = df_subset.corr()
sns.set_theme(style="white")
%matplotlib tk
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr, mask = mask, square = True, linewidths = 1, xticklabels=True, yticklabels=True, robust = True)
#             vmax = 0.25, vmin = -0.25, center = 0, )

<matplotlib.axes._subplots.AxesSubplot at 0x1a219af048>

### Chi 2

### Single Example

In [19]:
# Observed counts for a single pair of columns: admiration (rows) x amusement (columns).
df = df_subset
count_table = pd.crosstab(index=df['admiration'], columns=df['amusement'])
print(count_table)

amusement       0     1
admiration             
0           61420  2933
1            5499   148


In [20]:
def chi_sq_test(table):
    """Print a chi-square test of independence for a contingency table.

    Prints the table, each cell's share of its column total, and the result
    of scipy's chi2_contingency. Nothing is returned.

    Note: a later cell of this notebook defines another function with the
    same name, which replaces this one once it has run.
    """
    print("Results for:")
    print(str(table))

    # Share of each column total that falls in each row.
    col_sum = table.sum(axis=0)
    col_percents = table/col_sum
    print(col_percents)

    chi_square = chi2_contingency(table)
    # The result has four fields; the legend names all of them, in order.
    print("Chi-square value, p-value, degrees of freedom, expected_counts")
    print(chi_square)

    print()

# Run the verbose test on count_table and print its report.
print("Initial Chi-square:")
chi_sq_test(count_table)
print(" ")

Initial Chi-square:
Results for:
amusement       0     1
admiration             
0           61420  2933
1            5499   148
amusement          0         1
admiration                    
0           0.917826  0.951964
1           0.082174  0.048036
Chi-square value, p-value, expected_counts
(45.823583162180505, 1.293963502436272e-11, 1, array([[61520.54867143,  2832.45132857],
       [ 5398.45132857,   248.54867143]]))

 


### Correlation

In [21]:
def chi_sq_test(table):
    """Run a chi-square test of independence on a contingency table.

    Args:
        table: contingency table of observed counts, passed unchanged to
            scipy.stats.chi2_contingency.

    Returns:
        A (chi-square statistic, p-value) pair, i.e. the first two fields of
        the chi2_contingency result.
    """
    chi_square = chi2_contingency(table)
    return chi_square[0], chi_square[1]
     
# Chi-square test of independence for every pair of columns of df_subset.
df = df_subset
n_labels = df.shape[1]  # derived from the frame instead of hard-coding 28
chi2_values = np.zeros((n_labels, n_labels))
p_values = np.zeros((n_labels, n_labels))
for i in range(n_labels):
    # Swapping the two columns only transposes the contingency table, which
    # leaves the statistic unchanged (up to floating-point rounding). So each
    # unordered pair is tested once and the result mirrored.
    for j in range(i, n_labels):
        count_table = pd.crosstab(df.iloc[:, i], df.iloc[:, j])
        chi2, p_value = chi_sq_test(count_table)
        chi2_values[i, j] = chi2_values[j, i] = chi2
        p_values[i, j] = p_values[j, i] = p_value

#### Plotting Chi2 values

In [22]:
# Show the raw matrix of chi-square statistics.
chi2_values

array([[6.99865169e+04, 4.58235832e+01, 1.93915952e+02, 2.99322755e+02,
        1.28905590e-03, 4.01884010e+01, 1.15408735e+02, 9.87824208e+01,
        5.41065862e+00, 1.39682684e+02, 2.37829247e+02, 8.93561332e+01,
        3.14656559e+01, 7.62557815e-01, 5.02798719e+01, 2.23994365e+01,
        5.73780828e+00, 1.34288312e+00, 1.66697225e+00, 2.14923982e+01,
        9.17491017e+00, 7.82071509e+01, 5.91057437e+01, 9.68870383e+00,
        4.01761575e+01, 9.35665826e+01, 3.56383938e+01, 2.19259813e+03],
       [4.58235832e+01, 6.99762361e+04, 8.68498743e+01, 1.32057083e+02,
        6.45703665e+01, 4.88357281e+01, 4.51018755e+01, 3.45560863e+01,
        2.29838500e+01, 7.14747103e+01, 9.36406765e+01, 3.94014677e+01,
        1.11455765e+01, 1.67810642e+01, 2.60327674e+01, 3.52029993e+01,
        5.89450688e+00, 1.46826781e-02, 1.53847400e+01, 1.57474740e+01,
        2.37281013e+01, 1.25417649e+01, 1.29378848e+01, 2.89362662e+00,
        1.75239914e+01, 5.02564493e+01, 2.98260857e+01, 1.14976

In [25]:
# Label the rows and columns of the chi-square matrix with the column names of df_subset.
final_chi2 = pd.DataFrame(
    chi2_values,
    index=df_subset.columns.tolist(),
    columns=df_subset.columns.tolist(),
)

In [28]:

sns.set_theme(style="white")
%matplotlib tk
mask = np.zeros_like(final_chi2, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
cmap = sns.color_palette("flare", as_cmap=True)

ax = sns.heatmap(final_chi2, square = True, linewidths = 1, xticklabels=True, yticklabels=True, 
            mask = mask, cmap = cmap)
plt.gcf().subplots_adjust(bottom=0.25)

#### Plotting p values

In [29]:
# Show the raw matrix of p-values.
p_values

array([[0.00000000e+000, 1.29396350e-011, 4.44229925e-044,
        4.62722635e-067, 9.71359365e-001, 2.30613424e-010,
        6.40394290e-027, 2.81825122e-023, 2.00141634e-002,
        3.12322964e-032, 1.16959490e-053, 3.29768258e-021,
        2.02999209e-008, 3.82529245e-001, 1.33309898e-012,
        2.21438967e-006, 1.66034984e-002, 2.46526164e-001,
        1.96664568e-001, 3.55234088e-006, 2.45355410e-003,
        9.27820358e-019, 1.49422457e-014, 1.85404280e-003,
        2.32063294e-010, 3.92763218e-022, 2.37559019e-009,
        0.00000000e+000],
       [1.29396350e-011, 0.00000000e+000, 1.17079470e-020,
        1.45378248e-030, 9.31457505e-016, 2.78323327e-012,
        1.87046183e-011, 4.14144603e-009, 1.63368069e-006,
        2.80834650e-017, 3.78332028e-022, 3.45037531e-010,
        8.42322900e-004, 4.19498320e-005, 3.35671586e-007,
        2.97065683e-009, 1.51881817e-002, 9.03554721e-001,
        8.76936380e-005, 7.23845283e-005, 1.10950627e-006,
        3.97956241e-004, 3.219

In [31]:
# Label the rows and columns of the p-value matrix with the column names of df_subset.
final_p = pd.DataFrame(
    p_values,
    index=df_subset.columns.tolist(),
    columns=df_subset.columns.tolist(),
)

In [33]:
sns.set_theme(style="white")
fig, ax = plt.subplots()
%matplotlib tk
mask = np.zeros_like(final_p, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
cmap = sns.color_palette("flare", as_cmap=True)

sns.heatmap(final_p, square = True, linewidths = 1, xticklabels=True, yticklabels=True, mask = mask,cmap = cmap)
plt.gcf().subplots_adjust(bottom=0.25)