In [67]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [68]:
data_path = '../data/full_dataset/'
train_df = pd.read_csv(data_path + 'goemotions_1.csv')
train_df.head()

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


## Univariate Analysis

In [69]:
pos_labels = ['admiration','approval', 'amusement', 'caring', 'desire', 
              'excitement', 'gratitude', 'joy', 'love','optimism', 'pride', 'relief']
neg_labels = ['anger', 'annoyance', 'disappointment', 'disapproval', 'disgust',
              'embarrassment','fear', 'grief', 'nervousness', 'remorse', 'sadness']
ambi_labels = ['confusion', 'curiosity', 'realization', 'surprise']

In [70]:
emotion_count = {}
sentiment_count = {}

pos_sum = 0
for label in pos_labels:
    df = train_df[train_df[label]==1]
    emotion_count[label] = len(df)
    pos_sum += len(df)
sentiment_count['positive'] = pos_sum

neg_sum = 0
for label in neg_labels:
    df = train_df[train_df[label]==1]
    emotion_count[label] = len(df)
    neg_sum += len(df)
sentiment_count['negative'] = neg_sum

ambi_sum = 0
for label in ambi_labels:
    df = train_df[train_df[label]==1]
    emotion_count[label] = len(df)
    ambi_sum += len(df)

df = train_df[train_df['neutral']==1]
emotion_count['neutral'] = len(df)
sentiment_count['neutral']  = len(df)
sentiment_count['ambiguous'] = ambi_sum

In [71]:
emotion_count

{'admiration': 5647,
 'approval': 5928,
 'amusement': 3081,
 'caring': 1988,
 'desire': 1248,
 'excitement': 1900,
 'gratitude': 3863,
 'joy': 2607,
 'love': 2745,
 'optimism': 2887,
 'pride': 452,
 'relief': 452,
 'anger': 2589,
 'annoyance': 4443,
 'disappointment': 2771,
 'disapproval': 3774,
 'disgust': 1704,
 'embarrassment': 817,
 'fear': 1048,
 'grief': 227,
 'nervousness': 598,
 'remorse': 849,
 'sadness': 2193,
 'confusion': 2471,
 'curiosity': 3267,
 'realization': 2867,
 'surprise': 1806,
 'neutral': 18423}

In [72]:
sentiment_count

{'positive': 32798, 'negative': 21013, 'neutral': 18423, 'ambiguous': 10411}

In [73]:
# Univariate distribution

df = pd.DataFrame.from_dict(emotion_count, orient = "index").reset_index()
df.rename(columns = {"index" : "Emotion", 0 : "Count"}, inplace = True)
df.sort_values(by  = ["Count"], ascending = False, inplace = True)
# sns.countplot(emotion_count)

### Bar Plot

In [119]:
%matplotlib tk
a4_dims = (11.7, 8.27)
fig, ax = plt.subplots()
chart = sns.barplot(x = "Emotion", y = "Count", data = df)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right')
fig.subplots_adjust(bottom=0.2)
# chart.get_figure().savefig("output.png")


In [75]:
def label_sentiment(row):
    if row['Emotion'] in pos_labels:
        return "Positive"
    if row['Emotion'] in neg_labels:
        return "Negative"
    if row['Emotion'] in ambi_labels:
        return "Ambiguous"
    if row['Emotion'] == 'neutral':
        return "Neutral"
df["Sentiment"] = df.apply(lambda row: label_sentiment(row), axis = 1)

In [115]:
# Group by sentiment

# Create an array with the colors you want to use
colors = ["#FAEBD7","#7080E2", "#EF4135", "#0DC9B6"]
# Set your custom color palette
sns.set_palette(sns.color_palette(colors))

fig, ax = plt.subplots()
chart = sns.barplot(x="Emotion", y="Count", hue = "Sentiment", data=df, dodge = False)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right')
fig.subplots_adjust(bottom=0.2)

### Facet Grid

In [113]:
df[df.Sentiment != "Neutral"]

Unnamed: 0,Emotion,Count,Sentiment,Broader_emotion,Broader Emotions
1,approval,5928,Positive,Joy,Joy
0,admiration,5647,Positive,Joy,Joy
13,annoyance,4443,Negative,Anger,Anger
6,gratitude,3863,Positive,Joy,Joy
15,disapproval,3774,Negative,Anger,Anger
24,curiosity,3267,Ambiguous,Surprise,Surprise
2,amusement,3081,Positive,Joy,Joy
9,optimism,2887,Positive,Joy,Joy
25,realization,2867,Ambiguous,Surprise,Surprise
14,disappointment,2771,Negative,Sadness,Sadness


In [118]:
# Facet Grid
sns.set(style="whitegrid")
g = sns.catplot(data=df[df.Sentiment != "Neutral"], x='Emotion', y='Count', hue = "Sentiment", 
                col='Sentiment', kind='bar', dodge = False, sharex = False, palette = sns.color_palette(["#7080E2", "#EF4135", "#0DC9B6"]))
fig.subplots_adjust(bottom= 0.8)
[plt.setp(ax.get_xticklabels(), rotation=45) for ax in g.axes.flat]

[[None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 [None, None, None, None, None, None, None, None]]

### Pie chart

In [124]:
df = pd.DataFrame.from_dict(sentiment_count, orient = "index")
df.rename(columns = {"index" : "Sentiment", 0 : "Count"}, inplace = True)
%matplotlib tk
df.plot.pie(y = "Count",autopct='%1.1f%%', 
            colors = ["#7080E2", "#EF4135", "#FAEBD7","#0DC9B6"], explode = (0, 0, 0.1, 0), fontsize = 16,
            labels = ["Positive", "Negative", "Neutral", "Ambiguous"],
           legend = False, ylabel = "",)

<matplotlib.axes._subplots.AxesSubplot at 0x1a2d906438>

### Grouped into emotions

In [81]:
anger_list = [ "anger", "annoyance", "disapproval", "disgust"]
fear_list = ["fear", "nervousness"]
joy_list = ["joy", "amusement", "approval", "excitement", "gratitude","love", "optimism", "relief", "pride", "admiration", "desire", "caring"]
sadness_list = ["sadness", "disappointment", "embarrassment", "grief", "remorse"]
surprise_list = ["surprise", "realization", "confusion", "curiosity"]

In [83]:
def label_emotion_boraded(row):
    if row['Emotion'] in anger_list:
        return "Anger"
    if row['Emotion'] in fear_list:
        return "Fear"
    if row['Emotion'] in joy_list:
        return "Joy"
    if row['Emotion'] in sadness_list:
        return "Sadness"
    if row['Emotion'] in surprise_list:
        return "Surprise"
    if row['Emotion'] == 'neutral':
        return "Neutral"
df["Broader Emotions"] = df.apply(lambda row: label_emotion_boraded(row), axis = 1)
df

Unnamed: 0,Emotion,Count,Sentiment,Broader_emotion,Broader Emotions
27,neutral,18423,Neutral,Neutral,Neutral
1,approval,5928,Positive,Joy,Joy
0,admiration,5647,Positive,Joy,Joy
13,annoyance,4443,Negative,Anger,Anger
6,gratitude,3863,Positive,Joy,Joy
15,disapproval,3774,Negative,Anger,Anger
24,curiosity,3267,Ambiguous,Surprise,Surprise
2,amusement,3081,Positive,Joy,Joy
9,optimism,2887,Positive,Joy,Joy
25,realization,2867,Ambiguous,Surprise,Surprise


In [109]:
broader_emotion_count = df.groupby("Broader Emotions").sum().reset_index()
broader_emotion_count
%matplotlib tk
broader_emotion_count.plot.pie(y = "Count",autopct='%1.1f%%', colors = sns.color_palette("Set2"), labels = broader_emotion_count["Broader Emotions"],
                              legend = False,  ylabel = "Count of data points", fontsize = 16)
# texts[0].set_fontsize(4)

<matplotlib.axes._subplots.AxesSubplot at 0x1a2889d588>

## Bivariate Analysis

In [13]:
# df = titanic.pivot_table(index='embark_town', columns='age_group', values='fare', aggfunc=np.median)
train_df = pd.read_csv(data_path + 'goemotions_1.csv')
train_df = train_df.iloc[:,np.r_[9:37]]
train_df

Unnamed: 0,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
69996,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69997,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
69998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [45]:
corr = train_df.corr()
sns.set_theme(style="white")
# Generate a custom diverging colormap
%matplotlib tk
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr, mask = mask, square = True, linewidths = 1, xticklabels=True, yticklabels=True, robust = True)
#             vmax = 0.25, vmin = -0.25, center = 0, )

<matplotlib.axes._subplots.AxesSubplot at 0x1a285d6f98>