<a href="https://colab.research.google.com/github/ipietri/w266_Final_Project/blob/master/notebooks/RtGender-Notebooks/RtGender_Annotations_Sentiment_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RtGender - Annotations - Additional EDA



In [1]:
# Choose the data directory: the shared Google Drive folder when running on
# Colab, a relative `data` folder otherwise.
try:
  from google.colab import drive
except ModuleNotFoundError:
  # google.colab is not installed, so this is not a Colab runtime.
  path = r'data'
else:
  # Mounting happens outside the `try` body so that an error raised while
  # mounting is reported instead of being mistaken for "not on Colab".
  drive.mount('/content/drive', force_remount=True)
  path = r'/content/drive/MyDrive/w266'

Mounted at /content/drive


<a id='section01'></a>
## Load Modules

In [2]:
%%capture
#!pip install transformers==3.0.2
!pip install -q transformers

In [3]:
%%capture
!pip install datasets
import datasets 
from datasets import load_dataset, Dataset, DatasetDict

In [4]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn import metrics
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
#from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import logging
logging.basicConfig(level=logging.ERROR)

In [5]:
# Select the compute device: use the GPU when torch can see one, else the CPU.

from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

<a id='section02'></a>
## Import and Review Data

In [6]:
# Load the pre-split annotation files from the data directory chosen in the
# setup cell (`path`). Building the file names from `path` keeps the local
# fallback usable instead of always reading from the Colab Drive mount.
train_df = pd.read_csv(f'{path}/annotations_train.csv')
dev_df = pd.read_csv(f'{path}/annotations_dev.csv')
test_df = pd.read_csv(f'{path}/annotations_test.csv')

# Report (rows, columns) for each split as a quick sanity check.
print('train_shape: ',train_df.shape)
print('dev_shape: ',dev_df.shape)
print('test_shape: ',test_df.shape)

train_shape:  (10746, 8)
dev_shape:  (2303, 8)
test_shape:  (2303, 8)


In [7]:
# Preview the training split; as the last expression of the cell it is shown
# with the notebook's rich DataFrame display.
train_df

Unnamed: 0.1,Unnamed: 0,source,op_gender,post_text,response_text,sentiment,relevance,label
0,3845,facebook_congress,W,Im reading the 3/1 GAO report that finds billi...,Thank you Congresswoman Bass. Keep up the grea...,Positive,ContentPoster,2
1,9743,fitocracy,M,Being followed by the famous DBJ? Quite an honor.,"Well, I am very honored you feel so honored",Positive,Content,2
2,13041,ted,W,"Penelope Boston gave a talk about Planets, exp...",Her opinions seem driven by wishful thinking. ...,Mixed,Content,1
3,4265,facebook_congress,W,Congress must act to help the 41 million Ameri...,There's no other way out of the enormity excep...,Positive,Content,2
4,13145,ted,W,"Pardis Sabeti gave a talk about Africa, big pr...",What were the benefits of the larger community...,Mixed,Content,1
...,...,...,...,...,...,...,...,...
10741,14128,ted,M,"Bjarke Ingels gave a talk about architecture, ...",Brillant!! Ingels has a terrific future ahead ...,Positive,Content,2
10742,5589,facebook_congress,W,I was honored to meet with Eliseo Medina and F...,The Democrats view this as another way to use ...,Negative,Content,0
10743,10672,reddit,W,SO YOU LIKE STACKING CUPS?! DO WE HAVE A GREAT...,Is this real?? Well at least this kid will be ...,Mixed,Content,1
10744,4839,facebook_congress,M,Try this Brian Schatz FB bumper sticker - an e...,EH BRIAN WEA MY STICKA N WAT OBAMA STAY ON UM ...,Neutral,Irrelevant,1


Sentiment distribution

In [8]:
# Share of each sentiment class per split, with a dashed rule between splits.
for position, (split_name, split_df) in enumerate(
    (("train", train_df), ("dev", dev_df), ("test", test_df))
):
    if position > 0:
        print("-" * 20)
    class_shares = split_df["sentiment"].value_counts(normalize=True)
    print(f"{split_name} distribution: ", class_shares)

train distribution:  Positive    0.492835
Neutral     0.243067
Negative    0.162944
Mixed       0.101154
Name: sentiment, dtype: float64
--------------------
dev distribution:  Positive    0.485454
Neutral     0.245766
Negative    0.165436
Mixed       0.103343
Name: sentiment, dtype: float64
--------------------
test distribution:  Positive    0.474598
Neutral     0.248372
Negative    0.176726
Mixed       0.100304
Name: sentiment, dtype: float64


Relevance

In [9]:
# Share of each relevance category per split, with a dashed rule between splits.
for position, (split_name, split_df) in enumerate(
    (("train", train_df), ("dev", dev_df), ("test", test_df))
):
    if position > 0:
        print("-" * 20)
    category_shares = split_df["relevance"].value_counts(normalize=True)
    print(f"{split_name} distribution: ", category_shares)

train distribution:  Content          0.526987
ContentPoster    0.206216
Poster           0.149172
Irrelevant       0.117625
Name: relevance, dtype: float64
--------------------
dev distribution:  Content          0.529744
ContentPoster    0.194095
Poster           0.162831
Irrelevant       0.113330
Name: relevance, dtype: float64
--------------------
test distribution:  Content          0.531046
ContentPoster    0.201476
Poster           0.146765
Irrelevant       0.120712
Name: relevance, dtype: float64


Gender

In [10]:
# Share of each original-poster gender (op_gender) per split, with a dashed
# rule between splits.
for position, (split_name, split_df) in enumerate(
    (("train", train_df), ("dev", dev_df), ("test", test_df))
):
    if position > 0:
        print("-" * 20)
    gender_shares = split_df["op_gender"].value_counts(normalize=True)
    print(f"{split_name} distribution: ", gender_shares)

train distribution:  M    0.504095
W    0.495905
Name: op_gender, dtype: float64
--------------------
dev distribution:  M    0.517586
W    0.482414
Name: op_gender, dtype: float64
--------------------
test distribution:  M    0.514112
W    0.485888
Name: op_gender, dtype: float64


Cross Tab

In [11]:
# Relevance (rows) by sentiment (columns) on the training split.
# normalize='columns' rescales each sentiment column to sum to 1, so each cell
# is the share of that sentiment falling in the given relevance category.
pd.crosstab(
    index=train_df['relevance'],
    columns=train_df['sentiment'],
    normalize='columns',
)

sentiment,Mixed,Negative,Neutral,Positive
relevance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Content,0.595216,0.552256,0.590735,0.473187
ContentPoster,0.197792,0.154769,0.084992,0.284743
Irrelevant,0.102116,0.135922,0.261485,0.043807
Poster,0.104876,0.157053,0.062787,0.198263


In [12]:
# Original-poster gender (rows) by sentiment (columns) on the training split.
# normalize='columns' rescales each sentiment column to sum to 1, so each cell
# is the share of that sentiment attributed to the given gender.
pd.crosstab(
    index=train_df['op_gender'],
    columns=train_df['sentiment'],
    normalize='columns',
)

sentiment,Mixed,Negative,Neutral,Positive
op_gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M,0.514259,0.550543,0.536753,0.470544
W,0.485741,0.449457,0.463247,0.529456


## Determine Max Length

In [13]:
# Use the 99th percentile of response_text length as the max length.
# Length here is len() of the stringified text, i.e. a character count.
# NOTE(review): if max_length is later used as a tokenizer limit, a character
# count is not a token count; confirm against the consuming code.
response_lengths = train_df['response_text'].astype(str).map(len)
max_length = response_lengths.quantile(0.99)
print(f"99th %tile of response_text length: {max_length}")

99th %tile of response_text length: 287.0


## Create Multi-Class Label

In [14]:
# Encode the four sentiment classes as integer ids in a new `labels_4` column
# on every split. Each frame is modified in place.
sentiment_mappings_4 = {'Negative': 0, 'Neutral': 1, 'Positive': 2, 'Mixed': 3}
for split_df in (train_df, dev_df, test_df):
    split_df['labels_4'] = split_df['sentiment'].map(sentiment_mappings_4)

## Oversample

In [15]:
# Absolute class counts in the training split. The largest class sets the
# target size that each minority class is oversampled up to.
train_sentiment_counts = train_df['sentiment'].value_counts()
print("train distribution: ", train_sentiment_counts)

train distribution:  Positive    5296
Neutral     2612
Negative    1751
Mixed       1087
Name: sentiment, dtype: int64


In [16]:
# Oversample each minority sentiment class (with replacement) up to the size
# of the largest class. Sizes come from the data rather than from hard-coded
# counts, so the cell stays correct if the training file changes.
sentiment_counts = train_df['sentiment'].value_counts()
majority_count = int(sentiment_counts.max())


def _extra_rows_for(sentiment_name):
    """Return the rows to add so `sentiment_name` reaches the majority count.

    Rows are drawn with replacement from the training rows of that class,
    using a fixed random_state so the draw is repeatable.
    """
    deficit = majority_count - int(sentiment_counts[sentiment_name])
    class_rows = train_df[train_df.sentiment == sentiment_name]
    return class_rows.sample(deficit, replace=True, random_state=12345)


additional_neutral = _extra_rows_for('Neutral')
additional_mixed = _extra_rows_for('Mixed')
additional_neg = _extra_rows_for('Negative')

# Original rows come first, then the resampled rows. Index labels are kept, so
# resampled rows repeat the index of the row they were copied from.
train_data_oversampled = pd.concat([train_df,additional_neutral,additional_mixed,additional_neg], axis=0)

In [17]:
# Check the oversampled frame: print its shape, then show the class shares.
# value_counts is the last expression of the cell so the notebook renders it;
# placed before the print, its result was computed and thrown away.
print('train_shape: ',train_data_oversampled.shape)
train_data_oversampled['sentiment'].value_counts(normalize=True)

train_shape:  (21184, 9)


In [18]:
# Gender balance after oversampling.
oversampled_gender_shares = train_data_oversampled['op_gender'].value_counts(normalize=True)
print("oversampled train distribution: ", oversampled_gender_shares)


oversampled train distribution:  M    0.519165
W    0.480835
Name: op_gender, dtype: float64


In [21]:
# Save the oversampled training set and write the three splits back with
# their new `labels_4` column. File names are built from `path` (set in the
# setup cell) so the outputs land next to the inputs on Colab and locally.
# Note: the three annotation files overwrite the inputs read earlier.
train_data_oversampled.to_csv(f'{path}/train_oversampled.csv', index=False)
train_df.to_csv(f'{path}/annotations_train.csv', index=False)
dev_df.to_csv(f'{path}/annotations_dev.csv', index=False)
test_df.to_csv(f'{path}/annotations_test.csv', index=False)




In [20]:
# Preview the oversampled training frame (original rows followed by the
# resampled minority-class rows, which keep their original index labels).
train_data_oversampled

Unnamed: 0.1,Unnamed: 0,source,op_gender,post_text,response_text,sentiment,relevance,label,labels_4
0,3845,facebook_congress,W,Im reading the 3/1 GAO report that finds billi...,Thank you Congresswoman Bass. Keep up the grea...,Positive,ContentPoster,2,2
1,9743,fitocracy,M,Being followed by the famous DBJ? Quite an honor.,"Well, I am very honored you feel so honored",Positive,Content,2,2
2,13041,ted,W,"Penelope Boston gave a talk about Planets, exp...",Her opinions seem driven by wishful thinking. ...,Mixed,Content,1,3
3,4265,facebook_congress,W,Congress must act to help the 41 million Ameri...,There's no other way out of the enormity excep...,Positive,Content,2,2
4,13145,ted,W,"Pardis Sabeti gave a talk about Africa, big pr...",What were the benefits of the larger community...,Mixed,Content,1,3
...,...,...,...,...,...,...,...,...,...
1940,6561,facebook_congress,W,It was terrific to have the chance to hear fro...,"""Committed to making sure we don't lose our he...",Negative,Poster,0,0
600,3829,facebook_congress,M,Johnny will join Tim Bryant on WGAU 1340 AM (A...,Both Isakson and Chambliss voted to TABLE Rand...,Negative,Poster,0,0
1625,11700,reddit,W,It wouldnt disintegrate you.,I think a magic beam of pure light would disin...,Negative,Content,0,0
7110,5668,facebook_congress,M,Our contest for a chance to attend a special c...,I'd rather have a root canal . . .,Negative,ContentPoster,0,0
