In [1]:
import json
from IPython.display import display
import pandas as pd
from itables import show
from datetime import datetime, timezone
import numpy as np
# Preprocessing
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
# Sentiment Analysis
from transformers import pipeline



nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lilynorthcutt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lilynorthcutt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lilynorthcutt/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# Test Transformers is properly installed
!python3 -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
model.safetensors: 100%|█████████████████████| 268M/268M [01:39<00:00, 2.69MB/s]
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
tokenizer_config.json: 100%|██████████████████| 48.0/48.0 [00:00<00:00, 997kB/s]
vocab.txt: 100%|█████████████████████████████| 232k/232k [00:00<00:00, 4.82MB/s]
[{'label': 'POSITIVE', 'score': 0.9998704195022583}]


# Preprocessing

## Read in Data


In [4]:
# Opening JSON file
f = open('Data/incel_small.json')

# Create a list of dictionaries
data = json.load(f)
f.close()

# Create dataframe from list keeping columns of interest
df = pd.DataFrame(data,
  columns=['author', 'created_utc', 'num_comments', 'selftext', 'subreddit', 'title']) 
  
# Convert unix to utc
df["created_utc"] = df["created_utc"].apply(lambda time: datetime.fromtimestamp(time, tz=timezone.utc).strftime('%Y-%m-%d'))


# lambda row: datetime.fromtimestamp(row["created_utc"], tz=timezone.utc).strftime('%Y-%m-%d')
show(df[["subreddit", "created_utc", "title"]].head())

subreddit,created_utc,title
Loading ITables v2.1.4 from the internet... (need help?),,


In [5]:
# A lot of posts do not have text, for many posts this is because the post text got removed or deleted
print(f'Total submissions: {len(df)} \nTotal submissions with post text: {np.size(df["selftext"].unique()) -1}')


Total submissions: 5000 
Total submissions with post text: 1857


## Data Cleaning


In [6]:
def clean_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.lower()  # Convert to lowercase
    return text


df['cleaned_title'] = df['title'].apply(lambda title: clean_text(str(title)))
df['cleaned_text'] = df['selftext'].apply(lambda text: clean_text(str(text)))
print(f'Title before cleaning: {df["title"][0]} \n\nTitle after cleaning: {df["cleaned_title"][0]}')

Title before cleaning: WONDERFUEL: Streamer accuses his girlfriend of cheating and acts in kind whilst live 

Title after cleaning: wonderfuel streamer accuses his girlfriend of cheating and acts in kind whilst live


## Data Preprocessing

In [12]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df['processed_title'] = df['cleaned_title'].apply(lambda title: preprocess_text(str(title)))
df['processed_text'] = df['cleaned_text'].apply(lambda text: preprocess_text(str(text)))
print(f'Title before token/lemmat-ization: {df["cleaned_title"][0]} \n\nTitle after: {df["processed_title"][0]}')


Title before token/lemmat-ization: wonderfuel streamer accuses his girlfriend of cheating and acts in kind whilst live 

Title after: wonderfuel streamer accuses girlfriend cheating act kind whilst live


# Sentiment Analysis


In [15]:
# Load pre-trained sentiment analysis model
sentiment_model = pipeline("text-classification",model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True)


tf_model.h5:  55%|#####4    | 147M/268M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at bhadresh-savani/distilbert-base-uncased-emotion.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


tokenizer_config.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



In [72]:
# --------- TITLE ------------
# Apply sentiment analysis
df_temp = df.iloc[0:10]
df_temp['sentiment_title'] = df_temp['processed_title'].apply(lambda x: sentiment_model(x))

# Convert list of list of labels with sentiments into dictionary where each sentiment is a key and their values are their scores
test_list = list([])
for row in range(0, len(df_temp['sentiment_title'])):
  model_temp1 = df_temp['sentiment_title'][row]
  test_dict = {}  
  for i in range(0, len(model_temp1[0])):
    model_temp2 = model_temp1[0][i]
    test_dict[model_temp2["label"]] = model_temp2["score"]
  test_list.append(test_dict)
  
df_temp['sentiment_title'] = test_list

# Make each sentiment in the dictionary its own column
df_temp.head()


# --------- POST TEXT ------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['sentiment_title'] = df_temp['processed_title'].apply(lambda x: sentiment_model(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['sentiment_title'] = test_list


Unnamed: 0,author,created_utc,num_comments,selftext,subreddit,title,cleaned_title,cleaned_text,prepped_title,prepped_text,processed_title,processed_text,sentiment_title
0,maybethrowed,2017-11-03,0,,Incels,WONDERFUEL: Streamer accuses his girlfriend of...,wonderfuel streamer accuses his girlfriend of ...,,wonderfuel streamer accuses girlfriend cheatin...,,wonderfuel streamer accuses girlfriend cheatin...,,"{'sadness': 0.0042288838885724545, 'joy': 0.00..."
1,IncelDegenerate,2017-11-03,0,,Incels,Guide to Females,guide to females,,guide female,,guide female,,"{'sadness': 0.05490565672516823, 'joy': 0.6465..."
2,TheOneAndOnlyDeggie,2017-11-03,0,,Incels,Incel? Chad? Something in between?,incel chad something in between,,incel chad something,,incel chad something,,"{'sadness': 0.021808000281453133, 'joy': 0.090..."
3,BJRgaminggod,2017-11-03,0,I’m in high school and I get ready to take my ...,Incels,VENT I white knighted for a girl in class beca...,vent i white knighted for a girl in class beca...,i m in high school and i get ready to take my ...,vent white knighted girl class saw 2 bag hot c...,high school get ready take social study class ...,vent white knighted girl class saw 2 bag hot c...,high school get ready take social study class ...,"{'sadness': 0.6625804305076599, 'joy': 0.01809..."
4,FreshCope44,2017-11-03,0,[removed],Incels,Shame on me,shame on me,removed,shame,removed,shame,removed,"{'sadness': 0.9164012670516968, 'joy': 0.00643..."


In [39]:
title_temp = df['processed_title'][4]
print(title_temp)
model = sentiment_model(title_temp)
model

shame


[[{'label': 'sadness', 'score': 0.9164012670516968},
  {'label': 'joy', 'score': 0.006433330476284027},
  {'label': 'love', 'score': 0.0024321023374795914},
  {'label': 'anger', 'score': 0.07191678136587143},
  {'label': 'fear', 'score': 0.0016750693321228027},
  {'label': 'surprise', 'score': 0.0011414638720452785}]]

In [56]:
test_list = list([])

for row in range(0, len(model)):
  model_temp1 = model[row]

  test_dict = {}
  for i in range(0, len(model_temp1)):
    model_temp2 = model[row][i]
    test_dict[model_temp2["label"]] = model_temp2["score"]
    
  test_list.append(test_dict)
  
test_list

[{'sadness': 0.9164012670516968,
  'joy': 0.006433330476284027,
  'love': 0.0024321023374795914,
  'anger': 0.07191678136587143,
  'fear': 0.0016750693321228027,
  'surprise': 0.0011414638720452785}]

In [52]:
df.iloc[0:10]["title"]

0    WONDERFUEL: Streamer accuses his girlfriend of...
1                                     Guide to Females
2                   Incel? Chad? Something in between?
3    VENT I white knighted for a girl in class beca...
4                                          Shame on me
5                                      Catfish Request
6                                               Always
7    Something Awful forums are obsessed with r/incels
8    Sex is the tool that the elites use to control...
9    Take the Whitepill. Everyone here needs to rea...
Name: title, dtype: object