In [1]:
# Install the third-party packages this notebook imports (transformers, plotly, datasets).
# NOTE(review): only plotly is pinned (4.14.1); transformers and datasets are unpinned,
# so a later re-run may install different versions than the ones used originally.
!pip install transformers
!pip install plotly==4.14.1
!pip install datasets



In [2]:
# Mount Google Drive at /content/drive so the project files stored there can be
# accessed through ordinary file paths in the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import sys

# Make the project folder on the mounted Drive importable.
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
PROJECT_DIR = '/content/drive/My Drive/data/icns_project'
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

In [4]:
import torch
import pandas as pd
import plotly.express as px
from transformers import RobertaTokenizer, RobertaModel
from datasets import load_dataset
from torch.utils.data import DataLoader
from pathlib import Path

from collections import defaultdict
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from scipy.spatial.distance import euclidean, pdist, squareform
from sklearn import manifold          #use this for MDS computation

#visualization libs
import plotly.graph_objects as go
import matplotlib.pyplot as plt
% matplotlib inline

In [5]:
# Show up to 800 characters per column when frames are displayed, so long texts
# are readable in the previews below.
# The fully qualified option key is used: pandas resolves shortened keys by
# pattern matching, which can become ambiguous if similarly named options exist.
pd.set_option('display.max_colwidth', 800)

In [6]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# torch.cuda.get_device_name() is only valid for CUDA devices, so query it only
# on the GPU path; on a CPU-only runtime report the device type instead of
# raising and aborting the notebook.
device_name = torch.cuda.get_device_name(device) if device.type == "cuda" else "cpu"
print('using device: ', device_name, flush=True)

using device:  Tesla T4


In [7]:
# Load the tokenizer of the pre-trained 'roberta-base' checkpoint (the same
# checkpoint name the model is loaded from).
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [8]:
# Load the pre-trained 'roberta-base' encoder, configured to also return its
# hidden states, and move it to the selected device in one expression.
model = RobertaModel.from_pretrained(
    'roberta-base',
    output_hidden_states=True,
).to(device)
# Switch to evaluation mode; the returned module is shown as the cell output.
model.eval()

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [9]:
# Project folder on the mounted Drive (relative to the current working directory).
DATA_PATH = Path('drive', 'My Drive', 'data', 'icns_project')
# Location of 'paraphrase-distilroberta-base-v1' inside the project folder.
MODEL_PATH = DATA_PATH / 'paraphrase-distilroberta-base-v1'

In [10]:
# Read both corpora (UTF-8 CSV files) from the project folder: news first, jokes second.
news_df, jokes_df = (
    pd.read_csv(DATA_PATH / csv_name, encoding='utf-8')
    for csv_name in ('BBC_news_adjusted.csv', 'jokes_stupid_wocka.csv')
)

In [11]:
# Preview the first two news rows to check the column names and text format.
news_df.head(2)

Unnamed: 0,Text,Category
0,worldcom ex-boss launches defence lawyers defending former worldcom chief bernie ebbers against a battery of fraud charges have called a company whistleblower as their first witness. cynthia cooper worldcom s ex-head of internal accounting alerted directors to irregular accounting practices at the us telecoms giant in 2002.,business
1,german business confidence slides german business confidence fell in february knocking hopes of a speedy recovery in europe s largest economy. munich-based research institute ifo said that its confidence index fell to 95.5 in february from 97.5 in january its first decline in three months.,business


In [12]:
# Preview the first two joke rows to check the column names and text format.
jokes_df.head(2)

Unnamed: 0,text,category,source,score
0,"A blackjack dealer and a player with a thirteen count in his hand were arguing about whether or not it was appropriate to tip the dealer. The player said, ""When I get bad cards, it's not the dealer's fault. Accordingly, when I get good cards, the dealer obviously had nothing to do with it so, why should I tip him?"" The dealer said, ""When you eat out do you tip the waiter?"" ""Yes."" ""Well then, he serves you food, I'm serving you cards, so you should tip me."" ""Okay, but, the waiter gives me what I ask for. I'll take an eight.""",Children,stupidstuff,2.63
1,"At a dinner party, several of the guests were arguing whether men or women were more trustworthy. 'No woman,' said one man, scornfully, 'can keep a secret.' 'I don't know about that,' answered a blonde woman guest. 'I have kept my age a secret since I was twenty-one.' 'You'll let it out some day,' the man insisted. 'I hardly think so!' responded the blonde lady. 'When a woman has kept a secret for twenty-seven years, she can keep it forever.'",Blonde Jokes,stupidstuff,2.57


In [13]:
# Give the news text column the same lower-case name ('text') that jokes_df uses,
# so both frames can be addressed with the same column key.
news_df = news_df.rename(columns={'Text': 'text'})

In [14]:
# Select the news rows whose text is missing.
na_rows = news_df.loc[news_df['text'].isna()]

In [15]:
# Display the news rows with missing text (the saved output lists none).
na_rows

Unnamed: 0,text,Category


In [16]:
# Select the joke rows whose text is missing.
# Note: this rebinds na_rows, which previously held the news rows.
na_rows = jokes_df.loc[jokes_df['text'].isna()]

In [17]:
# Display the joke rows with missing text (the saved output lists one, row 3281).
na_rows

Unnamed: 0,text,category,source,score
3281,,Other / Misc,wocka,


In [18]:
# Drop jokes that have no text.
# Filtering on the column itself (instead of dropping na_rows.index) keeps this
# cell independent of whatever na_rows currently holds -- that name is also used
# for the news check -- and makes the cell safe to re-run: a second run is a
# no-op rather than a KeyError for labels that are already gone.
jokes_df = jokes_df.dropna(subset=['text'])

In [19]:
# (rows, columns) of the news frame.
news_df.shape

(1490, 2)

In [20]:
# (rows, columns) of the jokes frame after removing rows without text.
jokes_df.shape

(13132, 4)

In [21]:
# Joke categories considered (possibly) relevant; only jokes from these
# categories have their lengths examined below.
categories = [
    'Sports',
    'Business',
    'Tech',
    'News / Politics',
    'Political',
    'Money',
    'At Work',
    'Office Jokes',
    'Computers',
]

In [22]:
# Keep only the jokes whose category appears in the `categories` shortlist.
jokes_df = jokes_df.loc[jokes_df['category'].isin(categories)]

In [23]:
# (rows, columns) of the jokes frame after restricting it to the shortlisted categories.
jokes_df.shape

(1201, 4)

In [24]:
%%time
# tokenize without padding and truncation
news_encodings = tokenizer(news_df['text'].to_list())

CPU times: user 870 ms, sys: 37 ms, total: 907 ms
Wall time: 920 ms


In [25]:
%%time
# tokenize without padding and truncation
jokes_encodings = tokenizer(jokes_df['text'].to_list())

Token indices sequence length is longer than the specified maximum sequence length for this model (595 > 512). Running this sequence through the model will result in indexing errors


CPU times: user 1.55 s, sys: 4.89 ms, total: 1.56 s
Wall time: 1.56 s


In [26]:
# Number of token ids per sample, in the same order as the encoded texts.
news_lens = list(map(len, news_encodings['input_ids']))
jokes_lens = list(map(len, jokes_encodings['input_ids']))

In [27]:
# Wrap the token counts in named Series so pandas summaries can be used on them.
news_lens_series = pd.Series(news_lens).rename('number_of_tokens')
jokes_lens_series = pd.Series(jokes_lens).rename('number_of_tokens')

In [28]:
# Summary statistics of the news token counts.
news_lens_series.describe()

count    1490.000000
mean       66.089933
std        10.718886
min        20.000000
25%        59.000000
50%        66.000000
75%        72.000000
max       114.000000
Name: number_of_tokens, dtype: float64

In [29]:
# Summary statistics of the joke token counts.
jokes_lens_series.describe()

count     1201.000000
mean       241.198168
std        569.486391
min          5.000000
25%         76.000000
50%        158.000000
75%        272.000000
max      17171.000000
Name: number_of_tokens, dtype: float64

In [30]:
# How many jokes exceed 512 tokens -- the maximum sequence length the tokenizer
# warning above reports for this model.
jokes_lens_series[jokes_lens_series > 512].shape

(98,)

In [31]:
# Renumber the rows 0..n-1 and discard the old labels. The token-length Series
# built earlier has a default 0..n-1 index, so after this step the jokes rows
# line up label-for-label with their token counts.
jokes_df = jokes_df.reset_index(drop=True)

In [32]:
# Attach the token counts as a 'length' column.
# The counts were computed from each frame's texts in row order, so they are
# assigned by position (to_numpy()) rather than by index label: a label mismatch
# would otherwise silently yield NaN or misplaced values, whereas a positional
# length mismatch raises immediately.
news_df['length'] = news_lens_series.to_numpy()
jokes_df['length'] = jokes_lens_series.to_numpy()

In [33]:
# Preview the first three jokes together with the new 'length' column.
jokes_df.head(3)

Unnamed: 0,text,category,source,score,length
0,"A brunette, a blonde, and a redhead all worked in the same office with the same female boss. Every day, they noticed their boss left work early. One day, the girls decided that when the boss left, they'd leave right behind her. After all, she never called in or came back to the office when she left early, so how was she to know? The next day, they all three left the office right after the boss left. The brunette was thrilled to be home early. She did a little gardening and went to bed early. The redhead was elated to be able to get in a quick workout at the health club before meeting her dinner date. The blonde was happy, happy, happy to be home, but when she got to the bedroom she heard a muffled noise from inside. Slowly, quietly, she cracked open the door and was mortified to se...",Office Jokes,stupidstuff,3.73,260
1,"Bill and Hillary Clinton went out to dinner and when the waiter came to take their order, he asked Bill how he wanted his steak, she replied, ""medium."" Then the waiter said, ""how about your vegetable?"" Bill replied, ""Oh, she can order for herself.""",Political,stupidstuff,3.5,59
2,"The stockbroker's secretary answered his phone one morning. ""I'm sorry,"" she said, ""Mr. Bradford's on another line."" ""This is Mr. Ingram's office,"" the caller said. ""We'd like to know if he's bullish or bearish right now."" ""He's talking to his wife,"" the secretary replied. ""Right now I'd say he's sheepish.""",Business,stupidstuff,2.33,85


In [34]:
# Jokes longer than 512 tokens, and how many of them each source
# (stupidstuff vs wocka) contributes.
long_jokes = jokes_df.loc[jokes_df['length'].gt(512)]
long_jokes['source'].value_counts()

wocka          74
stupidstuff    24
Name: source, dtype: int64

In [35]:
# Discard the over-length jokes identified in long_jokes, matching them by row label.
jokes_df = jokes_df.drop(index=long_jokes.index)

In [36]:
# (rows, columns) of the jokes frame after removing the over-length jokes.
jokes_df.shape

(1103, 5)

In [37]:
# Install/upgrade kaleido, the package plotly relies on for static image export
# (used by fig.write_image in the histogram cell).
# NOTE(review): this install sits mid-notebook and is unpinned; consider moving it
# to the install cell at the top.
! pip install -U kaleido

Requirement already up-to-date: kaleido in /usr/local/lib/python3.6/dist-packages (0.1.0)


In [42]:
# Histogram of token counts for both corpora, one trace per corpus.
fig = go.Figure()
for trace_name, lengths, colour in (
    ('news', news_df['length'], '#EB89B5'),
    ('jokes', jokes_df['length'], '#330C73'),
):
    fig.add_trace(
        go.Histogram(
            x=lengths,
            name=trace_name,
            # identical bins for both traces: width 50, from 0 up to 512
            xbins=dict(start=0, end=512, size=50),
            marker_color=colour,
        )
    )

# Titles, legend and bar spacing. barmode is not set here, so plotly's default
# applies (an 'overlay' setting was previously commented out).
fig.update_layout(
    title='Sample lengths (number of tokens)',
    xaxis_title='Number of tokens',
    yaxis_title='Frequency',
    legend_title='Source',
    bargap=0.2,  # gap between bars of adjacent location coordinates
)

# Make both traces slightly transparent.
fig.update_traces(opacity=0.7)

# Show the figure in the notebook, then export it as a PNG into the project folder.
fig.show()
fig.write_image(str(DATA_PATH / 'sample_length_histogram.png'))

In [47]:
# Number of remaining jokes per category.
jokes_df['category'].value_counts()

At Work            264
News / Politics    258
Sports             166
Tech               126
Political          109
Money               68
Computers           55
Business            40
Office Jokes        17
Name: category, dtype: int64

In [46]:
# Persist the filtered jokes (relevant categories only, over-length samples
# removed) as a UTF-8 CSV in the project folder, without the index column.
relevant_jokes_csv = str(DATA_PATH / 'jokes_stupid_wocka_relevant.csv')
jokes_df.to_csv(relevant_jokes_csv, encoding='utf-8', index=False)