# 1. Install and Import Dependencies

In [1]:
!pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio===0.8.1 -f https://download.pytorch.org/whl/torch_stable.html

Looking in links: https://download.pytorch.org/whl/torch_stable.html


In [2]:
!pip install transformers requests beautifulsoup4 pandas numpy



In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import re

# 2. Instantiate Model

In [4]:
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

# 3. Encode and Calculate Sentiment

In [13]:
sentence = "It was ok"
tokens = tokenizer.encode(sentence, return_tensors='pt')
result = model(tokens)
result.logits
int(torch.argmax(result.logits))+1

3

# 3.5 Import Data

In [16]:
import os
import pandas as pd

In [15]:
os.listdir("./data/")

['RS_2021-06.json',
 'RS_2021-06.zst',
 'disney_all_subs_w_dates.csv',
 'reddit.csv',
 'twitter.csv']

In [17]:
df = pd.read_csv("./data/disney_all_subs_w_dates.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [18]:
len(df)

99274

In [19]:
df.selftext[:100]

0                                             [deleted]
1                                             [deleted]
2                                                   NaN
3                                             [deleted]
4                                                   NaN
                            ...                        
95                                                  NaN
96                                            [deleted]
97                                                  NaN
98                                                  NaN
99    Maybe it’d be better than to be scooped up by ...
Name: selftext, Length: 100, dtype: object

In [60]:
df2 = df.selftext.isnull()

In [63]:
type(df2)

pandas.core.series.Series

In [64]:
df2[:10]

0    False
1    False
2     True
3    False
4     True
5     True
6     True
7     True
8     True
9     True
Name: selftext, dtype: bool

In [65]:
df3 = df[df2 == False]

In [67]:
df3.selftext[:10]

0     [deleted]
1     [deleted]
3     [deleted]
10    [removed]
12    [removed]
13    [removed]
14    [removed]
18    [deleted]
22    [removed]
25    [deleted]
Name: selftext, dtype: object

In [71]:
df4 = df3[df3.selftext != "[deleted]"]

In [73]:
df5 = df4[df4.selftext != "[removed]"]

In [74]:
df5.selftext[:10]

38      As you probably are aware, Magic Bands have a ...
62      hello, my mother, me and her best friend and s...
99      Maybe it’d be better than to be scooped up by ...
298     Like the title says, I was hoping to find some...
970     The concept art may look promising that Disney...
1406    First of all, I'm not from the US so I don't k...
1442    So a few weeks ago, I came home to my wife wat...
1472    So I'm planning to binge-watch "The Lion Guard...
1478                      Anyone have any they just love?
1510    I have been SCOURING the internet for weeks fo...
Name: selftext, dtype: object

In [107]:
len(df5)

11527

In [112]:
df6 = df5[df5.selftext.apply(lambda x: len(str(x)) < 512)]

In [113]:
len(df6)

8093

In [114]:
sentence = df5.iloc[0].selftext

In [119]:
list1 = []

In [116]:
df6.iloc[0].selftext

'hello, my mother, me and her best friend and son are going to disney in a few months and i was wondering if disney allows autistic persons to buddy up with underage kids (i want to buddy up with my moms friends son who is 21 and has autism) not trying to be rude but some theme parks don’t allow that'

In [121]:
from tqdm.notebook import tqdm

In [122]:
with tqdm(total=len(df6)) as pbar:
    for i in range(len(df6)):
        tokens = tokenizer.encode(df6.iloc[i].selftext, return_tensors='pt')
        result = model(tokens)
        result.logits
        list1.append(int(torch.argmax(result.logits))+1)
        pbar.update(1)

  0%|          | 0/8093 [00:00<?, ?it/s]

In [141]:
type(list1)

list

In [142]:
list1[:10]

[4, 2, 4, 2, 2, 3, 5, 5, 3, 3]

In [143]:
list2 = pd.Series(list1)

In [144]:
len(list2)

8093

In [152]:
list3 = list2.rename("sentiment")

In [153]:
list3.head(10)

0    4
1    2
2    4
3    2
4    2
5    3
6    5
7    5
8    3
9    3
Name: sentiment, dtype: int64

In [154]:
result = pd.concat([df6, list3], axis=1)

In [155]:
result.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,...,is_gallery,gallery_data,media_metadata,edited,banned_by,author_cakeday,author_is_blocked,is_created_from_ads_ui,creation_date,sentiment
0,,,,,,,,,,,...,,,,,,,,,,4.0
1,,,,,,,,,,,...,,,,,,,,,,2.0
2,,,,,,,,,,,...,,,,,,,,,,4.0
3,,,,,,,,,,,...,,,,,,,,,,2.0
4,,,,,,,,,,,...,,,,,,,,,,2.0


In [156]:
result.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16143 entries, 0 to 99273
Data columns (total 83 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   all_awardings                  776 non-null    object 
 1   allow_live_comments            453 non-null    object 
 2   author                         8093 non-null   object 
 3   author_flair_css_class         993 non-null    object 
 4   author_flair_richtext          2804 non-null   object 
 5   author_flair_text              103 non-null    object 
 6   author_flair_type              2804 non-null   object 
 7   author_fullname                6187 non-null   object 
 8   author_patreon_flair           1856 non-null   object 
 9   author_premium                 230 non-null    object 
 10  awarders                       287 non-null    object 
 11  can_mod_post                   3416 non-null   object 
 12  contest_mode                   4277 non-null  