# Convert YouTube Comments into Transaction Data

In [4]:
import spacy
from spacy.language import Language
import ray
import psutil
import modin.pandas as pd
from tqdm import tqdm
from modin.config import ProgressBar

# Turn on modin's progress bar so long-running DataFrame operations report progress.
ProgressBar.enable()
# Core count reported by psutil with logical=False (physical cores, per psutil's docs).
num_cpus = psutil.cpu_count(logical=False)
# ignore_reinit_error=True lets this cell be re-run when Ray is already initialised;
# Ray then only logs a message instead of raising.
ray.init(num_cpus=num_cpus, ignore_reinit_error=True)

2021-10-19 19:19:05,850	INFO worker.py:836 -- Calling ray.init() again after it has already been called.


Load data set.

In [5]:
# Read the labelled comments, then keep only the rows whose 'text' has at
# least one character.
df = pd.read_csv('../data/comments_labeled.csv')
df = df.loc[df['text'].str.len().gt(0)]
df.head()

Estimated completion of line 2:   0%           Elapsed time: 00:00, estimated remaining time: ?

Unnamed: 0,LABEL,video_id,text,like,published_at,published_week
0,0,-2Rd0A_WTDQ,which makes the attacker have to go through a...,0,2021-05-21,2021-05-17
1,0,-2Rd0A_WTDQ,the neck beard i can't look away,0,2021-05-15,2021-05-10
2,2,-2Rd0A_WTDQ,billion dollars is pathetic for a modern chip...,0,2021-05-15,2021-05-10
3,-1,-2Rd0A_WTDQ,imagine people fighting against facebook spyi...,0,2021-05-11,2021-05-10
4,4,-2Rd0A_WTDQ,if facebook was 1 a month it'd change everyth...,0,2021-05-11,2021-05-10


Load the spaCy pipeline.

In [6]:
# The model must be installed once beforehand, from a shell:
#   python -m spacy download en_core_web_sm
pipeline = spacy.load('en_core_web_sm')

Process one comment.

In [7]:
# Raw comment texts; later cells read this Series.
texts = df['text']
# Look the sample up by position: df was filtered after loading, so a given
# index label is not guaranteed to still exist and texts[2] could raise KeyError.
# NOTE: run this cell before 'remove_stop' is added to the pipeline; after that
# pipeline(...) returns a list of strings and the token attributes below are gone.
doc1 = pipeline(texts.iloc[2])
# Print the main linguistic attributes of every token in the sample comment.
for token in doc1:
    print({"text": token.text,
            "lemma": token.lemma_,
            "POS": token.pos_,
            "tag": token.tag_,
            "dep": token.dep_,
            "shape": token.shape_,
            "is_alpha": token.is_alpha,
            "is_stop": token.is_stop})

{'text': ' ', 'lemma': ' ', 'POS': 'SPACE', 'tag': '_SP', 'dep': 'compound', 'shape': ' ', 'is_alpha': False, 'is_stop': False}
{'text': 'billion', 'lemma': 'billion', 'POS': 'NUM', 'tag': 'CD', 'dep': 'nummod', 'shape': 'xxxx', 'is_alpha': True, 'is_stop': False}
{'text': 'dollars', 'lemma': 'dollar', 'POS': 'NOUN', 'tag': 'NNS', 'dep': 'nsubj', 'shape': 'xxxx', 'is_alpha': True, 'is_stop': False}
{'text': 'is', 'lemma': 'be', 'POS': 'AUX', 'tag': 'VBZ', 'dep': 'ROOT', 'shape': 'xx', 'is_alpha': True, 'is_stop': True}
{'text': 'pathetic', 'lemma': 'pathetic', 'POS': 'ADJ', 'tag': 'JJ', 'dep': 'acomp', 'shape': 'xxxx', 'is_alpha': True, 'is_stop': False}
{'text': 'for', 'lemma': 'for', 'POS': 'ADP', 'tag': 'IN', 'dep': 'mark', 'shape': 'xxx', 'is_alpha': True, 'is_stop': True}
{'text': 'a', 'lemma': 'a', 'POS': 'DET', 'tag': 'DT', 'dep': 'det', 'shape': 'x', 'is_alpha': True, 'is_stop': True}
{'text': 'modern', 'lemma': 'modern', 'POS': 'ADJ', 'tag': 'JJ', 'dep': 'amod', 'shape': 'xxxx

Build a pipeline to process comments.

In [8]:
@Language.component('remove_stop')
def remove_stop(doc):
    """Reduce a processed Doc to a list of cleaned, lower-cased lemmas.

    Stop words are dropped. Every other lemma is lower-cased, stripped of
    surrounding whitespace and of apostrophes, and kept only when the cleaned
    result is 2 to 24 characters long.

    Args:
        doc: the spaCy Doc produced by the preceding pipeline components.

    Returns:
        list[str]: the cleaned lemmas, in document order.

    NOTE(review): spaCy documents pipeline components as Doc -> Doc. This one
    returns a list on purpose (later cells join the result of pipeline(text)
    directly), so it has to stay the last component. Confirm that the
    installed spaCy version accepts a non-Doc return value.
    """
    kept = []
    for token in doc:
        if token.is_stop:
            continue
        lemma = token.lemma_.lower().strip().replace("'", '')
        # Measure the length after cleaning: otherwise whitespace-only tokens
        # or lemmas such as "''" pass the check and end up as empty strings.
        if 1 < len(lemma) < 25:
            kept.append(lemma)
    return kept


# add_pipe raises if the name is already in the pipeline, which happens on every
# re-run of this cell; drop the stale entry first so the fresh function is used.
if 'remove_stop' in pipeline.pipe_names:
    pipeline.remove_pipe('remove_stop')
# Trailing semicolon: do not echo the function repr as cell output.
pipeline.add_pipe('remove_stop');

<function __main__.remove_stop(doc)>

Check the components of our pipeline.

In [9]:
# pretty=True already prints the component table; the trailing semicolon keeps
# Jupyter from additionally echoing the returned analysis dict.
pipeline.analyze_pipes(pretty=True);

[1m

#   Component         Assigns               Requires   Scores             Retokenizes
-   ---------------   -------------------   --------   ----------------   -----------
0   tok2vec           doc.tensor                                          False      
                                                                                     
1   tagger            token.tag                        tag_acc            False      
                                                                                     
2   parser            token.dep                        dep_uas            False      
                      token.head                       dep_las                       
                      token.is_sent_start              dep_las_per_type              
                      doc.sents                        sents_p                       
                                                       sents_r                       
                                                

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'remove_stop': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
 

Process a sample comment with the full pipeline.

In [10]:
# Read the sample from df by position: index label 2 may be missing after the
# empty-text filter, and `texts` is overwritten with token lists by a later cell.
sample_text = df['text'].iloc[2]
print('origin text: ', sample_text)
# With 'remove_stop' as the last component this prints a list of lemmas.
print(pipeline(sample_text))

origin text:   billion dollars is pathetic for a modern chip building factory intel would need to invest 10x more they are drowning under money these days but it's easier to keep supply low for better profit 
['billion', 'dollar', 'pathetic', 'modern', 'chip', 'building', 'factory', 'intel', 'need', 'invest', '10x', 'drown', 'money', 'day', 'easy', 'supply', 'low', 'well', 'profit']


Process all the comments and save the processed comments to a txt file.

In [11]:
def f(t):
    """Run one comment text through the spaCy pipeline and return its result."""
    return pipeline(t)


# Start from df['text'] rather than from `texts`: re-running this cell would
# otherwise feed the already-processed token lists back into the pipeline.
texts = df['text'].apply(f)

Estimated completion of line 5:   0%           Elapsed time: 00:00, estimated remaining time: ?

In [12]:
# Write one transaction per line: the comma-joined tokens of a comment.
# Comments with fewer than two tokens are skipped.
# The context manager closes the file even if the loop raises, and the explicit
# encoding keeps non-ASCII comment text from failing on platforms whose default
# encoding is not UTF-8. The handle is not named `f`, so the function `f` from
# the previous cell is no longer shadowed.
with open('../data/transaction.txt', 'w', encoding='utf-8') as out_file:
    for tokens in tqdm(texts):
        if len(tokens) < 2:
            continue
        out_file.write(','.join(tokens) + '\n')


  0%|          | 0/142486 [00:00<?, ?it/s][A
  0%|          | 1/142486 [02:29<5930:24:15, 149.84s/it][A
100%|██████████| 142486/142486 [02:30<00:00, 949.71it/s][A
