# Imports and installs

In [None]:
!pip install pandas numpy scikit-learn gdown sentence_transformers ffmpeg youtube-dl

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 1.6 MB/s 
[?25hCollecting ffmpeg
  Downloading ffmpeg-1.4.tar.gz (5.1 kB)
Collecting youtube-dl
  Downloading youtube_dl-2021.12.17-py2.py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 9.2 MB/s 
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 30.4 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 46.9 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 4.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0

In [None]:
import os
import numpy as np
import pandas as pd
from base64 import b64encode
from pandas import DataFrame
from datetime import timedelta
from IPython.display import HTML
from tensorflow import convert_to_tensor
from tensorflow.keras import activations
from sentence_transformers import CrossEncoder
from sklearn.feature_extraction.text import CountVectorizer

# Summarization Example

## Model downloading

In [None]:
!gdown https://drive.google.com/uc?id=1tZmDlrkV3MTNRiBasr7S-JQx_PTIeEdW
!unzip multilingual_subj_class_model.zip

Downloading...
From: https://drive.google.com/uc?id=1tZmDlrkV3MTNRiBasr7S-JQx_PTIeEdW
To: /content/multilingual_subj_class_model.zip
100% 661M/661M [00:08<00:00, 79.2MB/s]
Archive:  multilingual_subj_class_model.zip
   creating: multilingual_subj_class_model/
  inflating: multilingual_subj_class_model/config.json  
  inflating: multilingual_subj_class_model/pytorch_model.bin  
  inflating: multilingual_subj_class_model/special_tokens_map.json  
  inflating: multilingual_subj_class_model/tokenizer.json  
  inflating: multilingual_subj_class_model/tokenizer_config.json  
  inflating: multilingual_subj_class_model/vocab.txt  


## Dataset downloading

In [None]:
## Downloading and reading CMU MOSI dataset

!gdown https://drive.google.com/uc?id=1epb3W1u8MDfHj9kBIlrR7mmcIX7QtVoj

# NOTE(review): unpickling can execute arbitrary code — only load this file
# while the Drive source is trusted.
df_data = pd.read_pickle('cmu_mosi_subjectivity.pkl')

Downloading...
From: https://drive.google.com/uc?id=1epb3W1u8MDfHj9kBIlrR7mmcIX7QtVoj
To: /content/cmu_mosi_subjectivity.pkl
  0% 0.00/221k [00:00<?, ?B/s]100% 221k/221k [00:00<00:00, 95.8MB/s]


## Summarization Process

In [None]:
## Filtering one video to summarize

example_video_id = '9J25DZhivz8'

# For a random video
# example_video_id =  df_data.youtube_id.sample().iloc[0]

# Take an explicit copy: the prediction cell adds a column to `video_data`,
# and writing into a filtered slice of `df_data` raises pandas'
# SettingWithCopyWarning.
video_data = df_data[df_data.youtube_id == example_video_id].copy()

In [None]:
## Predicting subjectivity in video data

# Loading model
model_path = 'multilingual_subj_class_model'
model = CrossEncoder(model_path, num_labels=2)

# Data to predict: one single-element row per transcript sentence
num_rows = video_data.shape[0]
sentences = video_data.transcript.to_numpy().reshape((num_rows, 1))

# Predictions: softmax over the model outputs of every sentence
np_preds = activations.softmax(convert_to_tensor(model.predict(sentences))).numpy()

# Predict classes (0 = Objective; 1 = Subjective)
class_predictions = np.argmax(np_preds, axis=1)

# Map label and attach it with `assign`, which builds a new frame instead of
# writing into a filtered slice (this is what triggered pandas'
# SettingWithCopyWarning).
video_data = video_data.assign(
    subjectivity=np.where(class_predictions, 'subjective', 'objective'))
summ_video_data = video_data[video_data.subjectivity == 'subjective']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
## Creating smaller version of the video

# Download original video from youtube
# `{example_video_id}` is interpolated by IPython from the Python variable of
# that name before the shell command runs.
!youtube-dl https://youtube.com/watch?v={example_video_id} -o original_video.mp4

# Cropping video into output
def _merge_segments(intervals):
    """ Merge overlapping or touching (start, end) pairs into sorted,
    disjoint time windows """
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            # Overlap with the current window: extend it, never shrink it
            # (a segment fully contained in the window must not cut it short)
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged


def generate_ffmpeg_input(time_data: DataFrame,
                          video_file_name:str='original_video.mp4') -> None:
    """ This function generates a txt file with the
    time windows to crop into the summarized video

    Parameters
    ----------
    time_data : DataFrame
        Must expose `time_start` and `time_end` columns, in seconds.
        Rows may be unsorted and may overlap; they are sorted and merged here.
    video_file_name : str
        Name written on every `file` line of the concat script.

    Side effects
    ------------
    Overwrites `input.txt` in the current working directory with one
    file/inpoint/outpoint triple per merged window. With no rows the
    file is written empty.
    """

    subjective_segments = zip(time_data.time_start, time_data.time_end)
    segments = _merge_segments(subjective_segments)

    with open('input.txt', 'w') as f:
        for start, end in segments:
            f.write(f"file '{video_file_name}'\n")
            f.write(f"inpoint {timedelta(seconds=start)}\n")
            f.write(f"outpoint {timedelta(seconds=end)}\n")

time_frames = summ_video_data[['time_start', 'time_end']].sort_values('time_start')
generate_ffmpeg_input(time_frames)

!ffmpeg -safe 0 -f concat -segment_time_metadata 1 -i input.txt -vf select=concatdec_select -af aselect=concatdec_select,aresample=async=1 output.mp4

In [None]:
## Playing videos

original_file = 'original_video.mp4'
output_file = 'output.mp4'

# Read both clips through context managers so the file handles are closed
# as soon as the bytes are in memory.
with open(original_file, 'rb') as video_file:
    orig_mp4 = video_file.read()
with open(output_file, 'rb') as video_file:
    out_mp4 = video_file.read()

# Embed the raw bytes as base64 data URLs so the players need no file server
orig_data_url = "data:video/mp4;base64," + b64encode(orig_mp4).decode()
out_data_url = "data:video/mp4;base64," + b64encode(out_mp4).decode()

# Last expression of the cell: rendered as two stacked video players
HTML(
    """
    <h2>Original Video</h2>
    <video width=400 controls>
        <source src="%s" type="video/mp4">
    </video>
    <br>
    <h2>Summarized Video</h2>
    <video width=400 controls>
        <source src="%s" type="video/mp4">
    </video>
    """ % (orig_data_url, out_data_url))

# Datasets info

## Pre-processing

In [None]:
# Create datasets directories
!mkdir processed\ datasets
!mkdir raw\ datasets/
!mkdir raw\ datasets/Subjectivity\ Dataset\ v1
!mkdir raw\ datasets/Book\ Reviews
!mkdir raw\ datasets/Computer-BR
!mkdir raw\ datasets/HS-MVideoSumm
!mkdir raw\ datasets/Produtos\ Eletronicos
!mkdir raw\ datasets/CMU-MOSI

In [None]:
# Download Subjectivity Dataset v1
!wget http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz
!tar -xf rotten_imdb.tar.gz -C raw\ datasets/Subjectivity\ Dataset\ v1

# Download Book Reviews dataset
!wget https://raw.githubusercontent.com/Lubelisa/Natural-Linguage-Processing/master/Corpus%20of%20Book%20Reviews/corpus_book_reviews_portuguese.csv
!mv corpus_book_reviews_portuguese.csv raw\ datasets/Book\ Reviews/

# Download Computer-BR dataset
!wget https://github.com/Luizgferreira/subjectivity-classifier/raw/master/src/data/raw/Computer-BR.xlsx
!mv Computer-BR.xlsx raw\ datasets/Computer-BR/

# Download HS-MVideoSumm
!gdown https://drive.google.com/uc?id=1QPPZpbUv381Rl0_cA1nC-hSqtRqyD5Ab
!unzip 'Ground truth - HSMVideoSumm.zip' -d raw\ datasets/HS-MVideoSumm/ 

# Download Produtos Eletronicos
!wget https://github.com/Luizgferreira/subjectivity-classifier/raw/master/src/data/raw/sentencas.xlsx
!mv sentencas.xlsx raw\ datasets/Produtos\ Eletronicos/

# Download CMU-MOSI
!gdown https://drive.google.com/uc?id=1epb3W1u8MDfHj9kBIlrR7mmcIX7QtVoj
!mv cmu_mosi_subjectivity.pkl raw\ datasets/CMU-MOSI/

--2022-06-09 02:45:59--  http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz
Resolving www.cs.cornell.edu (www.cs.cornell.edu)... 132.236.207.36
Connecting to www.cs.cornell.edu (www.cs.cornell.edu)|132.236.207.36|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 519599 (507K) [application/x-gzip]
Saving to: ‘rotten_imdb.tar.gz’


2022-06-09 02:46:01 (539 KB/s) - ‘rotten_imdb.tar.gz’ saved [519599/519599]

--2022-06-09 02:46:01--  https://raw.githubusercontent.com/Lubelisa/Natural-Linguage-Processing/master/Corpus%20of%20Book%20Reviews/corpus_book_reviews_portuguese.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 55542 (54K) [text/plain]
Saving to: ‘corpus_book_reviews_portuguese.csv’


2022-06-09 02:46:02 (3

### HS-MVideoSumm

In [None]:
## Reading datasets into one dataframe

hsmvideosumm_path = 'raw datasets/HS-MVideoSumm'
datasets = []

# Every file found under the extracted tree is loaded as a JSON table and
# tagged with where it came from before being collected.
for root, subdirs, files in os.walk(hsmvideosumm_path):
    for filename in files:
        path = os.path.join(root, filename)
        annotated = pd.read_json(path).assign(
            path=path,
            dataset=path.split('/')[1],
            content_length=lambda frame: frame.content.str.len(),
        )
        datasets.append(annotated.drop(columns=['begin', 'end']))

df = pd.concat(datasets)
df['class'].value_counts()

Neutro                            69
Neutro - sem faces                37
Entrevista                        23
Opinião de repórter                7
Opinião de repórter/entrevista     1
Name: class, dtype: int64

In [None]:
## Mapping classes as either subjective or objective

# The two "Neutro" classes are objective; every other class is subjective
neutral_classes = ['Neutro', 'Neutro - sem faces']
is_neutral = df['class'].isin(neutral_classes)
df['subjectivity'] = np.where(is_neutral, 'objective', 'subjective')
df['subjectivity'].value_counts()

objective     106
subjective     31
Name: subjectivity, dtype: int64

In [None]:
## Saving dataset

# Only the text and its label are exported, as a `;`-separated UTF-8 file
save_path = 'processed datasets/hsmvideosumm.csv'
df.to_csv(save_path, columns=['content', 'subjectivity'],
          sep=';', encoding='utf-8', index=False)

### Computer-BR

In [None]:
## Reading original dataset Excel file

# Source column -> name used by the rest of the notebook
column_names = {'Mensagem': 'content', 'FINAL': 'polarity'}
df = pd.read_excel('raw datasets/Computer-BR/Computer-BR.xlsx')
df = df[list(column_names)].rename(columns=column_names)
df['polarity'].value_counts()

 0    1677
-1     407
 1     197
-2      36
Name: polarity, dtype: int64

In [None]:
## Mapping polarity classes as either subjective or objective

# Polarity 0 is labelled objective; every other value is labelled subjective
is_objective = df['polarity'] == 0
df['subjectivity'] = np.where(is_objective, 'objective', 'subjective')
df['subjectivity'].value_counts()

objective     1677
subjective     640
Name: subjectivity, dtype: int64

In [None]:
## Saving dataset

# Only the text and its label are exported, as a `;`-separated UTF-8 file
save_path = 'processed datasets/computerbr.csv'
df.to_csv(save_path, columns=['content', 'subjectivity'],
          sep=';', encoding='utf-8', index=False)

### Book Reviews

In [None]:
## Reading original dataset csv file

book_reviews_csv = 'raw datasets/Book Reviews/corpus_book_reviews_portuguese.csv'
# Keep the sentence and its label; only the sentence column is renamed
df = (
    pd.read_csv(book_reviews_csv)
    .loc[:, ['FRASE', 'OBJ/SUBJ']]
    .rename(columns={'FRASE': 'content'})
)
df.loc[:, 'OBJ/SUBJ'].value_counts()

objetiva     175
subjetiva    175
Name: OBJ/SUBJ, dtype: int64

In [None]:
## Renaming classes to English

# 'objetiva' becomes 'objective'; any other label becomes 'subjective'
is_objective = df['OBJ/SUBJ'] == 'objetiva'
df['subjectivity'] = np.where(is_objective, 'objective', 'subjective')
df['subjectivity'].value_counts()

objective     175
subjective    175
Name: subjectivity, dtype: int64

In [None]:
## Saving dataset

# Only the text and its label are exported, as a `;`-separated UTF-8 file
save_path = 'processed datasets/bookreviews.csv'
df.to_csv(save_path, columns=['content', 'subjectivity'],
          sep=';', encoding='utf-8', index=False)

### Electronic Products

In [None]:
## Reading original dataset Excel file

# Source column -> name used by the rest of the notebook
column_names = {'Sentença': 'content', 'Polaridade': 'polarity'}
df = pd.read_excel('raw datasets/Produtos Eletronicos/sentencas.xlsx')
df = df[list(column_names)].rename(columns=column_names)
df['polarity'].value_counts()

 1    131
-1     59
 0     43
Name: polarity, dtype: int64

In [None]:
## Mapping polarity classes as either subjective or objective

# Polarity 0 is labelled objective; every other value is labelled subjective
is_objective = df['polarity'] == 0
df['subjectivity'] = np.where(is_objective, 'objective', 'subjective')
df['subjectivity'].value_counts()

subjective    190
objective      43
Name: subjectivity, dtype: int64

In [None]:
## Saving dataset

# Only the text and its label are exported, as a `;`-separated UTF-8 file
save_path = 'processed datasets/electronicproducts.csv'
df.to_csv(save_path, columns=['content', 'subjectivity'],
          sep=';', encoding='utf-8', index=False)

### Subjectivity Dataset v1

In [None]:
## Reading original dataset files

dataset_path = 'raw datasets/Subjectivity Dataset v1'
obj_set = os.path.join(dataset_path, 'plot.tok.gt9.5000')
subj_set = os.path.join(dataset_path, 'quote.tok.gt9.5000')

# A separator longer than one character is only supported by the python
# parser; naming the engine explicitly avoids the ParserWarning pandas emits
# when it has to fall back from the C engine on its own.
read_options = dict(header=None, sep=';;;;', encoding="ISO-8859-1", engine='python')

# Both files load as a single unnamed column (0), renamed to 'content'
df_obj = pd.read_csv(obj_set, **read_options).rename(columns={0: 'content'})
df_subj = pd.read_csv(subj_set, **read_options).rename(columns={0: 'content'})

  return func(*args, **kwargs)


In [None]:
## Concatenating subjective and objective sets with corresponding class column

df_obj['subjectivity'] = 'objective'
df_subj['subjectivity'] = 'subjective'

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True produces the same stacked frame.
df = pd.concat([df_obj, df_subj], ignore_index=True)
df.subjectivity.value_counts()

objective     5000
subjective    5000
Name: subjectivity, dtype: int64

In [None]:
## Saving dataset

# Only the text and its label are exported, as a `;`-separated UTF-8 file
save_path = 'processed datasets/subjectivitydatasetv1.csv'
df.to_csv(save_path, columns=['content', 'subjectivity'],
          sep=';', encoding='utf-8', index=False)

### CMU-MOSI

In [None]:
## Reading original dataset

dataset_path = 'raw datasets/CMU-MOSI/cmu_mosi_subjectivity.pkl'
# Load the pickled frame and expose the transcript under the shared 'content' name
df = pd.read_pickle(dataset_path).rename(columns={'transcript': 'content'})
df['subjectivity_score'].value_counts()

 2.000000    120
-1.000000    110
-2.000000    103
 0.000000     96
 1.800000     94
 2.200000     94
 1.400000     92
-1.600000     91
 1.600000     90
-1.400000     79
 0.800000     77
-1.800000     72
-1.200000     71
-0.400000     69
 1.200000     68
-0.800000     68
 0.600000     63
 0.400000     62
 1.000000     62
 0.200000     61
-2.200000     61
-0.600000     55
-0.200000     55
 2.400000     54
 2.600000     44
-2.400000     40
-2.600000     36
-2.800000     32
 2.800000     29
-0.500000     18
 0.500000     17
 3.000000     12
-3.000000     12
 0.250000     12
-0.250000     10
-1.500000     10
-1.250000      8
-0.750000      8
 1.250000      7
-1.750000      4
 1.750000      4
 0.750000      4
 1.333333      4
-0.666667      3
 1.500000      3
-0.333333      2
 0.666667      2
-2.250000      2
 2.500000      2
-1.333333      2
-2.500000      2
 2.250000      2
 0.333333      1
Name: subjectivity_score, dtype: int64

In [None]:
## Classifying subjectivity scores into either subjective or objective

# Scores at or beyond +/-2 are labelled subjective; everything in between objective
condition = df.subjectivity_score.abs() >= 2
df['subjectivity'] = np.where(condition, 'subjective', 'objective')
df['subjectivity'].value_counts()

objective     1554
subjective     645
Name: subjectivity, dtype: int64

In [None]:
## Saving dataset

# Only the text and its label are exported, as a `;`-separated UTF-8 file
save_path = 'processed datasets/cmumosi.csv'
df.to_csv(save_path, columns=['content', 'subjectivity'],
          sep=';', encoding='utf-8', index=False)

## Describing

In [None]:
def token_data(content):
    """ Compute the vocabulary size and the mean token count per text

    Parameters
    ----------
    content : iterable of str
        Raw texts, one entry per document (the notebook passes a
        DataFrame column).

    Returns
    -------
    tuple
        (number of distinct tokens in the CountVectorizer vocabulary,
         mean number of counted tokens per text, rounded to 2 decimals)
    """
    vect = CountVectorizer()
    # Keep the document-term matrix sparse: .toarray() would allocate
    # documents x vocabulary cells, which is very large for the bigger corpora
    X = vect.fit_transform(content)

    total_tokens = X.shape[1]
    # Sum of all counts divided by the number of texts == mean of the row sums
    avg_tokens = X.sum() / X.shape[0]

    return total_tokens, np.round(avg_tokens, 2)

In [None]:
datasets = ['bookreviews', 'computerbr', 'electronicproducts', 'hsmvideosumm', 'subjectivitydatasetv1', 'cmumosi']

dfs = []
tokens_data = {}

# Reload every processed corpus, tag its rows with the corpus name and
# collect its token statistics.
for ds in datasets:
    corpus_df = pd.read_csv(f'processed datasets/{ds}.csv', sep=';').assign(corpus=ds)

    total_tokens, avg_tokens = token_data(corpus_df.content)
    tokens_data[ds] = {
        'Avg. tokens by text': avg_tokens,
        'Total tokens': total_tokens
    }

    dfs.append(corpus_df)

# One row of token statistics per corpus
tokens_data = pd.DataFrame(tokens_data).T
# Fresh 0..n-1 row numbers, also exposed as an 'index' column for counting
df = pd.concat(dfs, ignore_index=True).reset_index()

In [None]:
# Number of rows per corpus and subjectivity label
description = df.pivot_table(index='corpus', columns='subjectivity',
                             values='index', aggfunc='count')

# Add the token statistics next to the label counts (joined on corpus name)
datasets_info = description.join(tokens_data)
datasets_info

Unnamed: 0_level_0,objective,subjective,Avg. tokens by text,Total tokens
corpus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bookreviews,175,175,20.26,2472.0
cmumosi,1554,645,11.75,3086.0
computerbr,1677,640,18.19,6323.0
electronicproducts,43,190,24.56,1779.0
hsmvideosumm,106,31,43.14,1754.0
subjectivitydatasetv1,5000,5000,20.71,20897.0
