# Analyzing Hyperpartisan Documents (on unigrams and bigrams)

## 1. Setup Working Environment

### 1.1. Load zip data files

#### 1.1.1. Download zip files from Google Drive link

In [None]:
import gdown

# Shared Google Drive folder that holds the zipped data files.
zip_data_url = "https://drive.google.com/drive/folders/1e8tgF2tGdZJ0HU-7pKX7yMLBLw71XTMe?usp=drive_link"

# Download the whole folder; the URL is passed by keyword to make the
# argument's role explicit.
gdown.download_folder(url=zip_data_url)

#### 1.1.2. Move downloaded data files to the corresponding `zip_data_path`

In [None]:
import os
import shutil


# Directory the download step is expected to have created, and the directory
# the zip files are moved into.
downloaded_data_path = 'data'
zip_data_path = '../data/zip'

# exist_ok=True: do not fail with FileExistsError when the target directory
# is already present (e.g. left over from an earlier run).
os.makedirs(zip_data_path, exist_ok=True)

# List the same directory the source paths are built from, rather than
# repeating the 'data' literal, so the two can never drift apart.
for zip_data_file in os.listdir(downloaded_data_path):
    source_path = os.path.join(downloaded_data_path, zip_data_file)
    destination_path = os.path.join(zip_data_path, zip_data_file)
    shutil.move(source_path, destination_path)

# Every entry has been moved out, so the download directory is empty here;
# os.rmdir only removes empty directories.
os.rmdir(downloaded_data_path)

### 1.2. Install required packages

In [None]:
!pip install -r ../requirements.txt

## 2. Perform Hyperpartisan Documents Analysis 

### 2.1. Add necessary import statements

In [None]:
from src.log_odd_ratios.LogOddRatiosAnalyzer import LogOddRatiosAnalyzer
from src.log_odd_ratios.LogOddRatiosCalculator import LogOddRatiosCalculator, TokenType
from src.constant_values.enums import DocumentType
from src.get_hyperpartisan_data.HyperpartisanDocumentsFormatter import HyperpartisanDocumentsFormatter
from src.get_hyperpartisan_data.HyperpartisanDocumentsProcessor import HyperpartisanDocumentsProcessor
from src.utils import console_output_formatter

### 2.2. Adapt hyperpartisan documents format into text files

In [None]:
# Announce the stage in the console output.
console_output_formatter.print_section_header(
    section_header='Data formatting'
)

# Build the formatter and let it adapt the dataset format.
hyperpartisan_documents_formatter = HyperpartisanDocumentsFormatter()
hyperpartisan_documents_formatter.adapt_dataset_format()

### 2.3. Load hyperpartisan documents (processing them first if necessary)

In [None]:
console_output_formatter.print_section_header(
    section_header='Data loading / preprocessing'
)
hyperpartisan_documents_processor = HyperpartisanDocumentsProcessor()

# Short aliases for the two callables used once per document group below.
load_clean_documents = hyperpartisan_documents_processor.get_clean_documents
report_document_stats = console_output_formatter.print_document_list_stats

# Hyperpartisan group: load the clean documents, then print their stats.
hyperpartisan_document_list = load_clean_documents(
    document_type=DocumentType.HYPERPARTISAN
)
report_document_stats(
    document_type=DocumentType.HYPERPARTISAN,
    document_list=hyperpartisan_document_list
)

# Non-hyperpartisan group: same two steps.
non_hyperpartisan_document_list = load_clean_documents(
    document_type=DocumentType.NON_HYPERPARTISAN
)
report_document_stats(
    document_type=DocumentType.NON_HYPERPARTISAN,
    document_list=non_hyperpartisan_document_list
)

### 2.4. Remove infrequent words (the log-odd ratio is sensitive to them)

In [None]:
# Filter both document groups in a single call, then rebind the two lists to
# the filtered results (the call is expected to return them as a pair).
filtered_document_lists = hyperpartisan_documents_processor.remove_infrequent_words(
    hyperpartisan_documents=hyperpartisan_document_list,
    non_hyperpartisan_documents=non_hyperpartisan_document_list,
)
hyperpartisan_document_list, non_hyperpartisan_document_list = filtered_document_lists

### 2.5. Calculate log-odd ratios

#### 2.5.1. Calculate on `UNIGRAMS`

In [None]:
console_output_formatter.print_section_header(
    section_header='Log-odd ratios calculation (on unigrams)'
)

# Configure the calculator for unigram tokens over both document groups.
log_odd_ratios_calculator = LogOddRatiosCalculator(
    token_type=TokenType.UNIGRAM,
    hyperpartisan_documents=hyperpartisan_document_list,
    non_hyperpartisan_documents=non_hyperpartisan_document_list,
)
log_odd_ratios_calculator.calculate_log_odd_ratios()

#### 2.5.2. Calculate on `BIGRAMS`

In [None]:
console_output_formatter.print_section_header(
    section_header='Log-odd ratios calculation (on bigrams)'
)

# Configure the calculator for bigram tokens over both document groups.
log_odd_ratios_calculator = LogOddRatiosCalculator(
    token_type=TokenType.BIGRAM,
    hyperpartisan_documents=hyperpartisan_document_list,
    non_hyperpartisan_documents=non_hyperpartisan_document_list,
)
log_odd_ratios_calculator.calculate_log_odd_ratios()

### 2.6. Get most relevant words for each document group

#### 2.6.1. Most relevant words (on `UNIGRAMS`)

In [None]:
console_output_formatter.print_section_header(
    section_header='Most relevant words extraction (on unigrams)'
)
log_odd_ratios_analyzer = LogOddRatiosAnalyzer(token_type=TokenType.UNIGRAM)
most_relevant_unigrams = log_odd_ratios_analyzer.get_most_relevant_words

# Hyperpartisan group: first with the default settings, then with
# infinite_values=False.
hyperpartisan_most_relevant_words_on_unigrams = most_relevant_unigrams(
    document_type=DocumentType.HYPERPARTISAN
)
print(f'{hyperpartisan_most_relevant_words_on_unigrams} \n')
hyperpartisan_most_relevant_words_not_inf_on_unigrams = most_relevant_unigrams(
    infinite_values=False,
    document_type=DocumentType.HYPERPARTISAN,
)
print(f'{hyperpartisan_most_relevant_words_not_inf_on_unigrams} \n\n')

# Non-hyperpartisan group: same two queries.
non_hyperpartisan_most_relevant_words_on_unigrams = most_relevant_unigrams(
    document_type=DocumentType.NON_HYPERPARTISAN
)
print(f'{non_hyperpartisan_most_relevant_words_on_unigrams} \n')
non_hyperpartisan_most_relevant_words_not_inf_on_unigrams = most_relevant_unigrams(
    infinite_values=False,
    document_type=DocumentType.NON_HYPERPARTISAN,
)
print(non_hyperpartisan_most_relevant_words_not_inf_on_unigrams)

#### 2.6.2. Most relevant words (on `BIGRAMS`)

In [None]:
console_output_formatter.print_section_header(
    section_header='Most relevant words extraction (on bigrams)'
)
log_odd_ratios_analyzer = LogOddRatiosAnalyzer(token_type=TokenType.BIGRAM)
most_relevant_bigrams = log_odd_ratios_analyzer.get_most_relevant_words

# Hyperpartisan group: first with the default settings, then with
# infinite_values=False.
hyperpartisan_most_relevant_words_on_bigrams = most_relevant_bigrams(
    document_type=DocumentType.HYPERPARTISAN
)
print(f'{hyperpartisan_most_relevant_words_on_bigrams} \n')
hyperpartisan_most_relevant_words_not_inf_on_bigrams = most_relevant_bigrams(
    infinite_values=False,
    document_type=DocumentType.HYPERPARTISAN,
)
print(f'{hyperpartisan_most_relevant_words_not_inf_on_bigrams} \n\n')

# Non-hyperpartisan group: same two queries.
non_hyperpartisan_most_relevant_words_on_bigrams = most_relevant_bigrams(
    document_type=DocumentType.NON_HYPERPARTISAN
)
print(f'{non_hyperpartisan_most_relevant_words_on_bigrams} \n')
non_hyperpartisan_most_relevant_words_not_inf_on_bigrams = most_relevant_bigrams(
    infinite_values=False,
    document_type=DocumentType.NON_HYPERPARTISAN,
)
print(non_hyperpartisan_most_relevant_words_not_inf_on_bigrams)

\* NOTE: Due to excessive RAM usage, some checkpoints were stored as *pickle* files and the environment was restarted 2-3 times.

## 3. Results Analysis