In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import TweetTokenizer
import matplotlib.pyplot as plt
from tqdm import tqdm
import csv

In [2]:
# Training articles, read from the comma-separated dump.
raw_articles = pd.read_csv(
    '../dataset/data_articles_train.csv',
    delimiter=',',
)

In [3]:
# Number of rows (articles) in the training set.
print(f"total of articles = {len(raw_articles)}")

total of articles = 204107


In [4]:
# Column overview: non-null counts and dtypes (shows which columns have missing values).
raw_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204107 entries, 0 to 204106
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   full_text         204107 non-null  object
 1   keywords          204107 non-null  object
 2   meta_description  204107 non-null  object
 3   meta_keywords     139997 non-null  object
 4   page_id           204107 non-null  int64 
 5   page_title        204107 non-null  object
 6   summary           204107 non-null  object
 7   tags              46867 non-null   object
dtypes: int64(1), object(7)
memory usage: 12.5+ MB


## TRAIN DATA

In [None]:
# Word counts (number of TweetTokenizer tokens) per article for every text
# column of the training set. Each list receives one entry per row of
# raw_articles, in row order.
title_lenght = []
full_text_lenght = []
meta_description_lenght = []
summary_lenght = []
keywords_lenght = []
meta_keywords_lenght = []
tags_lenght = []

tknzr = TweetTokenizer()

# Missing cells are replaced by '' before the cast to str: str(NaN) is the
# literal 'nan', which would be tokenized as one word and distort the
# statistics of the sparsely populated columns (meta_keywords, tags).
text_columns = ['page_title', 'full_text', 'meta_description', 'summary',
                'keywords', 'meta_keywords', 'tags']
articles_text = raw_articles[text_columns].fillna('').astype(str)

# total= gives tqdm a known length (iterrows() is a generator without len()),
# so it can display a real progress bar and ETA.
for i, row in tqdm(articles_text.iterrows(), total=len(articles_text)):

    title = row['page_title']
    full_text = row['full_text']
    meta_description = row['meta_description']
    summary = row['summary']
    keywords = row['keywords']
    meta_keywords = row['meta_keywords']
    tags = row['tags']

    # An empty string tokenizes to an empty list, i.e. a count of 0.
    lenght_title = len(tknzr.tokenize(title))
    lenght_full_text = len(tknzr.tokenize(full_text))
    lenght_meta_description = len(tknzr.tokenize(meta_description))
    lenght_summary = len(tknzr.tokenize(summary))
    lenght_keywords = len(tknzr.tokenize(keywords))
    lenght_meta_keywords = len(tknzr.tokenize(meta_keywords))
    lenght_tags = len(tknzr.tokenize(tags))

    title_lenght.append(lenght_title)
    full_text_lenght.append(lenght_full_text)
    meta_description_lenght.append(lenght_meta_description)
    summary_lenght.append(lenght_summary)
    keywords_lenght.append(lenght_keywords)
    meta_keywords_lenght.append(lenght_meta_keywords)
    tags_lenght.append(lenght_tags)

# Article Title Analysis

In the next code blocks we analyze the number of words in the article title

In [None]:
# Descriptive statistics of the per-article title word counts.
print('title analysis:')
print(f"max words = {np.max(title_lenght)}")
print(f"min words = {np.min(title_lenght)}")
print(f"avg words = {np.mean(title_lenght)}")
print(f"std words = {np.std(title_lenght)}")

In [None]:
# Histogram of title word counts: unit-width bins with edges 0..34,
# log-scaled y axis; the figure is also written to a PDF file.
fig, ax = plt.subplots()
ax.hist(title_lenght, bins=range(0, 35))
ax.set_title('total of words in articles title')
ax.set_xlabel('total of words')
ax.set_ylabel('total of articles (log)')
ax.set_yscale('log')
fig.savefig('article_title1.pdf')
plt.show()

# Article Text Analysis

In the next code blocks we analyze the number of words in the article text

In [None]:
# Descriptive statistics of the per-article full-text word counts.
# These must be computed on the list full_text_lenght (one count per article);
# lenght_full_text is only the scalar count of the last row visited by the
# counting loop, which would make max/min/avg identical and std equal to 0.
print('text analysis:')
print("max words = "+str(np.max(full_text_lenght)))
print("min words = "+str(np.min(full_text_lenght)))
print("avg words = "+str(np.mean(full_text_lenght)))
print("std words = "+str(np.std(full_text_lenght)))

In [None]:
# Histogram of full-text word counts: unit-width bins with edges 0..299,
# log-scaled y axis.
fig, ax = plt.subplots()
ax.hist(full_text_lenght, bins=range(0, 300))
ax.set_title('total of words in articles full text')
ax.set_xlabel('total of words')
ax.set_ylabel('total of articles (log)')
ax.set_yscale('log')
plt.show()

# Article Meta Description Analysis

In [None]:
# Descriptive statistics of the per-article meta-description word counts.
print('meta description analysis:')
print(f"max words = {np.max(meta_description_lenght)}")
print(f"min words = {np.min(meta_description_lenght)}")
print(f"avg words = {np.mean(meta_description_lenght)}")
print(f"std words = {np.std(meta_description_lenght)}")

In [None]:
# Histogram of meta-description word counts: unit-width bins with edges
# 0..2212, log-scaled y axis.
fig, ax = plt.subplots()
ax.hist(meta_description_lenght, bins=range(0, 2213))
ax.set_title('total of words in articles meta description')
ax.set_xlabel('total of words')
ax.set_ylabel('total of articles (log)')
ax.set_yscale('log')
plt.show()

# Article Summary Analysis

In [None]:
# Descriptive statistics of the per-article summary word counts.
print('summary analysis:')
print(f"max words = {np.max(summary_lenght)}")
print(f"min words = {np.min(summary_lenght)}")
print(f"avg words = {np.mean(summary_lenght)}")
print(f"std words = {np.std(summary_lenght)}")

In [None]:
# Histogram of summary word counts: unit-width bins with edges 0..749,
# log-scaled y axis.
fig, ax = plt.subplots()
ax.hist(summary_lenght, bins=range(0, 750))
ax.set_title('total of words in articles summary')
ax.set_xlabel('total of words')
ax.set_ylabel('total of articles (log)')
ax.set_yscale('log')
plt.show()

# Article Keywords Analysis

In [None]:
# Descriptive statistics of the per-article keywords word counts.
print('keywords analysis:')
print(f"max words = {np.max(keywords_lenght)}")
print(f"min words = {np.min(keywords_lenght)}")
print(f"avg words = {np.mean(keywords_lenght)}")
print(f"std words = {np.std(keywords_lenght)}")

In [None]:
# Histogram of keywords word counts: unit-width bins with edges 0..34,
# log-scaled y axis.
fig, ax = plt.subplots()
ax.hist(keywords_lenght, bins=range(0, 35))
ax.set_title('total of words in articles keywords')
ax.set_xlabel('total of words')
ax.set_ylabel('total of articles (log)')
ax.set_yscale('log')
plt.show()

# Article Meta Keywords Analysis

In [None]:
# Descriptive statistics of the per-article meta-keywords word counts.
print('meta keywords analysis:')
print(f"max words = {np.max(meta_keywords_lenght)}")
print(f"min words = {np.min(meta_keywords_lenght)}")
print(f"avg words = {np.mean(meta_keywords_lenght)}")
print(f"std words = {np.std(meta_keywords_lenght)}")

In [None]:
# Histogram of meta-keywords word counts: unit-width bins with edges 0..34,
# log-scaled y axis.
fig, ax = plt.subplots()
ax.hist(meta_keywords_lenght, bins=range(0, 35))
ax.set_title('total of words in articles meta keywords')
ax.set_xlabel('total of words')
ax.set_ylabel('total of articles (log)')
ax.set_yscale('log')
plt.show()

# Article Tags Analysis

In [None]:
# Descriptive statistics of the per-article tags word counts.
print('meta tags analysis:')
print(f"max words = {np.max(tags_lenght)}")
print(f"min words = {np.min(tags_lenght)}")
print(f"avg words = {np.mean(tags_lenght)}")
print(f"std words = {np.std(tags_lenght)}")

In [None]:
# Histogram of tags word counts: unit-width bins with edges 0..1679,
# log-scaled y axis.
fig, ax = plt.subplots()
ax.hist(tags_lenght, bins=range(0, 1680))
ax.set_title('total of words in articles tags')
ax.set_xlabel('total of words')
ax.set_ylabel('total of articles (log)')
ax.set_yscale('log')
plt.show()

## TEST DATA

In [None]:
# Test articles, read from the comma-separated dump.
test_articles = pd.read_csv(
    '../dataset/data_articles_test.csv',
    delimiter=',',
)

In [None]:
# Word counts (number of TweetTokenizer tokens) per test article for the
# title, meta description and keywords columns. Each list receives one entry
# per row of test_articles, in row order.
TEST_title_lenght = []
TEST_meta_description_lenght = []
TEST_keywords_lenght = []

tknzr = TweetTokenizer()

# The tokenizer needs str input, so every cell is cast to str. Missing cells
# are replaced by '' first: a raw NaN is not a string, and str(NaN) would be
# the literal 'nan', i.e. counted as one word.
test_text_columns = ['page_title', 'meta_description', 'keywords']
test_text = test_articles[test_text_columns].fillna('').astype(str)

# total= gives tqdm a known length (iterrows() is a generator without len()),
# so it can display a real progress bar and ETA.
for i, row in tqdm(test_text.iterrows(), total=len(test_text)):

    title = row['page_title']
    meta_description = row['meta_description']
    keywords = row['keywords']

    # An empty string tokenizes to an empty list, i.e. a count of 0.
    lenght_title = len(tknzr.tokenize(title))
    lenght_meta_description = len(tknzr.tokenize(meta_description))
    lenght_keywords = len(tknzr.tokenize(keywords))

    TEST_title_lenght.append(lenght_title)
    TEST_meta_description_lenght.append(lenght_meta_description)
    TEST_keywords_lenght.append(lenght_keywords)

In [None]:
# Descriptive statistics of the per-article title word counts (test set).
print('title analysis:')
print(f"max words = {np.max(TEST_title_lenght)}")
print(f"min words = {np.min(TEST_title_lenght)}")
print(f"avg words = {np.mean(TEST_title_lenght)}")
print(f"std words = {np.std(TEST_title_lenght)}")

In [None]:
# Histogram of test-set title word counts: unit-width bins with edges 0..24,
# log-scaled y axis.
plt.hist(TEST_title_lenght, range(0,25))
plt.title('total of words in articles title')
plt.xlabel('total of words')
plt.ylabel('total of articles (log)')
plt.yscale('log')
# Saved under its own file name: 'article_title1.pdf' is already written by the
# training-set title histogram and would otherwise be overwritten here.
plt.savefig('article_title1_test.pdf')
plt.show()

In [None]:
# Descriptive statistics of the per-article meta-description word counts (test set).
print('meta description analysis:')
print(f"max words = {np.max(TEST_meta_description_lenght)}")
print(f"min words = {np.min(TEST_meta_description_lenght)}")
print(f"avg words = {np.mean(TEST_meta_description_lenght)}")
print(f"std words = {np.std(TEST_meta_description_lenght)}")

In [None]:
# Histogram of test-set meta-description word counts: unit-width bins with
# edges 0..771, log-scaled y axis.
fig, ax = plt.subplots()
ax.hist(TEST_meta_description_lenght, bins=range(0, 772))
ax.set_title('total of words in articles meta description')
ax.set_xlabel('total of words')
ax.set_ylabel('total of articles (log)')
ax.set_yscale('log')
plt.show()

In [None]:
# Descriptive statistics of the per-article keywords word counts (test set).
print('keywords analysis:')
print(f"max words = {np.max(TEST_keywords_lenght)}")
print(f"min words = {np.min(TEST_keywords_lenght)}")
print(f"avg words = {np.mean(TEST_keywords_lenght)}")
print(f"std words = {np.std(TEST_keywords_lenght)}")

In [None]:
# Histogram of test-set keywords word counts: unit-width bins with edges
# 0..24, log-scaled y axis.
fig, ax = plt.subplots()
ax.hist(TEST_keywords_lenght, bins=range(0, 25))
ax.set_title('total of words in articles keywords')
ax.set_xlabel('total of words')
ax.set_ylabel('total of articles (log)')
ax.set_yscale('log')
plt.show()

# BERT Token Analysis

In [5]:
# Tokenizer for the "bert-base-cased" checkpoint; used by the token-count cells that follow.
from transformers import BertTokenizer
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [7]:
# BERT token counts per training article for the title, meta description and
# keywords columns (one entry per row of raw_articles). These names replace
# the word-count lists of the same name built earlier in the notebook.
title_lenght, meta_description_lenght, keywords_lenght = [], [], []

for _, article in tqdm(raw_articles.iterrows()):
    # Cells are cast to str before tokenizing.
    title_tokens = bert_tokenizer.tokenize(str(article['page_title']))
    meta_description_tokens = bert_tokenizer.tokenize(str(article['meta_description']))
    keywords_tokens = bert_tokenizer.tokenize(str(article['keywords']))

    title_lenght.append(len(title_tokens))
    meta_description_lenght.append(len(meta_description_tokens))
    keywords_lenght.append(len(keywords_tokens))
    

204107it [05:32, 614.60it/s]


In [8]:
# Descriptive statistics of the per-article title token counts.
print('title tokens analysis analysis:')
print(f"max words = {np.max(title_lenght)}")
print(f"min words = {np.min(title_lenght)}")
print(f"avg words = {np.mean(title_lenght)}")
print(f"std words = {np.std(title_lenght)}")

title tokens analysis analysis:
max words = 55
min words = 1
avg words = 10.085411083402333
std words = 4.971531406496229


In [9]:
# Descriptive statistics of the per-article meta-description token counts.
print('meta-description tokens analysis analysis:')
print(f"max words = {np.max(meta_description_lenght)}")
print(f"min words = {np.min(meta_description_lenght)}")
print(f"avg words = {np.mean(meta_description_lenght)}")
print(f"std words = {np.std(meta_description_lenght)}")

meta-description tokens analysis analysis:
max words = 3374
min words = 1
avg words = 27.660006761159586
std words = 40.49967109947301


In [10]:
# Descriptive statistics of the per-article keywords token counts.
print('keywords tokens analysis analysis:')
print(f"max words = {np.max(keywords_lenght)}")
print(f"min words = {np.min(keywords_lenght)}")
print(f"avg words = {np.mean(keywords_lenght)}")
print(f"std words = {np.std(keywords_lenght)}")

keywords tokens analysis analysis:
max words = 107
min words = 1
avg words = 20.888303683852097
std words = 5.0942223999569
