### CATEGORIZACIÓN Y ANÁLISIS DE SENTIMIENTOS DE ARTÍCULOS DE NOTICIAS
#### Conversión del Archivo a CSV, Determinación de Registros Nulos y Remoción de Noticias Duplicadas

In [1]:
# Se importan la librerías requeridas.
import pandas as pd

In [2]:
# Se importa el archivo de datos y se almacena en una estructura de datos dataframe.
news_categories_df = pd.read_json('./data/News_Category_Dataset_v3.json', lines=True)
news_categories_df.head(10)

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22
5,https://www.huffpost.com/entry/belk-worker-fou...,Cleaner Was Dead In Belk Bathroom For 4 Days B...,U.S. NEWS,The 63-year-old woman was seen working at the ...,,2022-09-22
6,https://www.huffpost.com/entry/reporter-gets-a...,Reporter Gets Adorable Surprise From Her Boyfr...,U.S. NEWS,"""Who's that behind you?"" an anchor for New Yor...",Elyse Wanshel,2022-09-22
7,https://www.huffpost.com/entry/puerto-rico-wat...,Puerto Ricans Desperate For Water After Hurric...,WORLD NEWS,More than half a million people remained witho...,"DÁNICA COTO, AP",2022-09-22
8,https://www.huffpost.com/entry/mija-documentar...,How A New Documentary Captures The Complexity ...,CULTURE & ARTS,"In ""Mija,"" director Isabel Castro combined mus...",Marina Fang,2022-09-22
9,https://www.huffpost.com/entry/biden-un-russia...,Biden At UN To Call Russian War An Affront To ...,WORLD NEWS,White House officials say the crux of the pres...,"Aamer Madhani, AP",2022-09-21


In [3]:
# Se determina información general del dataframe.
news_categories_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209527 entries, 0 to 209526
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   link               209527 non-null  object        
 1   headline           209527 non-null  object        
 2   category           209527 non-null  object        
 3   short_description  209527 non-null  object        
 4   authors            209527 non-null  object        
 5   date               209527 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.6+ MB


In [4]:
# Se determina la cantidad de los registros duplicados y se eliminan.
duplicated_rows = news_categories_df.duplicated()
print('La cantidad de registros duplicados es:',duplicated_rows.sum())
news_categories_df = news_categories_df.drop_duplicates()
print('Las dimensiones del dataframe después de eliminar los registros duplicados son:',news_categories_df.shape)

La cantidad de registros duplicados es: 13
Las dimensiones del dataframe después de eliminar los registros duplicados son: (209514, 6)


In [5]:
# Se determina la cantidad de registros nulos en el dataframe.
print('La cantidad de registros nulos es:\n',news_categories_df.isnull().sum())

La cantidad de registros nulos es:
 link                 0
headline             0
category             0
short_description    0
authors              0
date                 0
dtype: int64


In [6]:
# Se determina la cantidad de registros vacíos en el dataframe.
blank_news = news_categories_df[(news_categories_df['headline'] == '') & (news_categories_df['short_description'] == '')]
blank_news

Unnamed: 0,link,headline,category,short_description,authors,date
90944,https://www.huffingtonpost.com/entry/lincoln-2...,,POLITICS,,"Robert Moran, ContributorRobert Moran leads Br...",2015-08-22
103675,https://www.huffingtonpost.com/entry/us-and-eu...,,WORLDPOST,,"Natasha Srdoc, ContributorAuthor, Economist, C...",2015-03-29
109100,https://www.huffingtonpost.com/entry/disney-ce...,,BUSINESS,,"Gary Snyder, ContributorWriter and Media Strat...",2015-01-25
110153,https://www.huffingtonpost.com/entry/beverly-h...,,MEDIA,,"Gary Snyder, ContributorWriter and Media Strat...",2015-01-13
122145,https://www.huffingtonpost.com/entry/beverly-h...,,QUEER VOICES,,"Gary Snyder, ContributorWriter and Media Strat...",2014-08-28


In [8]:
# Se eliminan los registros vacíos.
def count_words(text):
    """Count the number of words in a text string"""
    if pd.isna(text) or text == '':
        return 0
    return len(str(text).split())

news_categories_df['headline_word_count'] = news_categories_df['headline'].apply(count_words)
news_categories_df['description_word_count'] = news_categories_df['short_description'].apply(count_words)

min_words = 6

very_short_text_news = news_categories_df[
    (news_categories_df['headline_word_count'] < min_words) & 
    (news_categories_df['description_word_count'] < min_words)
]

very_short_text_news_idx = very_short_text_news.index
news_categories_df = news_categories_df.drop(very_short_text_news_idx)

very_short_text_news

Unnamed: 0,link,headline,category,short_description,authors,date,headline_word_count,description_word_count


In [9]:
# Se determina la cantidad de registros por categoría.
news_categories_df.category.value_counts()

category
POLITICS          35296
WELLNESS          17938
ENTERTAINMENT     17186
TRAVEL             9831
STYLE & BEAUTY     9805
PARENTING          8789
HEALTHY LIVING     6453
QUEER VOICES       6317
FOOD & DRINK       6284
BUSINESS           5901
COMEDY             5192
SPORTS             4984
BLACK VOICES       4554
HOME & LIVING      4310
PARENTS            3932
THE WORLDPOST      3664
WEDDINGS           3652
WOMEN              3529
CRIME              3526
IMPACT             3450
DIVORCE            3426
WORLD NEWS         3297
MEDIA              2887
WEIRD NEWS         2726
GREEN              2575
RELIGION           2490
WORLDPOST          2474
STYLE              2213
SCIENCE            2172
TECH               2092
TASTE              2060
MONEY              1756
ARTS               1452
ENVIRONMENT        1443
U.S. NEWS          1377
GOOD NEWS          1373
FIFTY              1369
ARTS & CULTURE     1338
COLLEGE            1130
LATINO VOICES      1128
CULTURE & ARTS     1066
EDUCATI

In [10]:
# Se agrupan las categorías similares.
category_map = {
    'PARENTING': 'PARENTS',
    'HEALTHY LIVING': 'WELLNESS',
    'THE WORLDPOST': 'WORLD NEWS',
    'WORLDPOST': 'WORLD NEWS',
    'U.S. NEWS': 'WORLD NEWS',
    'ARTS':'CULTURE & ARTS',
    'TASTE': 'FOOD & DRINK',
    'COLLEGE': 'EDUCATION',
    'MONEY': 'BUSINESS',
    'STYLE': 'STYLE & BEAUTY',
    'GREEN' : 'ENVIRONMENT',
    'BLACK VOICES' : 'DIVERSITY VOICES',
    'LATINO VOICES': 'DIVERSITY VOICES',
    'QUEER VOICES': 'DIVERSITY VOICES',
    'WEIRD NEWS': 'VARIETY',
    'GOOD NEWS': 'VARIETY',
    'FIFTY': 'VARIETY',
}
news_categories_df['category'] = news_categories_df['category'].replace(category_map)

In [11]:
# Se determina la cantidad de registros por categoría.
news_categories_df.category.value_counts()

category
POLITICS            35296
WELLNESS            24391
ENTERTAINMENT       17186
PARENTS             12721
STYLE & BEAUTY      12018
DIVERSITY VOICES    11999
WORLD NEWS          10812
TRAVEL               9831
FOOD & DRINK         8344
BUSINESS             7657
VARIETY              5468
COMEDY               5192
SPORTS               4984
HOME & LIVING        4310
ENVIRONMENT          4018
WEDDINGS             3652
WOMEN                3529
CRIME                3526
IMPACT               3450
DIVORCE              3426
MEDIA                2887
CULTURE & ARTS       2518
RELIGION             2490
SCIENCE              2172
EDUCATION            2126
TECH                 2092
ARTS & CULTURE       1338
Name: count, dtype: int64

In [12]:
news_categories_df = news_categories_df.iloc[:, :-2]
news_categories_df

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,WORLD NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",WORLD NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTS,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,WORLD NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22
...,...,...,...,...,...,...
209522,https://www.huffingtonpost.com/entry/rim-ceo-t...,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,TECH,Verizon Wireless and AT&T are already promotin...,"Reuters, Reuters",2012-01-28
209523,https://www.huffingtonpost.com/entry/maria-sha...,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS,"Afterward, Azarenka, more effusive with the pr...",,2012-01-28
209524,https://www.huffingtonpost.com/entry/super-bow...,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS,"Leading up to Super Bowl XLVI, the most talked...",,2012-01-28
209525,https://www.huffingtonpost.com/entry/aldon-smi...,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS,CORRECTION: An earlier version of this story i...,,2012-01-28


In [14]:
news_categories_df.to_csv('./data/NoDuplicates_News_Category_Dataset.csv', index=False)