In [23]:
import ast

import pandas as pd

In [24]:
# Load the raw export: CSV column 0 becomes the index and "Date" is parsed as datetime.
df = pd.read_csv(
    "original_data/NYT_Dataset.csv",
    index_col=0,
    parse_dates=["Date"],
    na_values=["nan"],
)
# Keywords are stored as the string form of a list, so a length of 2 means the text is "[]".
df["no_key"] = df.keywords.map(len)
# Row count before any cleaning; used later to report how many rows were removed.
n_rows = len(df)

## Drop rows with a missing title or abstract, or an empty keyword list

In [25]:
# A row is unusable when its title or abstract is missing,
# or when its keyword string is the empty list "[]" (length 2).
missing_text = df.title.isnull() | df.abstract.isnull()
empty_keywords = df.no_key.eq(2)
nan_vals_to_drop = df.index[missing_text | empty_keywords]
df.drop(index=nan_vals_to_drop, inplace=True)
# Renumber the remaining rows from 0 and discard the old index.
df.reset_index(drop=True, inplace=True)

In [26]:
# Preview the first rows after removing the incomplete records.
df.head()

Unnamed: 0,ID,title,topic,abstract,Date,keywords,no_key
0,nyt://article/178801fe-4679-5f12-985f-8344a86e...,"In Reversal, Pakistan Welcomes Outside Help Wi...",Foreign,Pakistan’s ambassador to the U.S. said his gov...,2008-01-01 05:00:00+00:00,['Assassinations and Attempted Assassinations'...,131
1,nyt://article/21acedcb-a7f6-5131-99cf-d3a47e33...,Fighting Intensifies After Election in Kenya,Foreign,"Kenya sank deeper into trouble, with a curfew ...",2008-01-01 05:00:00+00:00,"['Kenya', 'Demonstrations and Riots', 'Odinga,...",83
2,nyt://article/357b5429-a9f8-5d33-a5eb-c013a201...,Israel: Olmert Curbs Settlements,Foreign,Prime Minister Ehud Olmert has sent a letter t...,2008-01-01 05:00:00+00:00,['West Bank'],13
3,nyt://article/619ca4ea-50e4-59e4-97bb-f206502c...,Gay Muslims Pack a Dance Floor of Their Own,Foreign,The monthly club night known as Gayhane is an ...,2008-01-01 05:00:00+00:00,"['Homosexuality', 'Islam', 'IMMIGRATION AND RE...",74
4,nyt://article/73c49a5a-bcf1-5b8f-a15a-98d29003...,Iraqi Revelers Embrace the New Year,Foreign,"But even as partygoers embraced the New Year, ...",2008-01-01 05:00:00+00:00,"['ARMAMENT, DEFENSE AND MILITARY FORCES', 'Iraq']",49


## Dropping rows whose title or abstract appears more than twice

In [27]:
# Count every distinct title and abstract once, then keep the values seen more than twice.
# NOTE(review): the "> 2" threshold keeps values that appear exactly twice - confirm this is intended.
title_counts = df.title.value_counts()
abstract_counts = df.abstract.value_counts()
titles_to_drop = title_counts[title_counts > 2].index.to_list()
abstracts_to_drop = abstract_counts[abstract_counts > 2].index.to_list()

In [28]:
# Remove every row whose title or abstract is on one of the repeated-value lists.
has_repeated_title = df.title.isin(titles_to_drop)
has_repeated_abstract = df.abstract.isin(abstracts_to_drop)
reperated_titles_abstracts = df.index[has_repeated_title | has_repeated_abstract]
df.drop(index=reperated_titles_abstracts, inplace=True)
# Renumber the remaining rows from 0 and discard the old index.
df.reset_index(drop=True, inplace=True)

In [29]:
# Preview the first rows after removing the repeated titles and abstracts.
df.head()

Unnamed: 0,ID,title,topic,abstract,Date,keywords,no_key
0,nyt://article/178801fe-4679-5f12-985f-8344a86e...,"In Reversal, Pakistan Welcomes Outside Help Wi...",Foreign,Pakistan’s ambassador to the U.S. said his gov...,2008-01-01 05:00:00+00:00,['Assassinations and Attempted Assassinations'...,131
1,nyt://article/21acedcb-a7f6-5131-99cf-d3a47e33...,Fighting Intensifies After Election in Kenya,Foreign,"Kenya sank deeper into trouble, with a curfew ...",2008-01-01 05:00:00+00:00,"['Kenya', 'Demonstrations and Riots', 'Odinga,...",83
2,nyt://article/357b5429-a9f8-5d33-a5eb-c013a201...,Israel: Olmert Curbs Settlements,Foreign,Prime Minister Ehud Olmert has sent a letter t...,2008-01-01 05:00:00+00:00,['West Bank'],13
3,nyt://article/619ca4ea-50e4-59e4-97bb-f206502c...,Gay Muslims Pack a Dance Floor of Their Own,Foreign,The monthly club night known as Gayhane is an ...,2008-01-01 05:00:00+00:00,"['Homosexuality', 'Islam', 'IMMIGRATION AND RE...",74
4,nyt://article/73c49a5a-bcf1-5b8f-a15a-98d29003...,Iraqi Revelers Embrace the New Year,Foreign,"But even as partygoers embraced the New Year, ...",2008-01-01 05:00:00+00:00,"['ARMAMENT, DEFENSE AND MILITARY FORCES', 'Iraq']",49


In [30]:
# Report what share of the originally loaded rows the cleaning steps removed.
final_rows = df.shape[0]
deleted_rows = n_rows - final_rows
# Guard against an empty input file, where the ratio would divide by zero.
per_del = round(deleted_rows / n_rows, 3) if n_rows else 0.0
# Let the percent format specifier do the scaling: multiplying the float by 100
# can print artefacts (e.g. 0.07 * 100 displays as 7.000000000000001).
print(f"{per_del:.1%} of the rows has been deleted")

6.1% of the rows has been deleted


## Transforming the keywords into a list

In [31]:
def get_keywords(text: str):
    """
    Convert the string form of a keyword list into a list of keywords.

    The input is expected to look like a Python list literal, e.g.
    "['Kenya', 'Odinga, Raila']". It is parsed as a literal so that commas
    and apostrophes inside a keyword are preserved instead of being treated
    as separators or stripped out.

    Parameters
    ----------
    text : str
        Keyword list serialised as a string.

    Returns
    -------
    list of str
        The keywords with surrounding whitespace removed. Blank entries are
        discarded, so "[]" yields an empty list.
    """
    try:
        parsed = ast.literal_eval(text.strip())
    except (ValueError, SyntaxError, TypeError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        candidates = [str(item) for item in parsed]
    else:
        # Not a list literal (e.g. unquoted "Kenya, Iraq"): fall back to the
        # plain comma split, which cannot tell separators from inner commas.
        unwrapped = text.replace("[", "").replace("'", "").replace("]", "")
        candidates = unwrapped.split(",")

    # Delete \n or space around words: [" jaja", " lol"] -> ["jaja", "lol"]
    return [item.strip() for item in candidates if item.strip()]

In [32]:
# Replace each keyword string with its parsed list.
# NOTE: run once per load - afterwards the column holds lists, which get_keywords does not accept.
df["keywords"] = df["keywords"].map(get_keywords)

In [33]:
# Ensure the free-text columns are plain Python strings.
text_columns = ["title", "topic", "abstract"]
df[text_columns] = df[text_columns].astype(str)

In [34]:
# The keyword-length helper column was only needed for filtering; remove it before saving.
df.drop(columns="no_key", inplace=True)

In [35]:
# Persist the cleaned dataset to the working directory.
# NOTE(review): CSV stores each keyword list as its string form, so anything reading this
# file gets strings back, not lists - confirm whether the JSON export below was intended.
# NOTE(review): the index is written too (to_csv default), so the file gains an unnamed first column.
#df.to_json("NYT_Dataset.json")
df.to_csv("NYT_Dataset.csv")

## Testing the datasets library with the new dataset

In [36]:
from datasets import load_dataset

In [37]:
# Load the saved CSV through the datasets library's generic "csv" builder.
data = load_dataset(path="csv", data_files="NYT_Dataset.csv")

Using custom data configuration default-5135cb3c75baf478


Downloading and preparing dataset csv/default to /Users/jaime/.cache/huggingface/datasets/csv/default-5135cb3c75baf478/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /Users/jaime/.cache/huggingface/datasets/csv/default-5135cb3c75baf478/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]