# Création de dataset et uniformisation des données (cours 3)

In [35]:
"""
This script creates a dataset based on the two CSV files containing the Moldovan and Romanian news articles, 
scraped using the scripts moldovan_scraping.py and romanian_scraping.py, respectively. The dataset is saved as a CSV file.

Firstly, the two CSV files are turned into panda datasets. 
Then a column is added to the dataset, containing the country of origin of each article, as the dataset combines the two CSVs.
Finally, the categories of the articles are uniformised, so that the Moldovan and Romanian categories are the same and the dataset is saved as a CSV file.

Modules:
- pandas - for data manipulation

Functions:
- combine_csvs - combines the two CSV files into a single dataset and adds the country column
- uniformise_categories - uniformises the categories of the articles

Variables:
- ro_csv_path - path to the Romanian articles CSV
- md_csv_path - path to the Moldovan articles CSV
- output_csv_path - path to the output CSV file

"""

ro_csv_path = "/Users/madalina/Documents/M1TAL/outils_trait_corpus/constitution-corpus/data/clean/romanian_news.csv"
md_csv_path = "/Users/madalina/Documents/M1TAL/outils_trait_corpus/constitution-corpus/data/clean/moldovan_news.csv"
output_csv_path = "/Users/madalina/Documents/M1TAL/outils_trait_corpus/constitution-corpus/data/clean/news_combined.csv"

import pandas as pd


In [36]:
""" 
This function combines the two CSV files into a single dataset and adds a column for the country of origin.

Parameters:
- ro_csv_path - path to the Romanian news CSV
- md_csv_path - path to the Moldovan news CSV

Returns:
- dataset - a dataset containing the articles from both CSVs, with a column for the country of origin
"""
def combine_csvs(ro_csv_path, md_csv_path):


    # Load the CSVs
    ro = pd.read_csv(ro_csv_path)
    md = pd.read_csv(md_csv_path)

    # Add a column for the country of origin
    ro["country"] = "Romania"
    md["country"] = "Moldova"

    # Concatenate the two dataframes
    dataset = pd.concat([ro, md], ignore_index=True)

    return dataset



In [41]:
"""
This function uniformises the categories of the articles, so that the Moldovan and Romanian categories are the same.
The various categories are placed into 6 categories: 0 - culture, 1 - world, 2 - politics, 3 - economy, 4 - society, 5 - technology

Parameters:
- dataset - the dataset containing the articles from both CSVs, with a column for the country of origin (output of combine_csvs)

Returns:
- dataset - the dataset with the categories uniformised
"""

def uniformise_categories(dataset):
    dataset["category"] = dataset["category"].replace({"stiri-auto": "5"})
    dataset["category"] = dataset["category"].replace({"sport": "4"})
    dataset["category"] = dataset["category"].replace({"societate": "4"})
    dataset["category"] = dataset["category"].replace({"economie":"3"})
    dataset["category"] = dataset["category"].replace({"capitala":"2"})
    dataset["category"] = dataset["category"].replace({"politica":"2"})
    dataset["category"] = dataset["category"].replace({"business":"3"})
    dataset["category"] = dataset["category"].replace({"in-lume":"1"})
    dataset["category"] = dataset["category"].replace({"afis":"0"})
    dataset["category"] = dataset["category"].replace({"sanatate":"4"})
    dataset["category"] = dataset["category"].replace({"testeaza-cunostintiile":"0"})
    dataset["category"] = dataset["category"].replace({"externe":"1"})
    dataset["category"] = dataset["category"].replace({"politic":"2"})
    dataset["category"] = dataset["category"].replace({"economic":"3"})
    dataset["category"] = dataset["category"].replace({"social":"4"})
    dataset["category"] = dataset["category"].replace({"tehnologie":"5"})
    dataset["category"] = dataset["category"].replace({"cultura-media":"0"})
    dataset["category"] = dataset["category"].replace({"editorialistii":"0"})
    dataset["category"] = dataset["category"].replace({"life-entertaiment":"4"})

    dataset.to_csv(output_csv_path, index=False)
    
    return dataset
    

In [43]:
# Load both CSVs into one frame tagged by country of origin.
dataset = combine_csvs(ro_csv_path=ro_csv_path, md_csv_path=md_csv_path)
# Uniformise the categories and save the CSV; as the last expression of the
# cell, the returned frame is what the notebook displays.
uniformise_categories(dataset=dataset)


Unnamed: 0,id,category,text,country
0,22329753,1,"Secretarul $NE$ de stat a $NE$ , Janet Yellen ...",Romania
1,22329745,2,"Cătălin Cîrstoiu a vorbit , la Antena 3 CNN , ...",Romania
2,22329472,2,"„ Vă spun ceva , vom câştiga alegerile . Vom s...",Romania
3,22329360,3,"$NE$ $NE$ $NE$ , din cadrul grupului Intrakat ...",Romania
4,22329172,3,$NE$ Europeană a aprobat reintroducerea unei s...,Romania
...,...,...,...,...
140,745491,1,"În acest timp , numărul de trunchiuri îmbrăț...",Moldova
141,497838,4,Vă prezentăm în continuare analizele recoman...,Moldova
142,745490,1,Echipa de arheologi a examinat mormîntul și ...,Moldova
143,473567,4,În $NE$ rind este nevoie să vă calcula norma...,Moldova
