In [190]:
import json
import pandas as pd


In [191]:
# Load the scraped profile records. JSON text is UTF-8 by specification, so the
# encoding is pinned explicitly: without it, open() falls back to the platform
# locale (e.g. cp1252 on Windows) and non-ASCII names could be mis-decoded.
with open("uca.json", 'r', encoding="utf-8") as file:
    data = json.load(file)

In [192]:
def transform_authors_of_article(data):
    """Annotate every article in ``data`` with an ``"Author Count"`` entry.

    ``data`` is iterated as a sequence of profile dicts; each profile's
    ``"Article"`` value is iterated as a sequence of article dicts whose
    ``"auteur"`` value is a comma-separated string.

    The count is simply the number of comma-separated pieces in ``"auteur"``,
    so every piece counts (including an empty string, which yields 1).

    The article dicts are mutated in place; nothing is returned.
    """
    for profile in data:
        for article in profile["Article"]:
            article["Author Count"] = len(article['auteur'].split(","))

# Stamp "Author Count" onto every article (mutates `data` in place), then
# preview all articles from all profiles as a single flat table.
transform_authors_of_article(data)
pd.DataFrame([
    article
    for profile in data
    for article in profile["Article"]
])

Unnamed: 0,Nom Article,description,auteur,Annee,Author Count
0,Evapotranspiration components determined by st...,"Agricultural and forest meteorology 125 (3-4),...","DG Williams, W Cable, K Hultine, JCB Hoedjes, ...",2004,7
1,Monitoring wheat phenology and irrigation in C...,"Agricultural Water Management 79 (1), 1-27, 2006","VS B. Duchemin, R. Hadria, S. Er-raki, G. Boul...",2006,6
2,Combining FAO-56 model and ground-based remote...,"Agricultural water management 87 (1), 41-54, 2007","S Er-Raki, A Chehbouni, N Guemouria, B Duchemi...",2007,6
3,The use of high‐resolution image time series f...,International Journal of Remote Sensing 29 (1)...,"V Simonneaux, B Duchemin, D Helson, S Er‐Raki,...",2008,6
4,Performance assessment of AquaCrop model for e...,"Agricultural Water Management 163, 219-235, 2016","J Toumi, S Er-Raki, J Ezzahar, S Khabba, L Jar...",2016,6
...,...,...,...,...,...
395,Construction of a genetic linkage map for the ...,"Crop science 50 (6), 2291-2302, 2010","AZE Aabidine, J Charafi, C Grout, A Doligez, S...",2010,7
396,Assemblage of indigenous arbuscular mycorrhiza...,"Scientific Reports 11 (1), 22835, 2021","A Boutasknit, M Baslam, M Ait-El-Mokhtar, M An...",2021,6
397,Use of alginate extracted from Moroccan brown ...,"Molecules 25 (3), 720, 2020","S Bouissil, Z El Alaoui-Talibi, G Pierre, P Mi...",2020,6
398,Differential physiological and antioxidative r...,"Journal of Plant Interactions 11 (1), 30-40, 2016","A Chakhchar, M Lamaoui, S Aissam, A Ferradous,...",2016,6


In [193]:
# Add an article-count column to the profile records.
# NOTE(review): transform_article_count comes from the project-local
# transform_articles module; its return value is discarded here, so it
# presumably updates `data` in place — confirm against the helper.
from transform_articles import transform_article_count
transform_article_count(data)
# Debug preview (disabled): pd.DataFrame(data)

In [194]:
# Add a faculties column to the profile records.
# NOTE(review): transform_faculties comes from the project-local
# faculty_transform module; its return value is discarded here, so it
# presumably updates `data` in place — confirm against the helper.
from faculty_transform import transform_faculties
transform_faculties(data)
# Debug preview (disabled): pd.DataFrame(data)

In [195]:
# Profiles table: one row per profile, keeping every field except the nested
# "Article" list. Only the CSV export is active (the Excel export is disabled).
profiles = [
    {field: content for field, content in entry.items() if field != "Article"}
    for entry in data
]
df = pd.DataFrame(profiles)
df.to_csv('generated/profiles.csv', index=False)

In [196]:
def decompose_description(description):
    """Split a citation description into title, volume, issue, pages and year.

    The description is expected to look like
    ``"<journal title> <volume> (<issue>), <pages>, <year>"``. Everything
    before the first digit is taken as the title; the remainder is split on
    commas and mapped as follows:

    * 1 component  -> year
    * 2 components -> volume/issue, year
    * 3+ components -> volume/issue, pages, year (first three components)

    Args:
        description: Citation string to decompose.

    Returns:
        dict with string values under the keys ``'Title'``, ``'Volume'``,
        ``'Issue'``, ``'Pages'`` and ``'Year'``; parts that are absent are
        returned as ``""``.
    """
    first_digit_index = next((i for i, c in enumerate(description) if c.isdigit()), None)

    if first_digit_index is None:
        # No numeric part at all. Slicing with None would make both the title
        # and the "remaining info" the whole string, leaking the title into
        # Year/Volume, so return a title-only record instead.
        return {
            'Title': description.strip(),
            'Volume': "",
            'Issue': "",
            'Pages': "",
            'Year': "",
        }

    # Drop a dangling separator such as in "Some Publisher, 2019".
    journal_title = description[:first_digit_index].strip().rstrip(',').rstrip()
    remaining_info = description[first_digit_index:].strip()
    components = [item.strip() for item in remaining_info.split(',')]

    volume_issue = ""
    pages = ""
    if len(components) == 1:
        year = components[0]
    elif len(components) == 2:
        volume_issue, year = components
    else:
        volume_issue, pages, year = components[0], components[1], components[2]

    # "125 (3-4)" -> volume "125", issue "3-4". Partitioning on the first "("
    # (rather than unpacking a full split) avoids a ValueError when the text
    # holds several parenthesised groups; the issue is the content of the
    # first group.
    volume, _, issue_part = volume_issue.partition('(')
    volume = volume.strip()
    issue = issue_part.split(')')[0].strip()

    return {
        'Title': journal_title,
        'Volume': volume,
        'Issue': issue,
        'Pages': pages,
        'Year': year,
    }
    
def decompose_articles(articles):
    """Expand articles into one record per (article, author) pair.

    Each article dict must provide ``"auteur"`` (comma-separated author
    string), ``"Author"`` (the profile owner the article was listed under),
    ``"Nom Article"``, ``"Annee"`` and ``"description"``.

    The profile owner is appended to the author list when no listed author
    matches it case-insensitively. Placeholder entries containing ``"..."``
    and empty name fragments are skipped.

    Args:
        articles: Iterable of article dicts as described above.

    Returns:
        list of dicts with the keys ``"Article_Title"``, ``"Author"``,
        ``"Year"``, ``"Description"``, ``"Title"``, ``"Volume"``, ``"Issue"``
        and ``"Pages"``.
    """
    articles_decomposed = []
    for article in articles:
        owner = article["Author"]
        authors = [author.strip() for author in article["auteur"].split(",")]
        if all(name.lower() != owner.lower() for name in authors):
            authors.append(owner)

        # The description does not depend on the author: parse it once per article.
        desc = article["description"]
        information = decompose_description(desc)

        for author in authors:
            # Skip (rather than stop at) the "..." truncation marker: the
            # profile owner may have been appended after it and must still
            # get a row. Empty fragments (e.g. from a trailing comma) are not
            # authors either.
            if not author or "..." in author:
                continue
            articles_decomposed.append({
                "Article_Title": article["Nom Article"],
                "Author": author,
                "Year": article["Annee"],
                "Description": desc,
                'Title': information['Title'],
                'Volume': information['Volume'],
                'Issue': information['Issue'],
                'Pages': information['Pages'],
            })
    return articles_decomposed

In [197]:
# Articles tables: flatten profiles into articles (tagging each article with
# the name of the profile it was listed under), expand to one row per
# (article, author), then export two de-duplicated views.
articles = [
    {**article, "Author": item["Nom et Pr\u00e9nom"]}
    for item in data
    for article in item["Article"]
]
articles_decomposed = decompose_articles(articles)
df = pd.DataFrame(articles_decomposed)

# One row per (article, author) pair.
df_article_author_unique = df.drop_duplicates(
    subset=["Article_Title", "Author", "Year"],
    keep="first",
)
# One row per article; the first row encountered for each article is kept.
df_article_unique = df.drop_duplicates(
    subset=["Article_Title", "Year"],
    keep="first",
)

df_article_author_unique.to_csv('generated/article_author_unique.csv', index=False)
df_article_unique.to_csv('generated/article_unique.csv', index=False)