In [33]:
import json
from pathlib import Path

import pandas as pd


In [34]:
# Load the raw profiles. The encoding is pinned to UTF-8 (the JSON standard
# encoding) so the file decodes the same way regardless of the platform's
# default locale encoding.
with open("uca.json", encoding="utf-8") as file:
    data = json.load(file)

In [35]:
def transform_authors_of_article(data):
    """Add an "Author Count" field to every article, in place.

    Args:
        data: Iterable of dicts, each holding an "Article" list of dicts that
            carry a comma-separated "auteur" string.

    Returns:
        None. Each article dict is mutated to include "Author Count".
    """
    for item in data:
        for article in item["Article"]:
            # Ignore blank entries so an empty author string counts as 0 and a
            # trailing comma does not add a phantom author.
            names = [name for name in article['auteur'].split(",") if name.strip()]
            article["Author Count"] = len(names)

# Annotate every article with its author count, then show all articles of all
# profiles as one flat table.
transform_authors_of_article(data)
pd.DataFrame([
    article
    for item in data
    for article in item["Article"]
])

Unnamed: 0,Nom Article,description,auteur,Annee,cite par,Author Count
0,Evapotranspiration components determined by st...,"Agricultural and forest meteorology 125 (3-4),...","DG Williams, W Cable, K Hultine, JCB Hoedjes, ...",2004,570,7
1,Monitoring wheat phenology and irrigation in C...,"Agricultural Water Management 79 (1), 1-27, 2006","VS B. Duchemin, R. Hadria, S. Er-raki, G. Boul...",2006,515,6
2,Combining FAO-56 model and ground-based remote...,"Agricultural water management 87 (1), 41-54, 2007","S Er-Raki, A Chehbouni, N Guemouria, B Duchemi...",2007,328,6
3,The use of high‐resolution image time series f...,International Journal of Remote Sensing 29 (1)...,"V Simonneaux, B Duchemin, D Helson, S Er‐Raki,...",2008,238,6
4,Performance assessment of AquaCrop model for e...,"Agricultural Water Management 163, 219-235, 2016","J Toumi, S Er-Raki, J Ezzahar, S Khabba, L Jar...",2016,161,6
...,...,...,...,...,...,...
395,Characteristics of cellulose microfibers and n...,"Cellulose 28, 4089-4103, 2021","A Bahloul, Z Kassab, F Aziz, H Hannache, R Bou...",2021,24,7
396,Engineering of highly Brachychiton populneus s...,"Journal of Molecular Liquids 346, 117092, 2022","K Aziz, F Aziz, R Mamouni, L Aziz, N Saffaj",2022,23,5
397,Pathogens evolution during the composting of t...,"Waste and Biomass Valorization 11, 1789-1797, ...","K Atif, A Haouas, F Aziz, MY Jamali, A Tallou,...",2020,22,6
398,Can the application of graphene oxide contribu...,Current Research in Pharmacology and Drug Disc...,"LGMSUAF Asmaa Rhazouani, Khalid Aziz, Halima G...",2021,21,3


In [36]:
# Collapse each profile's speciality list into one comma-separated string.
# The isinstance guard keeps the cell idempotent: joining an already-joined
# string would interleave ", " between its individual characters.
for item in data:
    specialities = item["Spciality"]
    if not isinstance(specialities, str):
        item["Spciality"] = ", ".join(specialities)

In [37]:
# Add an article-count column to the profiles.
# NOTE(review): transform_article_count comes from the project-local
# transform_articles module, which is not shown here; since its return value
# is discarded it presumably mutates `data` in place — confirm in that module.
from transform_articles import transform_article_count
transform_article_count(data)
# Debug preview (uncomment to inspect the result):
# pd.DataFrame(data)

In [38]:
# Add a faculties column to the profiles.
# NOTE(review): transform_faculties comes from the project-local
# faculty_transform module, which is not shown here; since its return value
# is discarded it presumably mutates `data` in place — confirm in that module.
from faculty_transform import transform_faculties
transform_faculties(data)
# Debug preview (uncomment to inspect the result):
# pd.DataFrame(data)

In [39]:
# Create the profiles table: one row per entry of `data`, with the nested
# "Article" list left out.
profiles = [
    {key: value for key, value in item.items() if key != "Article"}
    for item in data
]
df = pd.DataFrame(profiles)

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (a fresh checkout would otherwise fail here).
Path('generated').mkdir(parents=True, exist_ok=True)
# df.to_excel('generated/profiles.xlsx', index=False)
df.to_csv('generated/profiles.csv', index=False)

In [40]:
def decompose_description(description):
    """Split a citation description into journal, volume, number, pages and year.

    The description is expected to look like
    ``"<journal> <volume> (<number>), <pages>, <year>"``. Everything before the
    first digit is taken as the journal name; the remainder is split on commas.

    Args:
        description: Raw citation string.

    Returns:
        dict with string values under 'Revue', 'Volume', 'Number', 'Pages' and
        'Year'. Parts that cannot be found are empty strings.
    """
    first_digit_index = next(
        (i for i, c in enumerate(description) if c.isdigit()), None
    )
    if first_digit_index is None:
        # No numeric part at all. Slicing with None would otherwise copy the
        # whole description into both the journal name and the year.
        return {
            'Revue': description.strip(),
            'Volume': "",
            'Number': "",
            'Pages': "",
            'Year': "",
        }

    article_revue = description[:first_digit_index].strip()
    components = [
        part.strip() for part in description[first_digit_index:].split(',')
    ]

    volume_number = ""
    pages = ""
    if len(components) == 1:
        # Only a year follows the journal name.
        year = components[0]
    elif len(components) == 2:
        volume_number, year = components
    else:
        # Components beyond the third are ignored.
        volume_number, pages, year = components[:3]

    # partition() splits on the first "(" only, so a volume part with several
    # parentheses no longer breaks tuple unpacking.
    volume, _, number = volume_number.partition('(')
    return {
        'Revue': article_revue,
        'Volume': volume.strip(),
        'Number': number.strip().replace(")", ""),
        'Pages': pages,
        'Year': year,
    }
    
def decompose_articles(articles):
    """Flatten articles into one record per (article, author) pair.

    Args:
        articles: Iterable of dicts with the keys "auteur" (comma-separated
            author names), "Author" (the profile owner), "Nom Article",
            "Annee" and "description".

    Returns:
        list of dicts with the keys "Article_Title", "Author", "Year",
        "Description", "Revue", "Volume", "Number" and "Pages".
    """
    articles_decomposed = []
    for article in articles:
        owner = article["Author"]
        authors = [name.strip() for name in article["auteur"].split(",")]
        # Make sure the profile owner is listed even when the author string
        # does not contain the owner's name verbatim.
        if all(name.lower() != owner.lower() for name in authors):
            authors.append(owner)

        # The description is the same for every author of the article, so it
        # is parsed once per article instead of once per author.
        desc = article["description"]
        information = decompose_description(desc)

        for author in authors:
            if "..." in author:
                # "..." marks a truncated author list. Skip the marker but keep
                # iterating: the owner appended above sits after it and would
                # be lost if the loop stopped here.
                continue
            articles_decomposed.append({
                "Article_Title": article["Nom Article"],
                "Author": author,
                "Year": article["Annee"],
                "Description": desc,
                'Revue': information['Revue'],
                'Volume': information['Volume'],
                'Number': information['Number'],
                'Pages': information['Pages'],
            })
    return articles_decomposed

In [41]:
# Create the articles tables. Each article is tagged with the name of the
# profile it was listed under, then expanded to one row per author.
articles = [
    article | {"Author": item["Nom et Pr\u00e9nom"]}
    for item in data
    for article in item["Article"]
]
articles_decomposed = decompose_articles(articles)
df = pd.DataFrame(articles_decomposed)

# One row per (article, author) pair, and one row per article.
df_article_author_unique = df.drop_duplicates(subset=["Article_Title", "Author", "Year"], keep="first")
df_article_unique = df.drop_duplicates(subset=["Article_Title", "Year"], keep="first")

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (a fresh checkout would otherwise fail here).
Path('generated').mkdir(parents=True, exist_ok=True)
df_article_author_unique.to_csv('generated/article_author_unique.csv', index=False)
df_article_unique.to_csv('generated/article_unique.csv', index=False)