In [None]:
# Script that reads the CSV and queries the Crossref API to obtain the DOIs of the listed references
# Version 1.0

In [5]:
%%time # Mide el tiempo de ejecución del bloque entero
import pandas as pd
import requests
import time
import re
from urllib.parse import quote_plus

# Contact e-mail sent as the `mailto` parameter of every Crossref request
# (set it to a real address when using the "polite" version of the API).
correo = "correo@dominio.com"

# Punctuation stripped from titles before they are sent as a query; compiled
# once instead of on every loop iteration.
_TITLE_PUNCTUATION = re.compile(r'[!?\',+-]')

_CROSSREF_WORKS_URL = "https://api.crossref.org/works"
_REQUEST_TIMEOUT = 30          # seconds; without it a stalled connection blocks forever
_REQUEST_INTERVAL = 0.025      # seconds to wait between consecutive lookups
_RATE_LIMIT_PAUSE = 10         # seconds to back off after an HTTP 429
_MAX_RATE_LIMIT_RETRIES = 5    # retries of one lookup before giving up on it


def _split_cell(value):
    """Return the newline-separated parts of a CSV cell as stripped strings.

    Missing cells (NaN/None) yield an empty list. Numeric cells are accepted
    too: pandas parses a column holding only single years as numbers, and an
    integral float such as 2020.0 is rendered as "2020".
    """
    if pd.isna(value):
        return []
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return [part.strip() for part in str(value).split('\n')]


def _fetch_doi(session, title, year, label):
    """Query Crossref for one title and return the DOI of the first hit, or None.

    Args:
        session: ``requests.Session`` used to send the request.
        title: Already cleaned, non-empty title text (URL-encoded by requests).
        year: Publication year used as the ``from-pub-date`` filter; the filter
            is omitted when the year is empty.
        label: Row identifier used only as a prefix in printed messages.

    A 429 response is retried (after a pause) up to ``_MAX_RATE_LIMIT_RETRIES``
    times; any other failure is reported and results in ``None``.
    """
    params = {'query.title': title, 'rows': 1, 'mailto': correo}
    if year:
        params['filter'] = f"from-pub-date:{year}"

    for _ in range(_MAX_RATE_LIMIT_RETRIES + 1):
        try:
            response = session.get(_CROSSREF_WORKS_URL, params=params, timeout=_REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(f"{label} Request failed: {e}")
            return None

        if response.status_code == 429:
            # Rate limited: back off, then repeat this same request.
            print('RATE LIMIT')
            time.sleep(_RATE_LIMIT_PAUSE)
            continue

        if response.status_code != 200:
            print(f"{label} Unexpected status code: {response.status_code}")
            return None

        try:
            items = response.json()['message']['items']
        except (ValueError, KeyError, TypeError) as e:
            print(f"{label} Unexpected response body: {e}")
            return None

        # The first result is assumed to be the most relevant one.
        return items[0].get('DOI') if items else None

    print(f"{label} Giving up after repeated rate limiting")
    return None


def search_dois_for_aligned_entries(csv_path):
    """Look up a DOI for every title/year entry of a CSV and save the result.

    The CSV must have ``Title`` and ``Year`` columns. A cell may hold several
    newline-separated entries; titles and years are paired positionally (extra
    entries on either side are ignored). Each pair is searched in the Crossref
    API and the DOI of the first hit is collected.

    The DOIs found for a row are stored comma-separated in a new ``DOIs``
    column (``None`` when nothing was found). Entries without a match are
    skipped, so the list may be shorter than the row's list of titles.

    Args:
        csv_path: Path (string) of the CSV file to process.

    Returns:
        Path of the CSV written with the extra column: the input path with
        its ``.csv`` suffix replaced by ``_with_dois.csv``.
    """
    df = pd.read_csv(csv_path, quotechar='"', escapechar='\\', delimiter=',', on_bad_lines='warn')

    doi_column = []
    # One session for the whole run so the HTTP connection is reused.
    with requests.Session() as session:
        for index, raw_titles, raw_years in zip(df.index, df['Title'], df['Year']):
            dois = []

            for title, year in zip(_split_cell(raw_titles), _split_cell(raw_years)):
                cleaned_title = _TITLE_PUNCTUATION.sub('', title).strip()
                if not cleaned_title:
                    # Nothing to search for; an empty query would match arbitrary works.
                    continue

                doi = _fetch_doi(session, cleaned_title, year, index)
                if doi:
                    dois.append(doi)
                    print(f"{index} DONE")

                # Short pause between requests to stay under the rate limit.
                time.sleep(_REQUEST_INTERVAL)

            doi_column.append(','.join(dois) if dois else None)

    df['DOIs'] = doi_column

    # Only strip a trailing ".csv": a blind str.replace would also rewrite
    # ".csv" inside directory names and, when the suffix is absent, would
    # produce the input path itself and overwrite the source file.
    base_path = csv_path[:-len('.csv')] if csv_path.lower().endswith('.csv') else csv_path
    output_path = f"{base_path}_with_dois.csv"
    df.to_csv(output_path, index=False)

    return output_path

# Run the lookup on the CSV to process; the call returns the path of the CSV written with the DOIs column
search_dois_for_aligned_entries('cleaned_merged.csv')  # Input CSV path; change it to process a different file


0 DONE
1 DONE
2 DONE
3 DONE
4 DONE
4 DONE
4 DONE
4 DONE
4 DONE
4 DONE
5 DONE
5 DONE
6 DONE
6 DONE
6 DONE
7 DONE
8 DONE
8 DONE
9 DONE
10 DONE
11 DONE
12 DONE
13 DONE
14 DONE
14 DONE
15 DONE
16 DONE
17 DONE
17 DONE
17 DONE
18 DONE
19 DONE
19 DONE
19 DONE
19 DONE
19 DONE
19 DONE
19 DONE
19 DONE
19 DONE
19 DONE
19 DONE
20 DONE
21 DONE
21 DONE
22 DONE
23 DONE
24 DONE
24 DONE
24 DONE
24 DONE
25 DONE
25 DONE
26 DONE
27 DONE
27 DONE
27 DONE
28 DONE
28 DONE
28 DONE
29 DONE
29 DONE
30 DONE
30 DONE
31 DONE
32 DONE
32 DONE
33 DONE
33 DONE
34 DONE
35 DONE
35 DONE
35 DONE
36 DONE
36 DONE
36 DONE
37 DONE
37 DONE
37 DONE
38 DONE
39 DONE
39 DONE
39 DONE
39 DONE
40 DONE
41 DONE
42 DONE
42 DONE
42 DONE
43 DONE
43 DONE
43 DONE
43 DONE
44 DONE
44 DONE
44 DONE
45 DONE
46 DONE
46 DONE
47 DONE
48 DONE
49 DONE
49 DONE
49 DONE
50 DONE
51 DONE
51 DONE
51 DONE
52 DONE
52 DONE
53 DONE
54 DONE
54 DONE
54 DONE
55 DONE
56 DONE
57 DONE
58 DONE
58 DONE
58 DONE
59 DONE
60 DONE
60 DONE
60 DONE
61 DONE
62 DONE
63 DONE
64 

'cleaned_merged_with_dois.csv'