In [7]:
import requests
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import logging
from rake_nltk import Rake
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache
import numpy as np

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Download NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# Load the CSV file
file_path = 'C:/Users/DELL/Downloads/Trainig Dataset1 - Udacity.csv'
df = pd.read_csv(file_path)

# Function to query Wikidata with retry logic, focusing on academic disciplines
@lru_cache(maxsize=10000)
def query_wikidata_academic_discipline(entity):
    url = "https://www.wikidata.org/w/api.php"
    params = {
        'action': 'wbsearchentities',
        'language': 'en',
        'format': 'json',
        'search': entity
    }
    
    session = requests.Session()
    retry = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    
    try:
        response = session.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
        results = data.get('search', [])
        
        # Assuming the confidence rate is proportional to the ranking of the result
        top_results = results[:int(0.9 * len(results))]  # Consider top 90% of the results
        
        for result in top_results:
            entity_id = result['id']
            entity_url = f"https://www.wikidata.org/wiki/Special:EntityData/{entity_id}.json"
            entity_response = session.get(entity_url)
            entity_response.raise_for_status()
            entity_data = entity_response.json()
            entities = entity_data['entities']
            if entity_id in entities:
                entity_info = entities[entity_id]
                claims = entity_info.get('claims', {})
                
                # Check if the entity is an academic discipline
                if 'P31' in claims:
                    for claim in claims['P31']:
                        if 'mainsnak' in claim and claim['mainsnak']['datavalue']['value']['id'] == 'Q11862829':  # Q11862829 is the Wikidata item for academic discipline
                            return result['label']
        return None
    except requests.exceptions.RequestException as e:
        logging.error(f"Request failed: {e}")
        return None

# Function to extract academic disciplines and keyphrases from text
def extract_academic_disciplines_and_keyphrases(text):
    if pd.isna(text):
        return ""
    
    logging.info(f"Extracting academic disciplines and keyphrases for text: {text[:50]}...")  # Log a snippet of the text being processed
    
    # Tokenize and filter for nouns
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    nouns = [word for word, pos in pos_tags if pos.startswith('NN')]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_nouns = [word for word in nouns if word.lower() not in stop_words]
    
    # Query Wikidata for each noun
    academic_disciplines = []
    for noun in filtered_nouns:
        entity = query_wikidata_academic_discipline(noun)
        if entity:
            academic_disciplines.append(entity)
    
    # Extract keyphrases using RAKE
    rake = Rake(stopwords.words('english'))
    rake.extract_keywords_from_text(text)
    keyphrases = rake.get_ranked_phrases()
    
    # Combine academic disciplines and keyphrases
    combined_entities = set(academic_disciplines + keyphrases)
    
    return ', '.join(combined_entities)

# Process the DataFrame in chunks using ThreadPoolExecutor
def process_chunk(chunk):
    chunk['Extracted Entities'] = chunk['Description'].apply(extract_academic_disciplines_and_keyphrases)
    return chunk

# Split the DataFrame into chunks and process them in parallel
num_chunks = 100  # Increase the number of chunks to better utilize the CPU
chunks = np.array_split(df, num_chunks)

with ThreadPoolExecutor() as executor:
    chunks = list(executor.map(process_chunk, chunks))

# Combine the processed chunks back into a single DataFrame
df = pd.concat(chunks)

# Save the updated DataFrame to a new CSV file
output_file_path = 'C:/Users/DELL/Downloads/dataset.csv'
logging.info(f"Saving the updated DataFrame to {output_file_path}")
df.to_csv(output_file_path, index=False)

# Display the first few rows to verify
logging.info("Displaying the first few rows of the updated DataFrame")
print(df[['Course Name', 'Description', 'Extracted Entities']].head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  return bound(*args, **kwds)
2024-08-06 10:04:15,871 - INFO - Extracting academic disciplines and keyphrases for text: Digital Project Management enables the creation an...
2024-08-06 10:04:15,877 - INFO - Extracting academic disciplines and keyphrases for text: This program will teach you how to become an Azure...
2024-08-06 10:04:15,877 - INFO - Extracting academic disciplines and keyphrases for text: Master the computer vision skills behind advances ...
2024-08-06 10:04

                                Course Name  \
0                Digital Project Management   
1                       Enterprise Security   
2                             Sensor Fusion   
3        Cybersecurity for Business Leaders   
4  Programming for Data Science with Python   

                                         Description  \
0  Digital Project Management enables the creatio...   
1  The Enterprise Security Nanodegree program imp...   
2  The Sensor Fusion Engineer Nanodegree program ...   
3  Udacityâ€™s Cybersecurity for Business Leaders...   
4  Learn programming skills needed to uncover pat...   

                                  Extracted Entities  
0  services, high, development, digital project m...  
1  design robust defenses, covering network, corp...  
2  clustering, sensor fusion engineer nanodegree ...  
3  computer security, stage, equip future c, role...  
4  uncover patterns, relational databases, git, l...  
