In [1]:
import pandas as pd

# Locations of the three input CSV files (absolute paths on the author's machine)
job_skills_path = r'C:\Users\kulai\OneDrive\Desktop\Job-Skill Recommendation\job_skills.csv'
job_summary_path = r'C:\Users\kulai\OneDrive\Desktop\Job-Skill Recommendation\job_summary.csv'
linkedin_jobs_path = r'C:\Users\kulai\OneDrive\Desktop\Job-Skill Recommendation\linkedin_job_postings.csv'

# Read each CSV into its own DataFrame, in the same order as the paths above
job_skills, job_summary, linkedin_jobs = [
    pd.read_csv(csv_path)
    for csv_path in (job_skills_path, job_summary_path, linkedin_jobs_path)
]

# Function to print columns and suggested features
def analyze_columns(df, name, keywords=('skill', 'title', 'job', 'role', 'location',
                                        'experience', 'level', 'industry', 'description')):
    """Print a DataFrame's columns and a keyword-based usefulness hint for each.

    Two sections are written to stdout: first every column label, then the
    same labels tagged "(Useful)" when the label contains any of ``keywords``
    (case-insensitive substring match) or "(Maybe not needed)" otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` are inspected; the data itself is not read.
    name : str
        Label shown in the two section headings (e.g. the source file name).
    keywords : iterable of str, optional
        Substrings that mark a column as useful. Defaults to the original
        built-in list, so existing two-argument calls behave as before.

    Returns
    -------
    None
    """
    print(f"\n🔹 Columns in {name}:")
    for col in df.columns:
        print(f" - {col}")

    # Lowercase the needles once so caller-supplied keywords match case-insensitively
    needles = tuple(str(k).lower() for k in keywords)

    # Basic feature suggestion based on common column names
    print(f"\n✅ Suggested features for {name}:")
    for col in df.columns:
        # Column labels are not always strings (e.g. integer labels when a CSV
        # is read without a header), so convert before calling .lower()
        col_lower = str(col).lower()
        verdict = "Useful" if any(k in col_lower for k in needles) else "Maybe not needed"
        print(f" - {col} ({verdict})")

# Run the column report once per loaded dataset, labelled with its file name
for frame, label in (
    (job_skills, "job_skills.csv"),
    (job_summary, "job_summary.csv"),
    (linkedin_jobs, "linkedin_job_postings.csv"),
):
    analyze_columns(frame, label)



🔹 Columns in job_skills.csv:
 - job_link
 - job_skills

✅ Suggested features for job_skills.csv:
 - job_link (Useful)
 - job_skills (Useful)

🔹 Columns in job_summary.csv:
 - job_link
 - job_summary

✅ Suggested features for job_summary.csv:
 - job_link (Useful)
 - job_summary (Useful)

🔹 Columns in linkedin_job_postings.csv:
 - job_link
 - last_processed_time
 - got_summary
 - got_ner
 - is_being_worked
 - job_title
 - company
 - job_location
 - first_seen
 - search_city
 - search_country
 - search_position
 - job_level
 - job_type

✅ Suggested features for linkedin_job_postings.csv:
 - job_link (Useful)
 - last_processed_time (Maybe not needed)
 - got_summary (Maybe not needed)
 - got_ner (Maybe not needed)
 - is_being_worked (Maybe not needed)
 - job_title (Useful)
 - company (Maybe not needed)
 - job_location (Useful)
 - first_seen (Maybe not needed)
 - search_city (Maybe not needed)
 - search_country (Maybe not needed)
 - search_position (Maybe not needed)
 - job_level (Useful)
 

In [2]:
import pandas as pd

# Load CSVs
skills_df = pd.read_csv(r'C:\Users\kulai\OneDrive\Desktop\Job-Skill Recommendation\job_skills.csv')
linkedin_df = pd.read_csv(r'C:\Users\kulai\OneDrive\Desktop\Job-Skill Recommendation\linkedin_job_postings.csv')

# Replace missing skill strings with '' so every value can be joined/split as text
skills_df['job_skills'] = skills_df['job_skills'].fillna('').astype(str)


def _unique_skills(skill_strings):
    """Return the distinct, normalized skills found in comma-separated strings.

    Each skill is stripped and lowercased BEFORE de-duplication, so variants
    such as "Python" and " python" collapse into a single entry. Empty
    fragments are dropped. Skills are returned in order of first appearance,
    which keeps the result identical from run to run (a plain ``set`` does not).
    """
    normalized = (
        skill.strip().lower()
        for skill in ','.join(skill_strings).split(',')
    )
    # dict.fromkeys de-duplicates while preserving insertion order
    return list(dict.fromkeys(skill for skill in normalized if skill))


# Group by job_link and consolidate all skills into one unique list per posting
job_skills_map = (
    skills_df.groupby('job_link')['job_skills']
    .apply(_unique_skills)
    .reset_index()
)

# Attach job titles, drop incomplete rows, keep the first posting per title,
# and expose only the two output columns under their final names.
# Built as one chain so no column is assigned onto a slice of another frame.
job_roles_df = (
    pd.merge(linkedin_df[['job_link', 'job_title']], job_skills_map, on='job_link')
    .dropna()
    .drop_duplicates('job_title')
    .loc[:, ['job_title', 'job_skills']]
    .rename(columns={'job_title': 'title', 'job_skills': 'required_skills'})
)

print(job_roles_df.head())


                                               title  \
0  Account Executive - Dispensing (NorCal/Norther...   
1                 Registered Nurse - RN Care Manager   
2               RESTAURANT SUPERVISOR - THE FORKLIFT   
3                      Independent Real Estate Agent   
4                              Registered Nurse (RN)   

                                     required_skills  
0  [medical equipment sales, internal and externa...  
1  [hospitalization avoidance, patient selfmanage...  
2  [oral communication skills, guest service skil...  
3  [negotiation, deeds, equal opportunity employe...  
4  [diversity, equal opportunity employer, equity...  


In [1]:
import pandas as pd

# Read the three source CSVs into DataFrames
skills_df = pd.read_csv(r'C:\Users\kulai\OneDrive\Desktop\Job-Skill Recommendation\job_skills.csv')
summary_df = pd.read_csv(r'C:\Users\kulai\OneDrive\Desktop\Job-Skill Recommendation\job_summary.csv')
linkedin_df = pd.read_csv(r'C:\Users\kulai\OneDrive\Desktop\Job-Skill Recommendation\linkedin_job_postings.csv')

# List every column of each file, one line per file
print("\n🔍 All Available Columns:")
for csv_name, frame in (
    ("job_skills.csv", skills_df),
    ("job_summary.csv", summary_df),
    ("linkedin_job_postings.csv", linkedin_df),
):
    print(f"{csv_name}: {list(frame.columns)}")

# Hand-maintained record of which columns the skill-mapping logic reads,
# keyed by source file name
used_columns = {
    # job_link + job_skills: map skills to each job
    'job_skills.csv': ['job_link', 'job_skills'],
    # nothing from the summary file is used by the current logic
    'job_summary.csv': [],
    # job_link + job_title: match titles and join with skills
    'linkedin_job_postings.csv': ['job_link', 'job_title'],
}

# Report only the columns recorded above
print("\n✅ Feature Columns Used in Output Logic:")
print("\n".join(f"{csv_name}: {cols}" for csv_name, cols in used_columns.items()))



🔍 All Available Columns:
job_skills.csv: ['job_link', 'job_skills']
job_summary.csv: ['job_link', 'job_summary']
linkedin_job_postings.csv: ['job_link', 'last_processed_time', 'got_summary', 'got_ner', 'is_being_worked', 'job_title', 'company', 'job_location', 'first_seen', 'search_city', 'search_country', 'search_position', 'job_level', 'job_type']

✅ Feature Columns Used in Output Logic:
job_skills.csv: ['job_link', 'job_skills']
job_summary.csv: []
linkedin_job_postings.csv: ['job_link', 'job_title']
