<a href="https://colab.research.google.com/github/joycea17/linkedin-data/blob/main/pofiledataparsing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import re

# Read the career-guidance CSV into a DataFrame. The path targets the Colab
# working directory; adjust it to your Drive mount or local upload.
dataset_path = '/content/Career_Guidance_Dataset_v3.csv'
df = pd.read_csv(dataset_path)

# Sanity check: list the column names, then show the first two rows.
print("Columns:", df.columns.to_list())
print(df.head(2))


Columns: ['Job Title', 'Work Experience', 'Certificates', 'Education', 'Volunteer Activities', 'Languages', 'Organizations']
                   Job Title  \
0               Data Analyst   
1  Machine Learning Engineer   

                                     Work Experience  \
0  2013‚Äì2015: Intern at Tech Mahindra ‚Äì Assisted ...   
1  2018‚Äì2021: Machine Learning Engineer at Byjus ...   

                                        Certificates  \
0  Digital Marketing Professional, Tableau Deskto...   
1                                  PMP Certification   

                                           Education  \
0  2010‚Äì2012: High School ‚Äì Focused on relevant s...   
1  2013‚Äì2015: High School ‚Äì Focused on relevant s...   

                              Volunteer Activities       Languages  \
0         Participated in coding for good projects  English, Hindi   
1  Mentored university students in career guidance  English, Tamil   

                Organizations  
0             

In [None]:
def parse_experiences(text):
    """
    Parse every work-experience entry packed into a single cell.

    Each entry is expected to look like::

        YYYY-YYYY: <Role> at <Company> - <Description>

    where either dash may be a plain hyphen, an en/em dash, or the
    mis-encoded (UTF-8 read as Mac Roman) form of an en/em dash. All of these
    are normalised to ``-`` before matching, so dashes inside the returned
    text fields are plain hyphens as well.

    Parameters
    ----------
    text : str or missing value
        Raw cell content. Anything that is not a string (NaN, None, numbers)
        yields an empty list.

    Returns
    -------
    list of dict
        One dict per entry, in order of appearance, with keys ``Start_Year``
        and ``End_Year`` (int) and ``Role``, ``Company``, ``Description``
        (whitespace-stripped str). Entries that lack the ``<Role> at
        <Company> - <Description>`` structure are skipped.
    """
    if not isinstance(text, str):
        return []

    # \u2013 / \u2014 are the en / em dash; the \u201a\u00c4? sequences are the
    # same two dashes after their UTF-8 bytes were decoded as Mac Roman.
    text = re.sub('\u201a\u00c4[\u00ec\u00ee]|[\u2013\u2014]', '-', text)

    # Step 1: cut the cell into entries. Each entry runs from its
    # "YYYY-YYYY:" header up to the next header or the end of the text, so a
    # malformed entry can never swallow the one that follows it.
    entry_pattern = r'(\d{4})-(\d{4}):\s*(.*?)(?=\d{4}-\d{4}:|$)'

    # Step 2: split an entry body into role / company / description.
    # Preferred form: the separator dash has whitespace on both sides, which
    # keeps hyphenated company names such as "Coca-Cola" in one piece.
    spaced_body = r'(.*?)\s+at\s+(.*?)\s+-\s+(.*)'
    # Fallback: accept a dash without surrounding whitespace ("Acme-Built ...").
    tight_body = r'(.*?)\s+at\s+(.*?)\s*-\s*(.*)'

    experiences = []
    for start, end, body in re.findall(entry_pattern, text, flags=re.DOTALL):
        parts = (re.match(spaced_body, body, flags=re.DOTALL)
                 or re.match(tight_body, body, flags=re.DOTALL))
        if parts is None:
            # Not in "<Role> at <Company> - <Description>" form; skip it.
            continue
        role, company, desc = parts.groups()
        experiences.append({
            'Start_Year': int(start),
            'End_Year': int(end),
            'Role': role.strip(),
            'Company': company.strip(),
            'Description': desc.strip()
        })
    return experiences

# Turn each profile's raw "Work Experience" text into a list of structured
# entries and keep it next to the original columns.
df['Parsed_Experiences'] = df['Work Experience'].map(parse_experiences)

# Spot-check: parsed entries of the first profile (shown as the cell output).
df['Parsed_Experiences'].iat[0]


[{'Start_Year': 2013,
  'End_Year': 2015,
  'Role': 'Intern',
  'Company': 'Tech Mahindra',
  'Description': 'Assisted senior Data Analysts with day-to-day tasks and learned core tools.'},
 {'Start_Year': 2015,
  'End_Year': 2017,
  'Role': 'Associate Data Analyst',
  'Company': 'HSBC',
  'Description': 'Worked on implementations, wrote code/reports and supported project deliverables.'},
 {'Start_Year': 2017,
  'End_Year': 2020,
  'Role': 'Data Analyst',
  'Company': 'Infosys',
  'Description': 'Contributed to major projects and improved existing processes.'},
 {'Start_Year': 2020,
  'End_Year': 2023,
  'Role': 'Senior Data Analyst',
  'Company': 'Deloitte',
  'Description': 'Led modules, mentored juniors and managed stakeholder communications.'}]

In [None]:
# Flatten the per-profile experience lists into one row per experience.

# Keys produced by parse_experiences for every entry.
entry_fields = ['Start_Year', 'End_Year', 'Role', 'Company', 'Description']

# Profile-level columns copied onto each experience row:
# output column name -> source column name in df.
profile_fields = {
    'Job_Title': 'Job Title',
    'Education': 'Education',
    'Certificates': 'Certificates',
    'Languages': 'Languages',
    'Organizations': 'Organizations',
    'Volunteer_Activities': 'Volunteer Activities',
}

records = []
for idx, row in df.iterrows():
    parsed = row['Parsed_Experiences']
    if not isinstance(parsed, list):
        continue
    # Profile-level values are the same for every experience of this row,
    # so look them up once rather than once per experience.
    profile_values = {out_col: row[src_col] for out_col, src_col in profile_fields.items()}
    for exp in parsed:
        # User_Index is the df index label, to trace back to the original user.
        records.append({**exp, 'User_Index': idx, **profile_values})

# Declare the columns explicitly: without this, an empty `records` list gives
# a DataFrame with no columns at all, and later lookups such as
# exp_df['Role'] fail with KeyError.
exp_df = pd.DataFrame(
    records,
    columns=entry_fields + ['User_Index'] + list(profile_fields),
)

# Preview
exp_df.head(10)


Unnamed: 0,Start_Year,End_Year,Role,Company,Description,User_Index,Job_Title,Education,Certificates,Languages,Organizations,Volunteer_Activities
0,2013,2015,Intern,Tech Mahindra,Assisted senior Data Analysts with day-to-day ...,0,Data Analyst,2010‚Äì2012: High School ‚Äì Focused on relevant s...,"Digital Marketing Professional, Tableau Deskto...","English, Hindi",EY,Participated in coding for good projects
1,2015,2017,Associate Data Analyst,HSBC,"Worked on implementations, wrote code/reports ...",0,Data Analyst,2010‚Äì2012: High School ‚Äì Focused on relevant s...,"Digital Marketing Professional, Tableau Deskto...","English, Hindi",EY,Participated in coding for good projects
2,2017,2020,Data Analyst,Infosys,Contributed to major projects and improved exi...,0,Data Analyst,2010‚Äì2012: High School ‚Äì Focused on relevant s...,"Digital Marketing Professional, Tableau Deskto...","English, Hindi",EY,Participated in coding for good projects
3,2020,2023,Senior Data Analyst,Deloitte,"Led modules, mentored juniors and managed stak...",0,Data Analyst,2010‚Äì2012: High School ‚Äì Focused on relevant s...,"Digital Marketing Professional, Tableau Deskto...","English, Hindi",EY,Participated in coding for good projects
4,2018,2021,Machine Learning Engineer,Byjus,Worked as Machine Learning Engineer focusing o...,1,Machine Learning Engineer,2013‚Äì2015: High School ‚Äì Focused on relevant s...,PMP Certification,"English, Tamil","Nvidia, Goldman Sachs, PwC",Mentored university students in career guidance
5,2021,2023,Machine Learning Engineer,Zomato,Worked as Machine Learning Engineer focusing o...,1,Machine Learning Engineer,2013‚Äì2015: High School ‚Äì Focused on relevant s...,PMP Certification,"English, Tamil","Nvidia, Goldman Sachs, PwC",Mentored university students in career guidance
6,2023,2026,Machine Learning Engineer,Oracle,Worked as Machine Learning Engineer focusing o...,1,Machine Learning Engineer,2013‚Äì2015: High School ‚Äì Focused on relevant s...,PMP Certification,"English, Tamil","Nvidia, Goldman Sachs, PwC",Mentored university students in career guidance
7,2013,2016,Intern,Oracle,Assisted senior Social Media Managers with day...,2,Social Media Manager,2012‚Äì2014: High School ‚Äì Focused on relevant s...,Microsoft Azure Fundamentals,"English, French","Tesla, Accenture, Spotify",Helped organize blood donation drives
8,2016,2017,Associate,KPMG,"Worked on implementations, wrote code/reports ...",2,Social Media Manager,2012‚Äì2014: High School ‚Äì Focused on relevant s...,Microsoft Azure Fundamentals,"English, French","Tesla, Accenture, Spotify",Helped organize blood donation drives
9,2017,2019,Social Media Manager,HSBC,"Led modules, mentored juniors and managed stak...",2,Social Media Manager,2012‚Äì2014: High School ‚Äì Focused on relevant s...,Microsoft Azure Fundamentals,"English, French","Tesla, Accenture, Spotify",Helped organize blood donation drives


In [None]:
# Normalised role: lower-case, then drop every character that is not a
# lower-case ASCII letter or whitespace (digits, punctuation, accented
# letters), then trim the ends.
exp_df['Role_Cleaned'] = (
    exp_df['Role']
    .str.lower()
    .str.replace(r'[^a-z\s]', '', regex=True)
    .str.strip()
)

# Normalised company: title-case every word.
# NOTE(review): title-casing lower-cases the tail of each word, so all-caps
# names such as "HSBC" come out as "Hsbc" - confirm this is intended.
exp_df['Company_Cleaned'] = exp_df['Company'].str.title()


In [None]:
# Write the flattened, cleaned experience table to the Colab working
# directory; index=False keeps the row index out of the file.
exp_df.to_csv('/content/Structured_Career_Experience.csv', index=False)
# The leading glyph was a mis-encoded check-mark emoji; restored here.
print("✅ Clean structured dataset saved as Structured_Career_Experience.csv")


In [None]:
# Example query: the target role plus a free-text summary of the user's
# background.
goal = "HR Manager"
background = """
BBA graduate with internship experience in recruitment and employee engagement.
Certificates in HR Analytics and Communication.
"""

# NOTE(review): career_guidance_bot is not defined in any cell shown in this
# notebook; it must be defined or imported in an earlier cell, otherwise this
# cell raises NameError on a fresh kernel (Restart & Run All).
predicted_role, roadmap = career_guidance_bot(goal, background)

# The leading glyphs in both messages were mis-encoded emoji (compass and
# round pushpin); restored here.
print("🧭 Predicted Next Role:", predicted_role)
print("\n📍 Personalized Career Roadmap:\n")
print(roadmap)
