In [21]:
# 🔧 Step 1: Required libraries install (Run only once)
!pip install transformers nltk --quiet  # transformers = model, nltk = text processing


In [22]:
# 🔧 Step 2: Import necessary Python libraries
import pandas as pd                      # CSV file read & dataframe manage
import re                                # Regular expression for text cleaning
from nltk.corpus import stopwords        # English stopword list (like: 'the', 'is', etc)
from nltk.tokenize import word_tokenize  # Word-by-word split for filtering
from nltk import download                # To download NLTK resources
from transformers import pipeline        # HuggingFace summarizer model
import nltk                              # Core nltk library

# ✅ Download only needed NLTK data
nltk.download('punkt')                   # For tokenization
nltk.download('stopwords')               # For removing common English words



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [23]:
# 🔧 Step 3: NLTK resources download (Only once)
download('punkt')            # Word & sentence tokenizers
download('stopwords')        # Common English stopwords



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
# 📂 Step 4: Load your dataset
df = pd.read_csv("/content/Last_Year_Project - Main.csv")  # Tumi jei file upload koro, tar path eta


In [25]:
# 🧹 Step 5: Company Overview column theke missing gula remove koro
df = df.dropna(subset=["Company Overview"])  # Jekhane review nai, oigula bad


In [26]:
# 🧼 Step 6: Text ke lowercase kora & punctuation clean kora
df["Clean_Review"] = df["Company Overview"].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', str(x).lower()))
# 🔹 Explanation:
# - x.lower(): sob kichu small letter e
# - re.sub(): special character gula (.,!@) remove kora


In [27]:
# 🚫 Step 7: Stopwords remove kora (optional but helps)

# 1️⃣ Stopwords set banano
stop_words = set(stopwords.words('english'))  # Example: ['is', 'the', 'and', 'a'...]

# ✅ Download only needed NLTK data for tokenization if not already downloaded
try:
    word_tokenize("test")
except LookupError:
    nltk.download('punkt_tab')


# 2️⃣ Cleaned text theke stopwords remove kora
df["Clean_Review"] = df["Clean_Review"].apply(
    lambda x: ' '.join([w for w in word_tokenize(str(x)) if str(x).strip() and w.lower() not in stop_words])
)

# 🔹 Explanation:
# - str(x): jodi kono value NaN hoy, seta keo string e convert kore
# - str(x).strip(): check if string is not empty or whitespace
# - word_tokenize(): text ke word e vag kore
# - w.lower(): lowercase kore compare kore stopwords er shathe
# - if w.lower() not in stop_words: stopword gulo remove
# - ' '.join(...): cleaned words gulo abar sentence e convert kora

In [28]:
# ⚡ Faster model than bart-large-cnn (no GPU needed)
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")


Device set to use cpu


In [46]:
def get_summary_for_position(position, max_reviews=15):
    reviews = df[df["Position"] == position]["Company Overview"].dropna().tolist()[:max_reviews]
    skills = df[df["Position"] == position]["Skills Required"].dropna().unique().tolist()

    if not reviews:
        return "No reviews found for this position.", []

    combined_text = " ".join(reviews)

    # Check if combined_text is too short for summarization
    # A threshold of 50 characters is used as an example
    if len(combined_text) < 50:
         return "Not enough review text to generate a summary.", skills[:5]


    # Limit to 1024 tokens for BART, ensuring we don't cut off mid-word if possible
    # This approximation might still cut words, but it's a simple way to handle length
    combined_text = combined_text[:1024]


    summary = summarizer(combined_text, max_length=50, min_length=30, do_sample=False)[0]['summary_text']
    return summary, skills[:5]

In [30]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [31]:
print(get_summary_for_position("QA Engineer"))


 Management and HR issues, uneven distribution of work . Some people in upper management are making side businesses by exploiting company's reputation . The team is not eager to develop mid-level projects; they only look for the high ones .


In [32]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# If slow, you can use this instead:
# summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")


Device set to use cpu


In [33]:
def get_summary_for_position(position, max_reviews=15):
    reviews = df[df["Position"] == position]["Company Overview"].dropna().tolist()[:max_reviews]

    if not reviews:
        return "No reviews found for this position."

    combined_text = " ".join(reviews)[:1024]  # Limit to 1024 tokens for BART
    summary = summarizer(combined_text, max_length=50, min_length=30, do_sample=False)[0]['summary_text']
    return summary


In [41]:
from collections import Counter

def get_skills_summary(position):
    skill_texts = df[df["Position"] == position]["Skills Required"].dropna().tolist()

    # Split skills and flatten
    skills = [skill.strip() for text in skill_texts for skill in text.split(',') if skill.strip()]

    if not skills:
        return "No skills data available for this position."

    # Count and select top 5
    skill_counts = Counter(skills)
    top_skills = [skill for skill, _ in skill_counts.most_common(5)]

    return "Most common required skills: " + ", ".join(top_skills)

In [45]:
position = input("Please enter the position you want to analyze: ")

# Get the company name for the position (assuming one company per position in this context)
# Add error handling in case the position is not found
if position not in df["Position"].unique():
    print(f"Position '{position}' not found in the dataset. Please enter a valid position.")
else:
    company_name = df[df["Position"] == position]["Company Name"].iloc[0]

    print(f"\n🏢 Company: {company_name}")
    print(f"🧑‍💻 Position: {position}")

    # Get and print the review summary
    print("\n💬 Review Summary:")
    review_summary = get_summary_for_position(position)
    print(review_summary)

    # Get and print the skill summary
    print("\n🛠️ Skill Summary:")
    skill_summary_text = get_skills_summary(position)
    print(skill_summary_text)

Please enter the position you want to analyze: Analyst programmer


Your max_length is set to 50, but your input_length is only 9. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)



🏢 Company: Southtech Group
🧑‍💻 Position: Analyst programmer

💬 Review Summary:
CNN.com will feature iReporter photos in a weekly Travel Snapshots gallery. Visit CNN.com/Travel each week for a new gallery of snapshots.

🛠️ Skill Summary:
Most common required skills: Java, SQL, Data Modeling, REST APIs, Problem-Solving


In [51]:
position = input("Please enter the position you want to analyze: ")

# Add error handling in case the position is not found
if position not in df["Position"].unique():
    print(f"Position '{position}' not found in the dataset. Please enter a valid position.")
else:
    # Get the company name for the position (assuming one company per position in this context)
    company_name = df[df["Position"] == position]["Company Name"].iloc[0]

    print(f"\n🏢 Company: {company_name}")
    print(f"🧑‍💻 Position: {position}")

    # Get and print the review summary and skills
    print("\n💬 Review Summary:")
    review_summary, skills = get_summary_for_position(position)
    print(review_summary)

    # Print the skill summary
    print("\n🛠️ Skill Summary:")
    if skills:
        print("Most common required skills: " + ", ".join(skills))
    else:
        # If get_summary_for_position returned an empty skill list
        print("No skills data available for this position.")

Please enter the position you want to analyze: Full stack java developer

🏢 Company: Southtech Group
🧑‍💻 Position: Full stack java developer

💬 Review Summary:
Not enough review text to generate a summary.

🛠️ Skill Summary:
Most common required skills: Java, Spring Boot, React, MySQL, JavaScript, CI/CD


# Task
Modify the code to take user input for the company name, filter the data by the company name, and if reviews are found, summarize them and suggest up to 5 skills. If no reviews are found for the company, display the information from the corresponding row(s) in the dataframe.

## Modify input

### Subtask:
Change the input prompt to ask for the Company Name instead of the Position.


**Reasoning**:
Modify the input prompt to ask for the company name as requested by the subtask.



In [53]:
company_name_input = input("Please enter the company name you want to analyze: ")

Please enter the company name you want to analyze: Bdtask


## Filter by company name

### Subtask:
Filter the DataFrame to select rows corresponding to the entered Company Name.


**Reasoning**:
Filter the DataFrame based on the user-provided company name.



In [54]:
df_company = df[df["Company Name"] == company_name_input]

## Check for reviews

### Subtask:
Check if the filtered rows contain any non-missing entries in the "Company Overview" column.


**Reasoning**:
Check if the filtered dataframe is empty and if there are non-missing values in the "Company Overview" column.



In [55]:
if df_company.empty:
    print(f"Company '{company_name_input}' not found in the dataset. Please enter a valid company name.")
else:
    if df_company["Company Overview"].dropna().empty:
        print(f"No reviews found for '{company_name_input}'. Displaying available information.")
        # This is where you would add the logic to display the row information
        display(df_company)
    else:
        print(f"Reviews found for '{company_name_input}'. Proceeding with summarization.")
        # This is where you would add the logic for summarization and skill suggestion

Reviews found for 'Bdtask'. Proceeding with summarization.


## Summarize or display row info

### Subtask:
If reviews are found, combine them and generate a summary using the existing summarization function and determine the most common skills associated with this company. If no reviews are found, select the relevant row(s) for the company and display key information from that row(s).


**Reasoning**:
Implement the logic to summarize reviews and find top skills if reviews are found, or display relevant information if no reviews are found, based on the previous check.



In [56]:
if df_company.empty:
    print(f"Company '{company_name_input}' not found in the dataset. Please enter a valid company name.")
else:
    if df_company["Company Overview"].dropna().empty:
        print(f"No reviews found for '{company_name_input}'. Displaying available information.")
        # Display relevant columns if no reviews found
        display(df_company[["Position", "Skills Required", "Work Type"]])
    else:
        print(f"Reviews found for '{company_name_input}'. Proceeding with summarization and skill analysis.")
        # Combine reviews
        combined_text = " ".join(df_company["Company Overview"].dropna().tolist())

        # Generate summary (limit to 1024 tokens for BART)
        # Check if combined_text is too short for summarization
        # A threshold of 50 characters is used as an example
        if len(combined_text) < 50:
             review_summary = "Not enough review text to generate a summary."
        else:
            combined_text = combined_text[:1024]
            review_summary = summarizer(combined_text, max_length=50, min_length=30, do_sample=False)[0]['summary_text']

        # Extract and process skills
        skill_texts = df_company["Skills Required"].dropna().tolist()
        skills = [skill.strip() for text in skill_texts for skill in text.split(',') if skill.strip()]

        if not skills:
            skill_summary_text = "No skills data available for this company."
        else:
            skill_counts = Counter(skills)
            top_skills = [skill for skill, _ in skill_counts.most_common(5)]
            skill_summary_text = "Most common required skills: " + ", ".join(top_skills)

        # Print the results
        print(f"\n🏢 Company: {company_name_input}")
        # Note: If a company has multiple positions, this will only show the position from the first row.
        # A more comprehensive approach might iterate through unique positions or list them all.
        if not df_company["Position"].empty:
             print(f"🧑‍💻 Position(s): {', '.join(df_company['Position'].unique().tolist())}")


        print("\n💬 Review Summary:")
        print(review_summary)

        print("\n🛠️ Skill Summary:")
        print(skill_summary_text)

Reviews found for 'Bdtask'. Proceeding with summarization and skill analysis.

🏢 Company: Bdtask
🧑‍💻 Position(s): Sales asistant, Reactjs developer, Software engineer, .net developer, Sales associate, Laravel developer, Blockchain developer, Customer support engineer, Seo executive, Python developer, IT Engineer, Project manager, software engineer, Mobile applications developer, Php developer, Techical support engineer, Software developer, Junior sqa engineer, Software support engineer, SQA Engineer, Digital marketing specialist, Junior software developer, Seo specialist, Android developer, Software engineer(internship), Jr programmer

💬 Review Summary:
The work environment is extremely toxic and demotivating. The projects I’ve worked on are both challenging and rewarding, providing real-world experience that has significantly enhanced my skill set. The work culture is po

🛠️ Skill Summary:
Most common required skills: REST APIs, Git, JavaScript, JIRA, Python


## Present output

### Subtask:
Display the Company Name, and either the review summary and skills (if reviews exist) or the row information (if no reviews exist).


## Refine skills for company

### Subtask:
Address how to handle skills when filtering by company name (e.g., list top skills across all positions in the company, or ask the user to also specify a position).


**Reasoning**:
The current code extracts and counts skills for the entire company. For a more focused analysis, it makes more sense to ask the user to specify a position after the company name is entered and then provide skills relevant to that position. This aligns with the original request's structure of analyzing specific positions. I will modify the code to first ask for the company name, then if the company is found, list the unique positions within that company, and finally ask the user to select one of those positions for further analysis.



In [59]:
company_name_input = input("Please enter the company name you want to analyze: ")

# Add error handling in case the company is not found
if company_name_input not in df["Company Name"].unique():
    print(f"Company '{company_name_input}' not found in the dataset. Please enter a valid company name.")
else:
    df_company = df[df["Company Name"] == company_name_input]

    # Get the unique positions for the selected company
    unique_positions = df_company["Position"].dropna().unique().tolist()

    if not unique_positions:
        print(f"No position data found for '{company_name_input}'. Displaying available information.")
        # Display relevant columns if no position data found
        display(df_company[["Company Overview", "Skills Required", "Work Type"]])
    else:
        print(f"\nPositions available for '{company_name_input}':")
        for i, position in enumerate(unique_positions):
            print(f"{i + 1}. {position}")

        # Ask the user to select a position
        while True:
            try:
                position_index = int(input(f"Please enter the number corresponding to the position you want to analyze (1-{len(unique_positions)}): ")) - 1
                if 0 <= position_index < len(unique_positions):
                    position = unique_positions[position_index]
                    break
                else:
                    print("Invalid number. Please try again.")
            except ValueError:
                print("Invalid input. Please enter a number.")

        df_position = df_company[df_company["Position"] == position]

        print(f"\n🏢 Company: {company_name_input}")
        print(f"🧑‍💻 Position: {position}")

        # Get and print the review summary for the selected position
        print("\n💬 Review Summary:")
        # Filter out None values before joining
        reviews = df_position["Company Overview"].dropna().tolist()

        if not reviews:
            print("No reviews found for this position.")
        else:
            combined_text = " ".join(reviews)

            # Check if combined_text is too short for summarization
            if len(combined_text) < 50:
                 review_summary = "Not enough review text to generate a summary."
            else:
                combined_text = combined_text[:1024] # Limit for BART
                try:
                    review_summary = summarizer(combined_text, max_length=50, min_length=30, do_sample=False)[0]['summary_text']
                except Exception as e:
                    print(f"Error during summarization: {e}")
                    review_summary = "Could not generate summary."

            print(review_summary)

        # Get and print the skill summary for the selected position
        print("\n🛠️ Skill Summary:")
        skill_texts = df_position["Skills Required"].dropna().tolist()
        skills = [skill.strip() for text in skill_texts for skill in text.split(',') if skill.strip()]

        if not skills:
            skill_summary_text = "No skills data available for this position."
        else:
            skill_counts = Counter(skills)
            top_skills = [skill for skill, _ in skill_counts.most_common(5)]
            skill_summary_text = "Most common required skills: " + ", ".join(top_skills)

        print(skill_summary_text)

Please enter the company name you want to analyze: Flyte Solutions

Positions available for 'Flyte Solutions':
1. Data analyst
2. Software architect
3. QA engineer
4. React developer
5. Junior web developer
6. Full stack web developer
7. Software engineer
Please enter the number corresponding to the position you want to analyze (1-7): 2

🏢 Company: Flyte Solutions
🧑‍💻 Position: Software architect

💬 Review Summary:
The team is collaborative and open to Sharing knowledge, which supports continues learning. Management consistently provides guidance and support, creating a positive work environment. Opportunities for skill growth and project involvement are encouraging.

🛠️ Skill Summary:
Most common required skills: Technology stack selection, Framework evaluation, Tool and platform recommendation, Cloud architecture, System architecture design


## Summary:

### Data Analysis Key Findings

*   The code was successfully modified to accept user input for the company name and filter the dataset accordingly.
*   The process includes a check to see if reviews ("Company Overview") are available for the selected company.
*   If reviews are found, they are combined, summarized using a text summarization model (BART), and the top 5 most frequently listed skills for that company are identified and presented.
*   If no reviews are found, key information from the corresponding row(s) for the company (Position, Skills Required, Work Type) is displayed.
*   The approach for handling skills was refined to be position-specific, prompting the user to select a position within the company to get relevant review summaries and skill suggestions for that particular role.

### Insights or Next Steps

*   Implementing the position-specific analysis provides more targeted and useful insights to the user compared to a company-wide aggregation of skills.
*   Consider adding error handling or suggestions if the user-inputted company name is not found in the dataset.
