In [None]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm

In [23]:
df_text = pd.read_csv('/Users/arzumkarahan/Downloads/preprocessed_england_dataset.csv')

In [24]:
df_text.head()

Unnamed: 0.1,Unnamed: 0,firm,date_review,job_title,current,location,overall_rating,work_life_balance,culture_values,diversity_inclusion,...,senior_mgmt,recommend,ceo_approv,outlook,headline,pros,cons,year,pros_clean_min,cons_clean_min
0,1,AFH Wealth Management,2015-12-11,Office Administrator,"Current Employee, more than 1 year","Bromsgrove, England, England",2,3.0,1.0,,...,4.0,x,o,r,"Excellent staff, poor salary","Friendly, helpful and hard-working colleagues",Poor salary which doesn't improve much with pr...,2015,"Friendly, helpful and hard-working colleagues",Poor salary which doesn't improve much with pr...
1,2,AFH Wealth Management,2016-01-28,Office Administrator,"Current Employee, less than 1 year","Bromsgrove, England, England",1,1.0,1.0,,...,1.0,x,o,x,"Low salary, bad micromanagement",Easy to get the job even without experience in...,"Very low salary, poor working conditions, very...",2016,Easy to get the job even without experience in...,"Very low salary, poor working conditions, very..."
2,4,AFH Wealth Management,2016-04-23,Office Administrator,"Current Employee, more than 1 year","Bromsgrove, England, England",1,2.0,1.0,,...,1.0,x,o,x,client reporting admin,"Easy to get the job, Nice colleagues.","Abysmal pay, around minimum wage. No actual tr...",2016,"Easy to get the job, Nice colleagues.","Abysmal pay, around minimum wage. No actual tr..."
3,6,AFH Wealth Management,2016-09-23,IFA,Former Employee,"Bromsgrove, England, England",1,1.0,1.0,,...,1.0,x,o,r,It horrible management,Good investment management strategy. Overall t...,The management and seniors are ruthless. No tr...,2016,Good investment management strategy. Overall t...,The management and seniors are ruthless. No tr...
4,13,AFH Wealth Management,2017-05-21,Administrative Support,"Former Employee, more than 5 years","Birmingham, England, England",1,4.0,1.0,,...,1.0,o,v,v,Administration team,Free parking . Meet some nice people in the te...,Not treated at all well after 6 yrs of being a...,2017,Free parking . Meet some nice people in the te...,Not treated at all well after 6 yrs of being a...


## DISREGARD BELOW TWO

In [28]:
# Merge pros and cons into a single text field
df_text["combined_comments"] = df_text["pros_clean_min"].fillna("") + "\n" + df_text["cons_clean_min"].fillna("")

# Group by firm
firm_comments = df_text.groupby("firm")["combined_comments"].apply(lambda texts: "\n".join(texts)).reset_index()

# Load the summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Summarize function
def generate_summary(text):
    # BART models usually handle up to 1024 tokens (~800 words) easily
    if len(text) > 1024:
        text = text[:1024]
    summary = summarizer(text, max_length=100, min_length=30, do_sample=False)
    return summary[0]['summary_text']

# Add tqdm to the apply function
tqdm.pandas(desc="Summarizing companies")

# Apply summarization
firm_comments["summary"] = firm_comments["combined_comments"].progress_apply(generate_summary)

# Save the results
firm_comments.to_csv("company_summaries_bart_large.csv", index=False)
print("✅ Summarization complete. Saved to company_summaries_bart_large.csv")

Device set to use mps:0
Summarizing companies:  45%|████▍     | 149/333 [03:18<03:55,  1.28s/it]Your max_length is set to 100, but your input_length is only 42. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)
Summarizing companies:  53%|█████▎    | 177/333 [03:59<03:09,  1.22s/it]Your max_length is set to 100, but your input_length is only 59. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=29)
Summarizing companies:  66%|██████▌   | 220/333 [04:56<02:39,  1.41s/it]Your max_length is set to 100, but your input_length is only 18. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)
Summarizing co

✅ Summarization complete. Saved to company_summaries_bart_large.csv


In [30]:
from transformers import pipeline
import pandas as pd
from tqdm import tqdm

# Load your preprocessed dataset
df_text = pd.read_csv("/Users/arzumkarahan/Downloads/preprocessed_england_dataset.csv")

# 🧠 No need to merge pros and cons immediately here
# We will build a structured text later inside the summarization function

# Group pros and cons separately by firm
firm_comments = df_text.groupby("firm").agg({
    "pros_clean_min": lambda x: " ".join(x.dropna()),
    "cons_clean_min": lambda x: " ".join(x.dropna())
}).reset_index()

# Load the summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Summarization function (improved with PROS and CONS labeling)
def generate_summary(row):
    pros_text = row["pros_clean_min"]
    cons_text = row["cons_clean_min"]

    combined_text = f"PROS: {pros_text}\nCONS: {cons_text}\nSummarize both sections clearly."

    if len(combined_text) > 1024:
        combined_text = combined_text[:1024]  # truncate if needed to avoid overflow

    summary = summarizer(combined_text, max_length=100, min_length=30, do_sample=False)
    return summary[0]['summary_text']

# Add tqdm for progress bar
tqdm.pandas(desc="Summarizing companies")

# Apply summarization
firm_comments["summary"] = firm_comments.progress_apply(generate_summary, axis=1)

# Save the results
firm_comments.to_csv("company_summaries_prosconsseparate_bart_large.csv", index=False)
print("✅ Summarization complete. Saved to company_summaries_bart_large.csv")

Device set to use mps:0
Summarizing companies:  45%|████▍     | 149/333 [03:23<04:04,  1.33s/it]Your max_length is set to 100, but your input_length is only 56. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Summarizing companies:  53%|█████▎    | 177/333 [04:04<03:53,  1.50s/it]Your max_length is set to 100, but your input_length is only 68. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=34)
Summarizing companies:  66%|██████▌   | 220/333 [05:01<02:25,  1.29s/it]Your max_length is set to 100, but your input_length is only 32. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)
Summarizing c

✅ Summarization complete. Saved to company_summaries_bart_large.csv


## PROS AND CONS SEPARATED AND SUMMARIZED (use this one)

In [31]:
from transformers import pipeline
import pandas as pd
from tqdm import tqdm

# Load preprocessed dataset
df_text = pd.read_csv("/Users/arzumkarahan/Downloads/preprocessed_england_dataset.csv")

# Group by firm, aggregate all pros and cons separately
firm_comments = df_text.groupby("firm").agg({
    "pros_clean_min": lambda x: " ".join(x.dropna()),
    "cons_clean_min": lambda x: " ".join(x.dropna())
}).reset_index()

# Load BART summarizer pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Define function to summarize pros and cons separately
def generate_separate_summary(row):
    pros_text = row["pros_clean_min"]
    cons_text = row["cons_clean_min"]

    # Summarize PROS
    pros_summary = ""
    if pros_text.strip():
        pros_input = f"Summarize the following positive employee feedback:\n{pros_text}"
        pros_result = summarizer(pros_input[:1024], max_length=60, min_length=20, do_sample=False)
        pros_summary = pros_result[0]['summary_text']

    # Summarize CONS
    cons_summary = ""
    if cons_text.strip():
        cons_input = f"Summarize the following negative employee feedback:\n{cons_text}"
        cons_result = summarizer(cons_input[:1024], max_length=60, min_length=20, do_sample=False)
        cons_summary = cons_result[0]['summary_text']

    # Combine both in a structured way
    return f"PROS:\n{pros_summary}\n\nCONS:\n{cons_summary}"

# Enable tqdm progress bar
tqdm.pandas(desc="Summarizing companies")

# Apply summarization
firm_comments["summary"] = firm_comments.progress_apply(generate_separate_summary, axis=1)

# Save to CSV
firm_comments.to_csv("company_summaries_structured_bart.csv", index=False)
print("✅ Summarization complete. Saved to company_summaries_structured_bart.csv")

Device set to use mps:0
Summarizing companies:  25%|██▍       | 82/333 [04:50<12:13,  2.92s/it]  Your max_length is set to 60, but your input_length is only 55. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)
Summarizing companies:  45%|████▍     | 149/333 [14:36<08:33,  2.79s/it]   Your max_length is set to 60, but your input_length is only 20. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=10)
Your max_length is set to 60, but your input_length is only 43. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)
Summarizing companies:  53%|█████▎    | 177/333 [19:27<06:48,  2.62s/it]  Your max_l

✅ Summarization complete. Saved to company_summaries_structured_bart.csv
