In [16]:
#import necessary packages
from gliner import GLiNER
import pandas as pd

In [10]:
#Pick NER model
model = GLiNER.from_pretrained("urchade/gliner_large-v2.1")
#model = 

text = """
I would like to thank Dr. Johnson in Biochemistry for all her mentorship and guidance in my program. Also, Dr. Zhao taught the best class I took at the university. 
"""

#Specify what we want to extract
labels = ["person", "subject"] #any entity will work -- departments, organizations, etc.

#Call the model to find our entities
entities = model.predict_entities(text, labels)

#Print the resulting data
for entity in entities:
    print(entity["text"], "=>", entity["label"])

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Dr. Johnson => person
Biochemistry => subject
Dr. Zhao => person


In [35]:
#Here we'll loop through a number of student responses to extract all the names
#Import data -- just an excel file with student responses and some data about the students

df = pd.read_excel("fake_student_kudos.xlsx")  
#print(df.head())


#select labels
labels = ["person"]


#Here we loop through the responses and create a new dataframe that will keep the student's department, the kudos text
#the score (how confident the model is that this is a person), a question ID, and we'll extract the last name, which we'll use to join later
full_kudos_df = pd.DataFrame(columns = ['student_dept', 'kudos_text', 'person', 'score', 'question_id', 'last_name'])
for i in range(len(df)):     
    text = df['text'].iloc[i]
    entities = model.predict_entities(text, labels)
    if entities == []: #Including this in case the model doesn't find any people in one response
        entity_df = pd.DataFrame(columns=['start', 'end', 'text', 'label', 'score'])
    else: 
        entity_df = pd.DataFrame(entities)
    entity_df.rename(columns = {'text':'person'}, inplace=True)
    entity_df['question_id'] = df['question_id'].iloc[i]
    entity_df['student_dept'] = df['student_department'].iloc[i]
    entity_df['kudos_text'] = df['text'].iloc[i]
    entity_df['last_name'] = entity_df['person'].str.split(' ').str[-1]
    short_entity_df = entity_df[['student_dept', 'kudos_text', 'person', 'score', 'question_id', 'last_name']]
    full_kudos_df = pd.concat([full_kudos_df, short_entity_df], ignore_index=True, axis=0)


print(full_kudos_df)

  full_kudos_df = pd.concat([full_kudos_df, short_entity_df], ignore_index=True, axis=0)


   student_dept                                         kudos_text  \
0  Biochemistry  I would like to thank Dr. Johnson in Biochemis...   
1  Biochemistry  I would like to thank Dr. Johnson in Biochemis...   
2       English  Dr.smith was a great professor – I lerned so m...   
3     Economics  Kevin Shovanic ruined my college experience wi...   

           person     score question_id last_name  
0     Dr. Johnson  0.982321           1   Johnson  
1        Dr. Zhao  0.972869           1      Zhao  
2        Dr.smith  0.994968           2  Dr.smith  
3  Kevin Shovanic  0.983509           3  Shovanic  


In [36]:
#some basic cleaning -- you will have to update as new issues become apparent
#This first one just looks for missing spaces with the .
update_name = []
for i in range(len(full_kudos_df)):
    if "." in full_kudos_df["last_name"].iloc[i]:
        check = full_kudos_df["last_name"].iloc[i].split(".", 1)[-1]
        update_name.append(check)
    else:
        check = full_kudos_df["last_name"].iloc[i]
        update_name.append(check)

full_kudos_df['last_name'] = update_name
full_kudos_df = full_kudos_df[full_kudos_df.last_name != "I"]
full_kudos_df = full_kudos_df[full_kudos_df.last_name != "me"]
full_kudos_df.head()
    


Unnamed: 0,student_dept,kudos_text,person,score,question_id,last_name
0,Biochemistry,I would like to thank Dr. Johnson in Biochemis...,Dr. Johnson,0.982321,1,Johnson
1,Biochemistry,I would like to thank Dr. Johnson in Biochemis...,Dr. Zhao,0.972869,1,Zhao
2,English,Dr.smith was a great professor – I lerned so m...,Dr.smith,0.994968,2,smith
3,Economics,Kevin Shovanic ruined my college experience wi...,Kevin Shovanic,0.983509,3,Shovanic


In [59]:
#Now we're going to bring in our list of employees to match the names
emp_df = pd.read_excel("NER/fake_employee_data.xlsx")  
emp_df.head(10)

Unnamed: 0,employee_first_name,employee_last_name,employee_department
0,Eliza,smith,Chemistry
1,Ryan,Smith,English
2,Donna,Smith,Accounting
3,Kevin,Chovanec,Institutional Resarch
4,Xin,Zhao,Biochemistry
5,Susan,Johnson,Electrical Engineering


In [60]:
#And then join this to the entities dataframe on last names
join_df = full_kudos_df.merge(emp_df, left_on=full_kudos_df['last_name'].str.lower(), right_on=emp_df['employee_last_name'].str.lower(), how='left')
join_df.head(10)

Unnamed: 0,key_0,student_dept,kudos_text,person,score,question_id,last_name,employee_first_name,employee_last_name,employee_department
0,johnson,Biochemistry,I would like to thank Dr. Johnson in Biochemis...,Dr. Johnson,0.982321,1,Johnson,Susan,Johnson,Electrical Engineering
1,zhao,Biochemistry,I would like to thank Dr. Johnson in Biochemis...,Dr. Zhao,0.972869,1,Zhao,Xin,Zhao,Biochemistry
2,smith,English,Dr.smith was a great professor – I lerned so m...,Dr.smith,0.994968,2,smith,Eliza,smith,Chemistry
3,smith,English,Dr.smith was a great professor – I lerned so m...,Dr.smith,0.994968,2,smith,Ryan,Smith,English
4,smith,English,Dr.smith was a great professor – I lerned so m...,Dr.smith,0.994968,2,smith,Donna,Smith,Accounting
5,shovanic,Economics,Kevin Shovanic ruined my college experience wi...,Kevin Shovanic,0.983509,3,Shovanic,,,


In [62]:
#Look at who is missing from the join
missing = join_df[join_df['employee_last_name'].isnull()]
missing.head()

Unnamed: 0,key_0,student_dept,kudos_text,person,score,question_id,last_name,employee_first_name,employee_last_name,employee_department
5,shovanic,Economics,Kevin Shovanic ruined my college experience wi...,Kevin Shovanic,0.983509,3,Shovanic,,,


In [63]:
#Then, we can try to fix missing names
#I imagine we run the sequence matcher but also check

#name is the name pulled from the comment; emp name is a list of names from our emp file; threshold sets how closely related they have to be to update
import difflib

def name_replace(name, emp_name, threshold=.70):
    updated_name = ''
    max_val = 0
    for i in range(len(emp_name)):
        #print(emp_name[i])
        #print(difflib.SequenceMatcher(None, name, emp_name[i]).ratio())
        if difflib.SequenceMatcher(None, name, emp_name[i]).ratio() >= threshold and difflib.SequenceMatcher(None, name, emp_name[i]).ratio() > max_val:
            updated_name = emp_name[i] 
            #print(updated_name)
            max_val = difflib.SequenceMatcher(None, name, emp_name[i]).ratio()
        #Adding a check for when they're tied -- go with the name with the same first letter
        if difflib.SequenceMatcher(None, name, emp_name[i]).ratio() >= threshold and difflib.SequenceMatcher(None, name, emp_name[i]).ratio() == max_val:
            if name[0].lower == emp_name[i][0].lower:
                updated_name = emp_name[i] 
                #print(updated_name)
                max_val = difflib.SequenceMatcher(None, name, emp_name[i]).ratio()                
    return updated_name

In [64]:
#Try to update last names to check for typos/misspelled names
last_names_list = emp_df['employee_last_name'].tolist()

updated_names = []
for nm in range(len(missing)):
    name = missing['last_name'].iloc[nm]
    update = name_replace(name.title(), last_names_list, threshold=.7)
    updated_names.append(update)

missing['updated_names'] = updated_names
missing = missing.drop(['employee_first_name', 'employee_last_name', 'employee_department'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing['updated_names'] = updated_names


In [65]:
#rejoining to the employee data based on updated name
updated_join = missing.merge(emp_df, left_on='updated_names'.lower(), right_on='employee_last_name'.lower(), how='left')

#adding an updated name flag so we can check all updated names
updated_join['updated_name_flag'] = 1
updated_join.head()

Unnamed: 0,key_0,student_dept,kudos_text,person,score,question_id,last_name,updated_names,employee_first_name,employee_last_name,employee_department,updated_name_flag
0,shovanic,Economics,Kevin Shovanic ruined my college experience wi...,Kevin Shovanic,0.983509,3,Shovanic,Chovanec,Kevin,Chovanec,Institutional Resarch,1


In [75]:
#creating a df of just the original names to merge back with the updated names
init = join_df.dropna(subset=['employee_last_name'])

#adding updated name flag
init['updated_name_flag'] = 0
init

#rejoin the original to the updates
final_df = pd.concat([updated_join, init], ignore_index=True, axis=0)
final_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  init['updated_name_flag'] = 0


Unnamed: 0,key_0,student_dept,kudos_text,person,score,question_id,last_name,updated_names,employee_first_name,employee_last_name,employee_department,updated_name_flag
0,shovanic,Economics,Kevin Shovanic ruined my college experience wi...,Kevin Shovanic,0.983509,3,Shovanic,Chovanec,Kevin,Chovanec,Institutional Resarch,1
1,johnson,Biochemistry,I would like to thank Dr. Johnson in Biochemis...,Dr. Johnson,0.982321,1,Johnson,,Susan,Johnson,Electrical Engineering,0
2,zhao,Biochemistry,I would like to thank Dr. Johnson in Biochemis...,Dr. Zhao,0.972869,1,Zhao,,Xin,Zhao,Biochemistry,0
3,smith,English,Dr.smith was a great professor – I lerned so m...,Dr.smith,0.994968,2,smith,,Eliza,smith,Chemistry,0
4,smith,English,Dr.smith was a great professor – I lerned so m...,Dr.smith,0.994968,2,smith,,Ryan,Smith,English,0


In [76]:
#Look for anyone with a first name mentioned in the extracted entity
final_df['first_name_match'] = final_df.apply(lambda row: str(row.employee_first_name) in row.person, axis=1).astype(int)

#Create an indicator for first name matches and join back to original dataframe
first_names_check = final_df.groupby(['person', 'question_id'])['first_name_match'].sum()
final_df = final_df.merge(first_names_check, left_on=['person', 'question_id'], right_on=['person', 'question_id'], how='left')

#drop any non first name matching rows that are duplicates of a extracted entity with a first name match
final_df = final_df.drop(final_df[(final_df.first_name_match_x == 0) & (final_df.first_name_match_y == 1)].index)

#Look for anyone with the same department as the student's major
final_df['department_match'] = final_df.apply(lambda row: row.student_dept == row.employee_department, axis=1).astype(int)

#Create an indicator for department matches and join back to original dataframe
department_check = final_df.groupby(['person', 'question_id'])['department_match'].sum()
final_df = final_df.merge(department_check, left_on=['person', 'question_id'], right_on=['person', 'question_id'], how='left')

#drop any non first name matching rows that are duplicates of a extracted entity with a first name match
final_df = final_df.drop(final_df[(final_df.department_match_x == 0) & (final_df.department_match_y == 1)].index)

#Dropping the columns we used to delete rows and adding a dup flag, just in case our tests didn't eliminate all duplicates
final_df = final_df.drop(columns=['first_name_match_x', 'first_name_match_y', 'department_match_x', 'department_match_y'])
final_df['dups'] = final_df.duplicated(subset=['text', 'question_id'], keep=False).astype(int)
final_df


Unnamed: 0,key_0,student_dept,kudos_text,person,score,question_id,last_name,updated_names,employee_first_name,employee_last_name,employee_department,updated_name_flag,first_name_match_x,first_name_match_y,department_match_x,department_match_y
0,shovanic,Economics,Kevin Shovanic ruined my college experience wi...,Kevin Shovanic,0.983509,3,Shovanic,Chovanec,Kevin,Chovanec,Institutional Resarch,1,1,1,0,0
1,johnson,Biochemistry,I would like to thank Dr. Johnson in Biochemis...,Dr. Johnson,0.982321,1,Johnson,,Susan,Johnson,Electrical Engineering,0,0,0,0,0
2,zhao,Biochemistry,I would like to thank Dr. Johnson in Biochemis...,Dr. Zhao,0.972869,1,Zhao,,Xin,Zhao,Biochemistry,0,0,0,1,1
4,smith,English,Dr.smith was a great professor – I lerned so m...,Dr.smith,0.994968,2,smith,,Ryan,Smith,English,0,0,0,1,1


In [77]:
#finally, we could also run a quick sentiment analysis to make sure we aren't including negative comments

from transformers import pipeline

distilled_student_sentiment_classifier = pipeline(
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student", 
    return_all_scores=True
)

#If running sentiment analysis seperately, reimport the datafile
#df = pd.read_excel("fake_student_kudos")  



In [80]:
negative_ind = []
for txt in range(len(df)):   #len(kudos_df)
    text = df['text'].iloc[txt]
    scores = distilled_student_sentiment_classifier(text[:2400])
    negative = list(scores[0][2].values())[1]
    if negative >= .5:
        negative_flag = 1
    else:
        negative_flag = 0 
    negative_ind.append(negative_flag)
    print (df['text'].iloc[txt])
    print (scores[0][2])

I would like to thank Dr. Johnson in Biochemistry for all her mentorship and guidance in my program. Also, Dr. Zhao taught the best class I took at the university. 
{'label': 'negative', 'score': 0.09638964384794235}
Dr.smith was a great professor – I lerned so much about clear, descriptive writing and good grammer in the class! 
{'label': 'negative', 'score': 0.024455105885863304}
Kevin Shovanic ruined my college experience with inscrutable dashboards
{'label': 'negative', 'score': 0.5189586877822876}
