In [2]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import numpy as np
from collections import Counter
from tqdm import tqdm
import pickle

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

def preprocess(text, tokenizer):
    words = tokenizer.tokenize(text.lower())
    return [w for w in words if w not in stop_words]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def text_to_bert_embedding(text, model, tokenizer):
    model = model.to(device)
    try:
        tokens = tokenizer.encode(text, add_special_tokens=True)
        input_ids = torch.tensor(tokens).unsqueeze(0).to(device)
        with torch.no_grad():
            outputs = model(input_ids)
        embedding = torch.mean(outputs.last_hidden_state, dim=1).cpu().numpy()
        return embedding
    except:
        return [[]]


In [5]:
df= pd.read_csv(r"./major.csv")
df.dropna(inplace=True)
df = df[df['major_field'] != 'education']
df = df[df['major_field'] != 'law']
df = df[df['major_field'] != 'civil.engineering']
df = df[df['major_field'] != 'geography']



In [6]:
df['major_field'].unique()

array(['linguistics', 'economics', 'history', 'architecture_art',
       'electrical & comp. engineering', 'mechanical.engineering',
       'psychology', 'english.language.and.literature',
       'mathematics_statistic', 'sociology', 'biology', 'medicine_health',
       'archeology', 'chemical.engineering', 'development.studies',
       'political.science', 'accounting.and.finance', 'philosophy',
       'physics', 'agriculture_environment', 'communication_info'],
      dtype=object)

In [44]:
from sklearn.model_selection import train_test_split
import pandas as pd
min_data_per_university = df.groupby('University').size().min()

# Create an empty list to store the sampled data
sampled_data = []

# Iterate over unique universities
for university in df['University'].unique():
    # Get the data for the current university
    university_data = df[df['University'] == university]
    
    # If the current university has more data than 'min_data_per_university', sample 'min_data_per_university' rows
    if len(university_data) > min_data_per_university:
        sampled_university_data = university_data.sample(min_data_per_university, random_state=42)
    else:
        sampled_university_data = university_data
    
    sampled_data.append(sampled_university_data)
balanced_df = pd.concat(sampled_data)

train_df, test_df = train_test_split(balanced_df, test_size=0.1, random_state=42)

test_df.to_csv('major_test.csv', index=False)

train_df.shape

(12875, 7)

In [48]:
tmp = []
for item in train_df["reviews_lemmatized"].apply(lambda x: x.split()):
    tmp.extend(item)
users_stop_words = []
for _ in Counter(tmp).most_common(30):
    if _[0] in stopwords.words('english'):
        continue
    else:
        users_stop_words.append(_[0])
stop_words = set(stopwords.words('english')+users_stop_words)

with open('stop_words.pkl', 'wb') as file:
    pickle.dump(stop_words, file)

with open('stop_words.pkl', 'rb') as file:
    stop_words = pickle.load(file)
tqdm.pandas()
embeddings_2 = train_df["reviews_lemmatized"].progress_apply(lambda x: text_to_bert_embedding(preprocess(x, tokenizer), model, tokenizer))
data_to_save = {
    'embeddings': np.vstack(embeddings_2),
    'reviews':  train_df["reviews_lemmatized"].to_numpy(),
    "University":  train_df["University"].to_numpy(),
    "Professor":train_df["Professor"].to_numpy(),
    "major_field": train_df["major_field"].to_numpy(),
    "course": train_df["course"].to_numpy(),
    "created": train_df["created"].to_numpy(),
}

np.save('major_embedding_bert.npy', data_to_save)

100%|████████████████████████████████████████████████████████████████████████████| 12875/12875 [24:30<00:00,  8.75it/s]


In [74]:
import numpy as np
import zipfile
import os
embedding_file='major_embedding_bert.zip'
def load_reviews(zip_file_path):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall("extracted_data")
        npy_file = [f for f in os.listdir("extracted_data") if f.endswith('.npy')][0]
        npy_file_path = os.path.join("extracted_data", npy_file)
        data = np.load(npy_file_path, allow_pickle=True).item()
        os.remove(npy_file_path)
        os.rmdir("extracted_data")
    return data['embeddings'], data['reviews'], data['University'], data['Professor'],data['major_field'], data['course'], data['created']


def calculate_topic_probabilities_with_reviews(input_text, model, tokenizer, zip_file_path):
    embeddings, reviews, universities, professor, sub_topics, course, created_dates = load_reviews(zip_file_path)
    input_text_tokens = preprocess(input_text, tokenizer)
    input_text_embedding = text_to_bert_embedding(" ".join(input_text_tokens), model, tokenizer)
    similarities = cosine_similarity(input_text_embedding.reshape(1, -1), embeddings).flatten()

    topic_scores = {}
    topic_counts = {}
    
    for score, topic in zip(similarities, sub_topics):
        topic_scores[topic] = topic_scores.get(topic, 0) + score
        topic_counts[topic] = topic_counts.get(topic, 0) + 1
    for topic in topic_scores:
        topic_scores[topic] /= topic_counts[topic]

    top_topics = sorted(topic_scores, key=topic_scores.get, reverse=True)[:5]
    total_topic_score = sum(topic_scores[topic] for topic in top_topics)
    topic_probabilities = {topic: topic_scores[topic] / total_topic_score for topic in top_topics}

    filtered_indices = [i for i, topic in enumerate(sub_topics) if topic in top_topics]
    filtered_data = {
        'major_field': [sub_topics[i] for i in filtered_indices],
        'review': [reviews[i] for i in filtered_indices],
        'similarity_score': [similarities[i] for i in filtered_indices],
        'University': [universities[i] for i in filtered_indices],
        'professor': [professor[i] for i in filtered_indices],
        'course': [course[i] for i in filtered_indices],
        'created': [created_dates[i] for i in filtered_indices],
        'topic_probability': [topic_probabilities[sub_topics[i]] for i in filtered_indices]
    }

    filtered_df = pd.DataFrame(filtered_data)

    return filtered_df
def evaluate_major(input_text, model, tokenizer, zip_file_path, input_major):
    df = calculate_topic_probabilities_with_reviews(input_text, model, tokenizer, zip_file_path)
    majors_prob_df = df.groupby('major_field')['topic_probability'].mean()
    sorted_majors = majors_prob_df.sort_values(ascending=False)
    ranking = None
    if input_major in sorted_majors.index:
        ranking = sorted_majors.index.tolist().index(input_major) + 1
    if ranking is None:
        assessment = 'bad'
    elif ranking <= 3:
        assessment = 'perfect'
    elif ranking <= 5:
        assessment = 'good'
    elif ranking <= 10:
        assessment = 'reasonable'
    else:
        assessment = 'bad'
    top_majors_list = sorted_majors.head(5).index.tolist()
    return assessment, top_majors_list
input_text = "I love working professor Matthew Potts, He is funny and knowledgeble"

a,b=evaluate_major(input_text, model, tokenizer, embedding_file,'chemical.engineering')
b[0]

'chemical.engineering'

In [75]:
def calculate_weighted_average_similarity(input_text, model, tokenizer, embedding_file):
    filtered_df = calculate_topic_probabilities_with_reviews(input_text, model, tokenizer, embedding_file)
    
    filtered_df['University'] = filtered_df['University'].astype(str)
    filtered_df['major_field'] = filtered_df['major_field'].astype(str)

    mean_scores = filtered_df.groupby(['University', 'major_field']).agg({'similarity_score': 'mean'}).reset_index()
    mean_scores['weighted_score'] = mean_scores.apply(
        lambda x: x['similarity_score'] * filtered_df[filtered_df['major_field'] == x['major_field']]['topic_probability'].iloc[0], 
        axis=1
    )
    was_scores = mean_scores.groupby('University')['weighted_score'].sum().reset_index().rename(columns={'weighted_score': 'WAS'})

    total_was = was_scores['WAS'].sum()
    was_scores['WAS'] = was_scores['WAS'] / total_was

    highest_prob_topic = mean_scores.loc[mean_scores.groupby('University')['similarity_score'].idxmax()][['University', 'major_field']]
    highest_prob_topic.columns = ['University', 'highest_prob_major']
    highest_prob_topic['University'] = highest_prob_topic['University'].astype(str)
    highest_prob_topic['highest_prob_major'] = highest_prob_topic['highest_prob_major'].astype(str)

    result = pd.merge(was_scores, highest_prob_topic, on='University')

    relevant_review = filtered_df.loc[filtered_df.groupby(['University', 'major_field'])['similarity_score'].idxmax()][['University', 'major_field', 'review','professor', 'course', 'created']]
    relevant_review.columns = ['University', 'highest_prob_major', 'Most_Relevant_Review', 'Most_Relevant_faculty', 'Most_Relevant_course', 'Most_Relevant_Created']
    relevant_review['University'] = relevant_review['University'].astype(str)
    relevant_review['highest_prob_major'] = relevant_review['highest_prob_major'].astype(str)

    final_result = pd.merge(result, relevant_review, on=['University', 'highest_prob_major'])

    return final_result[['University', 'WAS', 'highest_prob_major', 'Most_Relevant_Review','Most_Relevant_faculty', 'Most_Relevant_course', 'Most_Relevant_Created']]

def calculate_confidence_score(input_text, model, tokenizer, embedding_file):
    df = calculate_topic_probabilities_with_reviews(input_text, model, tokenizer, embedding_file)    
    topic_counts = df['major_field'].value_counts()
    total_data_points = len(df)
    topic_portions = topic_counts / total_data_points
    topic_probabilities = df['topic_probability'].groupby(df['major_field']).mean()
    weighted_average = sum(topic_portions * topic_probabilities)
    return weighted_average


input_text = "I love working professor Matthew Potts, He is funny and knowledgeble"
calculate_weighted_average_similarity(input_text, model, tokenizer, embedding_file)

Unnamed: 0,University,WAS,highest_prob_major,Most_Relevant_Review,Most_Relevant_faculty,Most_Relevant_course,Most_Relevant_Created
0,Boston University,0.045704,chemical.engineering,"hey, hey, here's something ... y'all ever hear...",Rosina Georgiadis,CH109,2005-12-13 12:55:07 +0000 UTC
1,Brown University,0.04517,communication_info,stay away. lorin doesn't seem to care about te...,Lorin Crawford,PHP2605,2020-01-28 07:12:26 +0000 UTC
2,Carnegie Mellon University,0.044022,biology,"seems nice, but doesn't seem like a great teac...",Linda Robic,3121,2014-11-21 23:53:42 +0000 UTC
3,Columbia University,0.041757,biology,40% of your grade will be based on blog posts ...,Lili Yamasaki,GU4300,2021-03-24 18:11:44 +0000 UTC
4,Cornell University,0.045517,biology,take this class!! mrs calliaud is literally th...,Marina Caillaud,BSOC2100,2023-09-26 23:52:39 +0000 UTC
5,Duke University,0.036281,english.language.and.literature,a bit of a stickler on grading papers but she ...,Nancy Mullenneaux,WRIT20,2011-11-22 14:06:31 +0000 UTC
6,Harvard University,0.043033,sociology,"garfinkle, funny name, funny professor.don't k...",Paul Garfinkle,WS499,2006-09-08 23:25:03 +0000 UTC
7,Johns Hopkins University,0.043773,chemical.engineering,i don't think the class should be more credits...,Louise Pasternack,CHEMLAB,2007-06-12 12:16:01 +0000 UTC
8,Massachusetts Institute of Technology,0.034145,chemical.engineering,i agree so much w/the comment about patronizin...,Janet Schrenk,5111,2004-06-13 20:54:18 +0000 UTC
9,New York University,0.045498,chemical.engineering,yoel is an incredible ta. his recitations are ...,Yoel Ohayon,GENCHEM126,2017-05-29 16:01:34 +0000 UTC


In [67]:
calculate_confidence_score(input_text, model, tokenizer, embedding_file)

0.200118808712514

In [77]:
from tqdm import tqdm
test_df= pd.read_csv(r"test.csv", encoding='ISO-8859-1')
predicted_universities = []
for review in tqdm(test_df['reviews_lemmatized'], desc="Processing Reviews"):
    preprocessed_review = " ".join(review.lower().split())
    result = calculate_weighted_average_similarity(preprocessed_review, model, tokenizer, embedding_file)
    top_universities = result.nlargest(5, 'WAS')['University'].tolist()
    predicted_universities.append(top_universities)
test_df['Predicted_University'] = predicted_universities
test_df['Match'] = test_df.apply(lambda row: row['University'] in row['Predicted_University'], axis=1)
accuracy = test_df['Match'].sum() / len(test_df)
print("Accuracy:", accuracy)

Processing Reviews: 100%|████████████████████████████████████████████████████████████| 489/489 [05:52<00:00,  1.39it/s]

Accuracy: 0.48261758691206547





------

In [1]:

from major_embeddings import EmbeddingsCalculator
from tqdm.auto import tqdm

ta = EmbeddingsCalculator()
input_text = "I love working professor Matthew Potts, He is funny and knowledgeble"
ta.calculate_topic_probabilities_with_reviews(input_text)


  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'course' is not defined