In [18]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import numpy as np
from collections import Counter
from tqdm import tqdm
import pickle
import numpy as np
import zipfile
import os
# Pretrained checkpoint name shared by the tokenizer and the encoder below.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
# Stop-word collection consulted by `preprocess`. A later cell in this notebook
# writes a file with this same name, so on a fresh checkout that cell has to
# have been run at least once before this one.
# NOTE(review): pickle.load can execute arbitrary code; only load a file you produced yourself.
with open('all_stop_words.pkl', 'rb') as file:
    stop_words = pickle.load(file)
def preprocess(text, tokenizer):
    """Lower-case `text`, tokenize it, and drop every token found in `stop_words`.

    Args:
        text: String to tokenize.
        tokenizer: Object exposing `tokenize(str)` that returns an iterable of tokens.

    Returns:
        List of the remaining tokens, in their original order.
    """
    kept_tokens = []
    for token in tokenizer.tokenize(text.lower()):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def text_to_bert_embedding(text, model, tokenizer):
    """Embed `text` as the mean of the model's last hidden states.

    Args:
        text: Input handed unchanged to `tokenizer.encode`.
        model: Encoder whose output exposes `last_hidden_state`; it is moved to
            the module-level `device` on every call.
        tokenizer: Object exposing `encode(...)` that returns a list of token ids.

    Returns:
        A numpy array with one row (the hidden states averaged over dim 1), or
        the placeholder `[[]]` when encoding or the forward pass fails. Callers
        must check for the empty placeholder before stacking or reshaping.
    """
    import warnings

    model = model.to(device)
    encode_kwargs = {"add_special_tokens": True}
    # Inputs longer than the encoder's position limit would fail in the forward
    # pass and silently turn into the empty placeholder; truncate them instead.
    max_positions = getattr(getattr(model, "config", None), "max_position_embeddings", None)
    if max_positions is not None:
        encode_kwargs.update(truncation=True, max_length=max_positions)
    try:
        tokens = tokenizer.encode(text, **encode_kwargs)
        input_ids = torch.tensor(tokens).unsqueeze(0).to(device)
        with torch.no_grad():
            outputs = model(input_ids)
        embedding = torch.mean(outputs.last_hidden_state, dim=1).cpu().numpy()
        return embedding
    except Exception as exc:
        # `Exception` rather than a bare except, so Ctrl-C still interrupts a
        # long embedding loop; the failure is reported instead of hidden.
        warnings.warn(f"text_to_bert_embedding failed ({exc!r}); returning an empty embedding")
        return [[]]

In [4]:

# Load the two review exports; both files are read as ISO-8859-1.
df, df_all = (
    pd.read_csv(csv_path, encoding='ISO-8859-1')
    for csv_path in (r"./majorreview.csv", r"./alltext.csv")
)


In [5]:
from sklearn.model_selection import train_test_split
import pandas as pd
# Number of non-null lemmatized reviews per sub_topic; the smallest count is the
# sample size drawn from every topic so that all topics end up equally sized.
university_review_counts = df_all['reviews_lemmatized'].groupby(df_all['sub_topic']).count()
min_review_count = university_review_counts.min()

# Topics are visited in first-appearance order and sampled with a fixed seed;
# the concatenation order feeds the seeded split below.
sampled_data = [
    df_all.loc[df_all['sub_topic'] == topic].sample(min_review_count, random_state=42)
    for topic in df_all['sub_topic'].unique()
]
balanced_df_all = pd.concat(sampled_data)

# 90/10 split, stratified by sub_topic, persisted next to the notebook.
train_df, test_df = train_test_split(
    balanced_df_all,
    test_size=0.1,
    stratify=balanced_df_all['sub_topic'],
    random_state=42,
)
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)

In [14]:
# Rows per sub_topic in each split, computed once per split.
train_topic_counts = train_df.groupby('sub_topic').size()
test_topic_counts = test_df.groupby('sub_topic').size()
# `topic_counts` previously ended up holding the test counts (its train
# assignment was immediately overwritten); keep that final value so cells that
# still print `topic_counts` behave as before.
topic_counts = test_topic_counts
print(train_topic_counts)
print(test_topic_counts)

sub_topic
Administration and school policies    489
Admission process                     489
Career opportunities                  489
Diversity and inclusion               488
Financial aid and scholarships        488
General academic quality              489
Online learning                       488
Student opportunities                 489
Technology and computer labs          489
dtype: int64
sub_topic
Administration and school policies    489
Admission process                     489
Career opportunities                  489
Diversity and inclusion               488
Financial aid and scholarships        488
General academic quality              489
Online learning                       488
Student opportunities                 489
Technology and computer labs          489
dtype: int64


In [13]:

# Show both count tables, one after the other.
print(topic_counts, test_topic_counts, sep='\n')

sub_topic
Administration and school policies    54
Admission process                     54
Career opportunities                  54
Diversity and inclusion               55
Financial aid and scholarships        55
General academic quality              54
Online learning                       55
Student opportunities                 54
Technology and computer labs          54
dtype: int64
sub_topic
Administration and school policies    54
Admission process                     54
Career opportunities                  54
Diversity and inclusion               55
Financial aid and scholarships        55
General academic quality              54
Online learning                       55
Student opportunities                 54
Technology and computer labs          54
dtype: int64


In [9]:
# Count every whitespace-separated word in the training reviews without first
# materialising one giant token list.
word_counts = Counter()
for review_words in train_df["reviews_lemmatized"].astype(str).str.split():
    word_counts.update(review_words)

# Read the NLTK list once (it was previously re-read for every candidate word)
# and use a set for membership checks.
english_stop_words = set(stopwords.words('english'))

# Corpus-specific stop words: the 30 most frequent words that are not already
# English stop words, in frequency order.
users_stop_words = [
    word
    for word, _count in word_counts.most_common(30)
    if word not in english_stop_words
]

stop_words = english_stop_words | set(users_stop_words)
with open('all_stop_words.pkl', 'wb') as file:
    pickle.dump(stop_words, file)

In [58]:
# Reload the stop words consulted by `preprocess`.
# NOTE(review): pickle.load can execute arbitrary code; only load a file you produced yourself.
with open('all_stop_words.pkl', 'rb') as file:
    stop_words = pickle.load(file)

# Embed every training review. `preprocess` returns a token list, which is
# handed to `text_to_bert_embedding` as-is.
tqdm.pandas()
embeddings_2 = train_df["reviews_lemmatized"].progress_apply(lambda x: text_to_bert_embedding(preprocess(x, tokenizer), model, tokenizer))

# `text_to_bert_embedding` returns the empty placeholder [[]] on failure. A
# single such row used to make np.vstack raise after the whole run, so drop the
# failed rows and keep every metadata column aligned with the kept embeddings.
has_embedding = embeddings_2.apply(lambda emb: np.size(emb) > 0).to_numpy(dtype=bool)
failed_count = int((~has_embedding).sum())
if failed_count:
    print(f"Dropping {failed_count} review(s) whose embedding could not be computed")
embedded_df = train_df.loc[has_embedding]

# np.save stores the dict as a pickled object array; load it back with
# np.load(..., allow_pickle=True).item().
np.save('reviews_embedding_bert.npy',
        {'embeddings': np.vstack(embeddings_2[has_embedding].tolist()),
         'University': embedded_df["University"].to_numpy(),
         'reviews': embedded_df["reviews_lemmatized"].to_numpy(),
         'sub_topic': embedded_df["sub_topic"].to_numpy(),
         'author': embedded_df["author"].to_numpy(),
         'created': embedded_df["created"].to_numpy()})

100%|██████████████████████████████████████████████████████████████████████████████| 8299/8299 [14:23<00:00,  9.61it/s]


In [24]:
import numpy as np
import zipfile
import os
embedding_file='reviews_embedding_bert.zip'
def load_reviews(zip_file_path):
    """Load the saved embedding dictionary from a zip archive.

    The first `.npy` member of the archive is read directly into memory, so
    nothing is extracted to disk and other archive members are ignored.

    Args:
        zip_file_path: Path to a zip archive containing a `.npy` file produced
            by `np.save` on a dict with the keys read below.

    Returns:
        Tuple `(embeddings, reviews, University, sub_topic, author, created)`
        taken from the stored dict.

    Raises:
        FileNotFoundError: If the archive contains no `.npy` member.
    """
    import io

    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        npy_members = [name for name in zip_ref.namelist() if name.endswith('.npy')]
        if not npy_members:
            raise FileNotFoundError(f"No .npy file found inside {zip_file_path!r}")
        npy_bytes = io.BytesIO(zip_ref.read(npy_members[0]))
    # NOTE(review): allow_pickle=True unpickles the payload and can execute
    # arbitrary code; only load archives you created yourself.
    data = np.load(npy_bytes, allow_pickle=True).item()
    return data['embeddings'], data['reviews'], data['University'], data['sub_topic'], data['author'], data['created']

def calculate_topic_probabilities_with_reviews(input_text, model, tokenizer, zip_file_path):
    """Score stored reviews against `input_text` and keep those in the top three topics.

    Each stored review's cosine similarity to the query embedding is summed
    per sub_topic; the three highest-scoring topics are kept and their summed
    scores are normalised so they add up to 1.

    Args:
        input_text: Query string.
        model: Encoder passed to `text_to_bert_embedding`.
        tokenizer: Tokenizer passed to `preprocess` and `text_to_bert_embedding`.
        zip_file_path: Archive path handed to `load_reviews`.

    Returns:
        DataFrame with columns `sub_topic`, `review`, `similarity_score`,
        `university`, `author`, `created`, `topic_probability`, one row per
        stored review whose topic is among the top three.

    Raises:
        ValueError: If no embedding could be computed for `input_text`.
    """
    embeddings, reviews, universities, sub_topics, authors, created_dates = load_reviews(zip_file_path)
    input_text_tokens = preprocess(input_text, tokenizer)
    # NOTE(review): the tokens are re-joined into a string here, while the cell
    # that builds the stored embeddings passes the token list directly — TODO
    # confirm both paths produce the same token ids.
    input_text_embedding = np.asarray(
        text_to_bert_embedding(" ".join(input_text_tokens), model, tokenizer)
    )
    # `text_to_bert_embedding` signals failure with an empty placeholder; fail
    # with a clear message instead of an AttributeError on `.reshape`.
    if input_text_embedding.size == 0:
        raise ValueError("Could not compute an embedding for the input text")
    similarities = cosine_similarity(input_text_embedding.reshape(1, -1), embeddings).flatten()

    # Sum of similarities per topic.
    topic_scores = {}
    for score, topic in zip(similarities, sub_topics):
        topic_scores[topic] = topic_scores.get(topic, 0) + score

    # Keep the three best topics and normalise their scores among themselves.
    top_topics = sorted(topic_scores, key=topic_scores.get, reverse=True)[:3]
    total_topic_score = sum(topic_scores[topic] for topic in top_topics)
    topic_probabilities = {topic: topic_scores[topic] / total_topic_score for topic in top_topics}

    filtered_indices = [i for i, topic in enumerate(sub_topics) if topic in top_topics]
    filtered_data = {
        'sub_topic': [sub_topics[i] for i in filtered_indices],
        'review': [reviews[i] for i in filtered_indices],
        'similarity_score': [similarities[i] for i in filtered_indices],
        'university': [universities[i] for i in filtered_indices],
        'author': [authors[i] for i in filtered_indices],
        'created': [created_dates[i] for i in filtered_indices],
        'topic_probability': [topic_probabilities[sub_topics[i]] for i in filtered_indices]
    }

    filtered_df = pd.DataFrame(filtered_data)

    return filtered_df
def calculate_weighted_average_similarity(input_text, model, tokenizer, embedding_file):
    """Rank universities by a topic-weighted average similarity (WAS) to `input_text`.

    For every (university, sub_topic) pair the mean review similarity is
    multiplied by that topic's probability; the products are summed per
    university and then normalised so all WAS values add up to 1. Each
    university is also annotated with the topic that has its highest mean
    similarity and the single most similar review within that topic.

    Args:
        input_text: Query string.
        model: Encoder forwarded to `calculate_topic_probabilities_with_reviews`.
        tokenizer: Tokenizer forwarded to the same function.
        embedding_file: Archive path forwarded to the same function.

    Returns:
        DataFrame with columns `university`, `WAS`, `Highest_Prob_Topic`,
        `Most_Relevant_Review`, `Most_Relevant_Author`, `Most_Relevant_Created`.
    """
    filtered_df = calculate_topic_probabilities_with_reviews(input_text, model, tokenizer, embedding_file)

    # Normalise the key columns to str once; every frame derived below inherits it.
    filtered_df['university'] = filtered_df['university'].astype(str)
    filtered_df['sub_topic'] = filtered_df['sub_topic'].astype(str)

    mean_scores = filtered_df.groupby(['university', 'sub_topic']).agg({'similarity_score': 'mean'}).reset_index()

    # One probability per topic (the first row's value, constant within a topic),
    # looked up with `map` instead of re-filtering the frame for every row.
    topic_probability = (
        filtered_df.drop_duplicates('sub_topic').set_index('sub_topic')['topic_probability']
    )
    mean_scores['weighted_score'] = (
        mean_scores['similarity_score'] * mean_scores['sub_topic'].map(topic_probability)
    )
    was_scores = mean_scores.groupby('university')['weighted_score'].sum().reset_index().rename(columns={'weighted_score': 'WAS'})

    # Renormalizing the WAS scores
    total_was = was_scores['WAS'].sum()
    was_scores['WAS'] = was_scores['WAS'] / total_was

    # Topic with the highest mean similarity for each university.
    best_topic_rows = mean_scores.groupby('university')['similarity_score'].idxmax()
    highest_prob_topic = (
        mean_scores.loc[best_topic_rows, ['university', 'sub_topic']]
        .rename(columns={'sub_topic': 'Highest_Prob_Topic'})
    )

    result = pd.merge(was_scores, highest_prob_topic, on='university')

    # Most similar single review for each (university, topic) pair.
    best_review_rows = filtered_df.groupby(['university', 'sub_topic'])['similarity_score'].idxmax()
    relevant_review = (
        filtered_df.loc[best_review_rows, ['university', 'sub_topic', 'review', 'author', 'created']]
        .rename(columns={
            'sub_topic': 'Highest_Prob_Topic',
            'review': 'Most_Relevant_Review',
            'author': 'Most_Relevant_Author',
            'created': 'Most_Relevant_Created',
        })
    )

    final_result = pd.merge(result, relevant_review, on=['university', 'Highest_Prob_Topic'])

    return final_result[['university', 'WAS', 'Highest_Prob_Topic', 'Most_Relevant_Review', 'Most_Relevant_Author', 'Most_Relevant_Created']]

def calculate_confidence_score(input_text, model, tokenizer, embedding_file):
    """Average the topic probabilities, weighting each topic by its share of matched rows.

    Args:
        input_text: Query string.
        model: Encoder forwarded to `calculate_topic_probabilities_with_reviews`.
        tokenizer: Tokenizer forwarded to the same function.
        embedding_file: Archive path forwarded to the same function.

    Returns:
        Sum over topics of (rows in topic / all rows) * (mean topic_probability of the topic).
    """
    matches = calculate_topic_probabilities_with_reviews(input_text, model, tokenizer, embedding_file)
    # Fraction of the matched rows that belongs to each topic.
    row_share = matches['sub_topic'].value_counts() / len(matches)
    # Mean probability per topic; the product below aligns on the topic label.
    mean_probability = matches['topic_probability'].groupby(matches['sub_topic']).mean()
    return sum(row_share * mean_probability)


# Example query scored against the stored embeddings in `embedding_file`.
input_text = "I love multiple labs and well online learning in school"

# The trailing bare `b` makes the notebook render the resulting table.
b=calculate_weighted_average_similarity(input_text, model, tokenizer, embedding_file)
b

Unnamed: 0,university,WAS,Highest_Prob_Topic,Most_Relevant_Review,Most_Relevant_Author,Most_Relevant_Created
0,Boston University,0.043957,Online learning,most class transition well to online learning ...,Junior,2020/7/30
1,Brown University,0.043627,Online learning,the online experience have be positive obvious...,Sophomore,2020/8/19
2,Carnegie Mellon University,0.043229,Online learning,cmu do a solid job transition to online learn ...,Sophomore,2020/8/24
3,Columbia University,0.044245,Student opportunities,internship be well worth the time and they hel...,College Senior,2013/3/4
4,Cornell University,0.043431,Student opportunities,my major have the great career service departm...,College Sophomore,2014/1/8
5,Duke University,0.04501,Online learning,online learning have be difficult but my profe...,Graduate Student,2021/4/6
6,Harvard University,0.043458,Technology and computer labs,wifi everywhere computer lab in every dorm pri...,College Sophomore,2013/12/1
7,Johns Hopkins University,0.043244,Online learning,the online learning experience with john hopki...,Graduate Student,2021/1/20
8,Massachusetts Institute of Technology,0.029402,Student opportunities,mit be a great place to find research or biote...,College Senior,2014/8/14
9,New York University,0.04416,Online learning,nyu have be try but it isnt really do well wit...,Senior,2020/10/31


In [17]:
from embeddings import allEmbeddingsCalculator
from tqdm import tqdm
# Same kind of query, this time through the `allEmbeddingsCalculator` class
# imported from the local `embeddings` module (not shown in this notebook).
# NOTE(review): "libarary" looks like a typo for "library"; left unchanged
# because it is the text that was actually scored for the output below.
input_text = "love college with multiple libarary"
embeddings_calculator = allEmbeddingsCalculator()
b=embeddings_calculator.calculate_weighted_average_similarity(input_text)
# NOTE(review): `a` is assigned but neither used nor displayed in this cell.
a=embeddings_calculator.calculate_confidence_score(input_text)
b.head()

Unnamed: 0,University,WAS,Highest_Prob_Topic,Most_Relevant_Review,Most_Relevant_Author,Most_Relevant_Created
0,Boston University,0.044081,Campus party scene,mit definitely have the best night time event ...,College Freshman,2014/3/15
1,Brown University,0.044276,Online learning,i havent do any online learning yet but the fe...,Freshman,2020/6/25
2,Carnegie Mellon University,0.04434,Campus party scene,what youre expect florida state dont get me wr...,College Freshman,2011/11/6
3,Columbia University,0.043259,Campus party scene,the bar be pretty lame but occasionally they b...,College Sophomore,2014/3/11
4,Cornell University,0.043806,Online learning,horrible rampant cheating and no way to stop i...,Junior,2020/7/31


In [11]:
# Top-5 hit rate: a review counts as a match when its own university is among
# the five universities with the highest WAS for that review's text.
test_df = pd.read_csv(r"./text.csv", encoding='ISO-8859-1')
from embeddings import allEmbeddingsCalculator
from tqdm.auto import tqdm
embeddings_calculator = allEmbeddingsCalculator()

# One list of five university names per review, in row order.
predicted_universities = [
    embeddings_calculator
    .calculate_weighted_average_similarity(" ".join(review.lower().split()))
    .nlargest(5, 'WAS')['University']
    .tolist()
    for review in tqdm(test_df['reviews_lemmatized'], desc="Processing Reviews")
]
test_df['Predicted_University'] = predicted_universities
test_df['Match'] = [
    actual in predicted
    for actual, predicted in zip(test_df['University'], test_df['Predicted_University'])
]
accuracy = test_df['Match'].sum() / len(test_df)
print("Accuracy:", accuracy)

Accuracy: 0.4130879345603272


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Predicted_University'] = predicted_universities
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Match'] = test_df.apply(lambda row: row['University'] in row['Predicted_University'], axis=1)


In [None]:
# Top-7 hit rate: a review counts as a match when its own university is among
# the seven universities with the highest WAS for that review's text.
test_df = pd.read_csv(r"./text.csv", encoding='ISO-8859-1')
from embeddings import allEmbeddingsCalculator
from tqdm.auto import tqdm
embeddings_calculator = allEmbeddingsCalculator()

# One list of seven university names per review, in row order.
predicted_universities = [
    embeddings_calculator
    .calculate_weighted_average_similarity(" ".join(review.lower().split()))
    .nlargest(7, 'WAS')['University']
    .tolist()
    for review in tqdm(test_df['reviews_lemmatized'], desc="Processing Reviews")
]
test_df['Predicted_University'] = predicted_universities
test_df['Match'] = [
    actual in predicted
    for actual, predicted in zip(test_df['University'], test_df['Predicted_University'])
]
accuracy = test_df['Match'].sum() / len(test_df)
print("Accuracy:", accuracy)

In [7]:
from embeddings import allEmbeddingsCalculator
from tqdm import tqdm
import pandas as pd
import numpy as np
# Per-sub_topic hit rate using `general_scores` from the local `embeddings` module.
# NOTE(review): `test_df` is not created in this cell; it must already exist
# from an earlier cell.
embeddings_calculator = allEmbeddingsCalculator()
predicted_universities = []
for review in tqdm(test_df['reviews_lemmatized'], desc="Processing Reviews"):
    # Lower-case and collapse runs of whitespace.
    preprocessed_review = " ".join(review.lower().split())
    result = embeddings_calculator.general_scores(preprocessed_review)
    
    # Takes the first five *index labels* after sorting by WAS.
    # NOTE(review): presumably `general_scores` returns a frame indexed by
    # university name — confirm against the `embeddings` module.
    top_universities = result.sort_values(by='WAS', ascending=False).index[:5]
    predicted_universities.append(top_universities)

test_df['Predicted_University'] = predicted_universities
# A row matches when its own university is among its predicted labels.
test_df['Match'] = test_df.apply(lambda row: row['University'] in row['Predicted_University'], axis=1)
# Fraction of matching rows within each sub_topic.
accuracy_per_sub_topic = test_df.groupby('sub_topic').apply(lambda group: group['Match'].sum() / len(group))
print("Accuracy per Sub-Topic:")
print(accuracy_per_sub_topic)
def weighted_accuracy(input_text, embeddings_calculator,
                      accuracy_path='accuracy_per_sub_topic.csv',
                      excluded_topics=('Student opportunities',)):
    """Expected accuracy for `input_text`, weighting per-topic accuracy by topic probability.

    Topic probabilities for the query are renormalised after dropping the
    excluded topics, then multiplied by each topic's recorded accuracy and summed.

    Args:
        input_text: Query string passed to
            `embeddings_calculator.calculate_topic_probabilities`.
        embeddings_calculator: Object whose `calculate_topic_probabilities`
            returns a DataFrame with `sub_topic` and `topic_probability` columns.
        accuracy_path: CSV file with `Sub_Topic` and `Accuracy` columns.
        excluded_topics: Sub-topics left out of the weighting.

    Returns:
        The weighted average accuracy as a float; 0.0 when the remaining
        probabilities sum to zero. Topics missing from the CSV count as 0.
    """
    topic_probabilities_df = embeddings_calculator.calculate_topic_probabilities(input_text)
    # Work on an explicit copy so adding a column never writes into a slice of
    # the calculator's frame (avoids SettingWithCopyWarning).
    kept = topic_probabilities_df.loc[
        ~topic_probabilities_df['sub_topic'].isin(excluded_topics)
    ].copy()
    total_probability = kept['topic_probability'].sum()
    if total_probability == 0:
        # Nothing left to weight; avoid a 0/0 division that would yield NaN.
        return 0.0
    kept['normalized_probability'] = kept['topic_probability'] / total_probability

    accuracy_df = pd.read_csv(accuracy_path)
    # Non-numeric accuracy cells become NaN and propagate into the result.
    accuracy_df['Accuracy'] = pd.to_numeric(accuracy_df['Accuracy'], errors='coerce')
    accuracy_dict = accuracy_df.set_index('Sub_Topic')['Accuracy'].to_dict()

    weighted_avg_accuracy = sum(
        kept['normalized_probability'] *
        kept['sub_topic'].apply(lambda topic: accuracy_dict.get(topic, 0))
    )
    return float(weighted_avg_accuracy)
# Example query for the weighted-accuracy estimate.
# NOTE(review): "libarary" looks like a typo for "library"; left unchanged
# because it is part of the text that is actually scored.
input_text = "love college with multiple libarary"
weighted_accuracy_value = weighted_accuracy(input_text, embeddings_calculator)
print("Weighted Average Accuracy:", weighted_accuracy_value)

Processing Reviews: 100%|████████████████████████████████████████████████████████████| 100/100 [00:54<00:00,  1.84it/s]

Accuracy per Sub-Topic:
sub_topic
Administration and school policies    0.312500
Admission process                     0.285714
Career opportunities                  0.363636
Diversity and inclusion               0.250000
Financial aid and scholarships        0.181818
General academic quality              0.250000
Online learning                       0.333333
Student opportunities                 0.000000
Technology and computer labs          0.375000
dtype: float64



