In [1]:
import pandas as pd

# Load the review corpus. The file is decoded as ISO-8859-1 (Latin-1).
# Later cells read the 'University', 'reviews', 'reviews_lemmatized' and
# 'sentiment' columns from this frame.
dftext = pd.read_csv("./alltext.csv", encoding="ISO-8859-1")


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict, Counter
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
import nltk

# Sentiment tooling. Both objects rely on NLTK data packages (the VADER
# lexicon and the stopwords corpus) being available locally.
sia = SentimentIntensityAnalyzer()
stop_words = set(stopwords.words('english'))


def _sentiment_label(compound):
    """Map a VADER compound score to 'positive', 'negative' or 'neutral'.

    Scores >= 0.05 are positive, scores <= -0.05 are negative, and anything
    in between is neutral.
    """
    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'


# Extract corpus-level keywords: up to 100 TF-IDF terms that occur in at least
# 2 reviews but in no more than half of them.
# scikit-learn >= 1.2 validates `stop_words` and rejects a set, so a list is
# passed; `stop_words` itself stays a set for the rest of the notebook.
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words=sorted(stop_words), max_features=100)
X = vectorizer.fit_transform(dftext['reviews_lemmatized'])
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on installations older than 1.0.
if hasattr(vectorizer, 'get_feature_names_out'):
    # .tolist() turns the NumPy string array into plain Python strings.
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = list(vectorizer.get_feature_names())

uni_keywords = defaultdict(list)   # university -> keywords accepted for it (with repeats)
keyword_sentiments = {}            # keyword -> VADER label of the word on its own

# Credit a keyword to a university only when three signals agree: the review's
# stored label, VADER's label for the full review text, and VADER's label for
# the keyword itself.
# NOTE(review): assumes the 'sentiment' column holds the strings 'positive',
# 'negative' or 'neutral'; any other value never matches - confirm.
review_columns = zip(dftext['University'], dftext['reviews'], dftext['sentiment'])
for i, (university, review_text, review_sentiment) in enumerate(review_columns):
    # Row i of X corresponds to review i; its stored column indices are the
    # keywords present in that review.
    review_keywords = [features[index] for index in X[i].indices]

    # Use VADER to label the whole review.
    vader_sentiment = _sentiment_label(sia.polarity_scores(review_text)['compound'])
    vader_agrees = vader_sentiment == review_sentiment

    for word in review_keywords:
        # Every keyword is labelled (once) even when the review is rejected,
        # because predict_universities looks words up in keyword_sentiments.
        if word not in keyword_sentiments:
            keyword_sentiments[word] = _sentiment_label(sia.polarity_scores(word)['compound'])

        if vader_agrees and keyword_sentiments[word] == review_sentiment:
            uni_keywords[university].append(word)

# university -> Counter of how often each accepted keyword was credited to it.
uni_keyword_counts = {uni: Counter(keywords) for uni, keywords in uni_keywords.items()}

def extract_keywords(text):
    """Return the vocabulary terms found in a single piece of text.

    A fresh TfidfVectorizer (English stop words from the module-level
    ``stop_words``, at most 100 features) is fitted on ``text`` alone and its
    feature names are returned.

    Args:
        text: The review text to tokenize.

    Returns:
        A list of term strings; empty when the text contains no usable tokens
        (for example an empty string or stop words only).
    """
    # scikit-learn >= 1.2 rejects a set for `stop_words`, so pass a list.
    vectorizer = TfidfVectorizer(stop_words=list(stop_words), max_features=100)
    try:
        vectorizer.fit([text])
    except ValueError as exc:
        # Only the "empty vocabulary" case is treated as "no keywords";
        # anything else is a genuine error and must surface.
        if 'empty vocabulary' not in str(exc):
            raise
        return []
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on installations older than 1.0.
    if hasattr(vectorizer, 'get_feature_names_out'):
        return vectorizer.get_feature_names_out().tolist()
    return list(vectorizer.get_feature_names())

def predict_universities(new_review, top_n=5):
    """Print the universities whose accepted keywords best match a wish-list review.

    The review is assumed to describe what the writer hopes for in a future
    university. Its keywords are reduced to those labelled 'positive' in
    ``keyword_sentiments``; each university is scored by how often those
    keywords were credited to it in ``uni_keyword_counts``, and scores are
    normalized to shares of the total.

    Args:
        new_review: Free-text description of the desired university.
        top_n: How many of the best-scoring universities to print (default 5).

    Returns:
        None. Results are printed: one "name: share" line per top university,
        then the combined share of those universities, then the full
        university -> share mapping. Prints "No matching universities found."
        when no keyword produces a non-zero score.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        # A negative slice bound would silently drop entries from the wrong end.
        raise ValueError("top_n must be non-negative")

    new_review_keywords = extract_keywords(new_review)
    new_keywords = [word for word in new_review_keywords if keyword_sentiments.get(word) == 'positive']

    # Counter lookups return 0 for keywords a university never received.
    uni_scores = {
        uni: sum(keyword_counts[keyword] for keyword in new_keywords)
        for uni, keyword_counts in uni_keyword_counts.items()
    } if new_keywords else {}

    total_count = sum(uni_scores.values())
    if total_count == 0:
        print("No matching universities found.")
        return
    uni_probabilities = {uni: (count / total_count) for uni, count in uni_scores.items()}

    top_unis = sorted(uni_probabilities, key=uni_probabilities.get, reverse=True)[:top_n]
    # Accumulate with an explicit loop (not sum()) so the printed float is
    # the plain left-to-right total on every Python version.
    top_share = 0
    for uni in top_unis:
        top_share += uni_probabilities[uni]
        print(f"{uni}: {uni_probabilities[uni]:.2%}")
    print(top_share)
    print(uni_probabilities)



In [11]:
# Smoke test: rank universities for a sample "what I am looking for" review.
new_review = (
    "I want to learn at a university that encourages innovation "
    "and has a vibrant community life."
)
predict_universities(new_review)

New York University: 13.78%
University of California, Santa Barbara: 12.11%
University of Texas at Austin: 9.19%
University of California, Los Angeles: 7.72%
Boston University: 6.26%
0.4906054279749478
{'Princeton University': 0.014613778705636743, 'Massachusetts Institute of Technology': 0.012526096033402923, 'Harvard University': 0.012526096033402923, 'Stanford University': 0.022964509394572025, 'Yale University': 0.022964509394572025, 'University of Pennsylvania': 0.010438413361169102, 'Duke University': 0.020876826722338204, 'Brown University': 0.027139874739039668, 'Johns Hopkins University': 0.020876826722338204, 'Northwestern University': 0.025052192066805846, 'Columbia University': 0.016701461377870562, 'Cornell University': 0.031315240083507306, 'University of California, Berkeley': 0.04175365344467641, 'University of California, Los Angeles': 0.07724425887265135, 'University of North Carolina at Chapel Hill': 0.033402922755741124, 'Carnegie Mellon University': 0.0417536534446