In [12]:
import os
import pandas as pd

# Directory holding the raw .txt documents; each row's `filename` is joined onto it further down.
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
directory_path = '/Users/nachiketh/Library/CloudStorage/OneDrive-TrinityCollegeDublin/Text_analytics/project/data/raw-txt'

# Load the alignment table; the code below reads its `filename`, `qs_ranking` and `institution` columns.
alignment_df = pd.read_csv('/Users/nachiketh/Library/CloudStorage/OneDrive-TrinityCollegeDublin/Text_analytics/project/data/combined_csv.csv')
def ensure_txt_extension(filename):
    """Return ``filename`` guaranteed to end in '.txt'.

    Missing values (anything ``pd.isnull`` reports as null) yield ``None``.
    Names that already end in '.txt' are returned unchanged; every other
    name gets '.txt' appended.
    """
    if pd.isnull(filename):
        return None
    return filename if filename.endswith('.txt') else f"{filename}.txt"
# Normalise the `filename` column: non-null names end in '.txt', null entries become None.
alignment_df['filename'] = alignment_df['filename'].apply(ensure_txt_extension)

# Function to read text from a .txt file
def read_txt(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

# Parallel accumulators: one entry per row whose text file was found on disk.
documents_list = []
ranks = []
uni_names = []

for index, row in alignment_df.iterrows():
    filename = row['filename']

    # Rows without a usable string filename cannot be mapped to a file.
    if pd.isna(filename) or not isinstance(filename, str):
        print(f"Invalid or missing filename at index {index}")
        continue

    file_path = os.path.join(directory_path, filename)
    if not os.path.exists(file_path):
        print(f"File does not exist: {file_path}")
        continue

    # Only rows with a readable file contribute to the three lists.
    text = read_txt(file_path)
    documents_list.append(text)
    ranks.append(row['qs_ranking'])
    uni_names.append(row['institution'])

# Assemble the per-document table from the accumulated lists.
documents_df = pd.DataFrame(
    {'document': documents_list, 'rank': ranks, 'institution': uni_names}
)



In [13]:
documents_df

Unnamed: 0,document,rank,institution
0,"FINAL : APPROVED DECEMBER 8 , 2022 \n \n20...",102,University of Wisconsin-Madison
1,PLAN\n2020 • 2025DEVELOPMENT\nPLAN\n2020 • 202...,103,The Pontifical Catholic University of Chile
2,GO FURTHER2021–2025\nTransforming lives \nthr...,104,The University of Sheffield
3,\n UFV 2018/641 \nUppsala University: \nMis...,105,Uppsala University
4,university of copenhagen\nSTRATEGY 2030\nCreat...,107,University of Copenhagen
...,...,...,...
248,BSU BSU ––an an \nexperienced experienced \nan...,387,Belarusian State University
249,July 2023 - Case Number : 2019FR470010 \n \n...,392,Institut National des Sciences AppliquÃ©es de ...
250,Prospective Student (http://sites.scut.edu.cn/...,392,South China University of Technology
251,THE DEVELOPMENT PLAN OF JUSTUS LIEBIG UNIVERSI...,396,Justus-Liebig-University Giessen


In [14]:
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# import nltk
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# import re
# import gensim

# stop_words = set(stopwords.words('english'))
# lemmatizer = WordNetLemmatizer()
# def preprocess(text):
#     text = text.lower()
#     text = "".join(re.findall("[a-z\s]*", text)) 
#     words = text.split() 
#     filtered_text = [word for word in words if word not in stop_words]
#     lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_text]
#     return lemmatized_text 

In [15]:
import gensim
from gensim.models.phrases import Phrases, Phraser
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
import re
# English stop-word set; tokens found in it are dropped during preprocessing.
stop_words = set(stopwords.words('english'))
# WordNet lemmatizer applied to the tokens that survive filtering.
lemmatizer = WordNetLemmatizer()
# Function to detect phrases in the documents
def detect_phrases(docs, min_count=5, threshold=12):
    """Fit a gensim phrase model on tokenised documents and return its phraser.

    Args:
        docs: Iterable of token lists, one list per document.
        min_count: Forwarded to ``Phrases``. Defaults to 5, the value that
            was previously hard-coded here.
        threshold: Forwarded to ``Phrases``. Defaults to 12, the value that
            was previously hard-coded here.

    Returns:
        A ``Phraser`` built from the fitted ``Phrases`` model; index it with
        a token list (``phraser[tokens]``) to apply it.
    """
    # Tune min_count / threshold to the corpus; the defaults reproduce the old behaviour.
    phrases = Phrases(docs, min_count=min_count, threshold=threshold)
    return Phraser(phrases)

# Update the preprocess function to integrate phrase detection
def preprocess_with_phrases(text, phraser):
    """Clean one raw document and return its list of lemmatised tokens.

    Steps: lower-case, drop every character that is not a-z or whitespace,
    split on whitespace, apply ``phraser``, filter tokens, lemmatise.

    Args:
        text: Raw document text.
        phraser: Phrase model applied as ``phraser[tokens]``.

    Returns:
        List of lemmatised tokens. A token is kept only if it is longer than
        two characters, contains no '_', and is not in ``stop_words``.
    """
    text = text.lower()
    # Raw-string pattern: "\s" inside a normal string literal is an invalid
    # escape sequence and triggers a warning. Removing the characters outside
    # [a-z\s] is equivalent to joining all matches of "[a-z\s]*".
    text = re.sub(r"[^a-z\s]", "", text)
    words = phraser[text.split()]  # Apply phrase model
    # NOTE(review): tokens containing '_' are discarded. Presumably '_' is the
    # delimiter the phraser uses to join detected phrases, in which case this
    # removes every detected phrase rather than keeping it — confirm intent.
    filtered_text = [
        word for word in words
        if len(word) > 2 and '_' not in word and word not in stop_words
    ]
    return [lemmatizer.lemmatize(word) for word in filtered_text]

# Whitespace-tokenise the lower-cased documents (stop words removed) to train the phrase model.
# NOTE(review): these training tokens keep punctuation and omit stop words, whereas
# preprocess_with_phrases applies the phraser to punctuation-stripped tokens that still
# contain stop words — confirm the two tokenisations are meant to differ.
tokenized_docs = [
    [token for token in raw_text.lower().split() if token not in stop_words]
    for raw_text in documents_df['document']
]

# Fit the phrase model on those tokens.
phraser = detect_phrases(tokenized_docs)

# Run the full preprocessing (cleaning, phrase model, filtering, lemmatising) per document.
processed_docs_with_phrases = [
    preprocess_with_phrases(raw_text, phraser)
    for raw_text in documents_df['document']
]

# Build the vocabulary, prune it with filter_extremes, then encode each document as bag-of-words.
dictionary = gensim.corpora.Dictionary(processed_docs_with_phrases)
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
bow_corpus = list(map(dictionary.doc2bow, processed_docs_with_phrases))


[nltk_data] Downloading package punkt to /Users/nachiketh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nachiketh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nachiketh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nachiketh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [16]:
# Assuming processed_docs is a list of preprocessed documents aligned with external_metrics
# processed_docs = [preprocess(text) for text in documents_df['document']]

# # Update dictionary and bow_corpus creation steps accordingly
# dictionary = gensim.corpora.Dictionary(processed_docs)
# dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
# bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
# from gensim import corpora, models

# tfidf = models.TfidfModel(bow_corpus)
# corpus_tfidf = tfidf[bow_corpus]


In [17]:
# Values of documents_df['rank'] as a NumPy array, in the same row order as the documents;
# used as the external metric the topic features are correlated against.
external_metrics = documents_df['rank'].to_numpy()


In [18]:
from gensim.models import LdaMulticore
import numpy as np
from gensim.models.coherencemodel import CoherenceModel
from scipy.stats import pearsonr
import os

def evaluate_model_coherence(lda_model, texts, dictionary, coherence='c_v'):
    """Return the coherence score of ``lda_model``.

    Builds a ``CoherenceModel`` from the model, the tokenised ``texts`` and
    the ``dictionary``, using the requested ``coherence`` measure
    (default 'c_v'), and returns the value of ``get_coherence()``.
    """
    return CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=dictionary,
        coherence=coherence,
    ).get_coherence()

def train_and_evaluate_models(corpus, id2word, texts, num_topics_list, passes=10, random_state=42, top_n_models=3, checkpoint_dir="model/", workers=4):
    """Train one LDA model per topic count, save each, and keep the most coherent.

    Args:
        corpus: Bag-of-words corpus passed to ``LdaMulticore``.
        id2word: Dictionary passed to ``LdaMulticore`` and to the coherence
            evaluation.
        texts: Tokenised documents used for the coherence evaluation.
        num_topics_list: Iterable of topic counts; one model is trained per entry.
        passes: Forwarded to ``LdaMulticore``.
        random_state: Forwarded to ``LdaMulticore``.
        top_n_models: How many of the highest-coherence models to return.
            Zero or a negative value returns no models.
        checkpoint_dir: Directory each model is saved into as
            ``lda_model_<num_topics>.model``; created if missing.
        workers: Forwarded to ``LdaMulticore``. Defaults to 4, the value
            that was previously hard-coded.

    Returns:
        Tuple ``(top_models, top_scores, top_model_paths)`` of parallel
        lists, ordered by ascending coherence (the best model is last).
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(checkpoint_dir, exist_ok=True)

    model_list = []
    coherence_scores = []
    model_paths = []

    for num_topics in num_topics_list:
        model = LdaMulticore(corpus=corpus, id2word=id2word, num_topics=num_topics,
                             passes=passes, random_state=random_state, workers=workers)
        model_path = os.path.join(checkpoint_dir, f"lda_model_{num_topics}.model")
        model.save(model_path)
        model_paths.append(model_path)

        coherence_score = evaluate_model_coherence(model, texts, id2word)
        model_list.append(model)
        coherence_scores.append(coherence_score)
        print(f"Model with {num_topics} topics saved at {model_path} with coherence score {coherence_score}")

    if top_n_models <= 0:
        # A slice of [-0:] would select every model, so handle this case explicitly.
        top_indices = []
    else:
        # argsort is ascending, so the tail holds the highest-coherence models.
        top_indices = np.argsort(coherence_scores)[-top_n_models:]

    top_models = [model_list[i] for i in top_indices]
    top_scores = [coherence_scores[i] for i in top_indices]
    top_model_paths = [model_paths[i] for i in top_indices]

    return top_models, top_scores, top_model_paths

import numpy as np

def process_models_and_extract_features(top_models, corpus, id2word, num_topics):
    """Average per-document topic probabilities across models and print averaged topics.

    The effective number of topics is ``min(num_topics, largest model's
    num_topics)``. For every document, each model's probability for topic
    index ``k`` (``k`` below the effective count) is divided by the number of
    models and summed into column ``k``; probabilities for higher topic
    indices are dropped. The top-20 terms of each topic index are averaged
    the same way across the models that have that index, and printed.

    NOTE(review): topic index ``k`` is combined across the models as-is —
    confirm that the same index denotes comparable topics in each model.

    Args:
        top_models: Models exposing ``num_topics``, ``get_document_topics``
            and ``get_topic_terms``.
        corpus: Sequence of bag-of-words documents.
        id2word: Mapping from term id to term string, used for printing.
        num_topics: Requested upper bound on the number of feature columns.

    Returns:
        Array of shape ``(len(corpus), effective_num_topics)``.
    """
    largest_model = max(lda.num_topics for lda in top_models)
    num_topics = min(largest_model, num_topics)
    n_models = len(top_models)

    # Per-document averaged topic distribution, one row per document.
    feature_vectors = np.zeros((len(corpus), num_topics))
    for row_idx, bow in enumerate(corpus):
        row = np.zeros(num_topics)
        for lda in top_models:
            topic_probs = dict(lda.get_document_topics(bow, minimum_probability=0))
            for topic_id, prob in topic_probs.items():
                if topic_id >= num_topics:
                    continue  # outside the retained topic range
                row[topic_id] += prob / n_models
        feature_vectors[row_idx, :] = row

    # Averaged term weights per topic index, keeping the 20 heaviest terms.
    averaged_topics = []
    for topic_id in range(num_topics):
        term_weights = {}
        for lda in top_models:
            if topic_id >= lda.num_topics:
                continue  # this model has no topic with that index
            for term_id, weight in lda.get_topic_terms(topic_id, topn=20):
                term_weights[term_id] = term_weights.get(term_id, 0) + weight / n_models
        ranked = sorted(term_weights.items(), key=lambda item: -item[1])
        averaged_topics.append(ranked[:20])

    print("\nAveraged Topics:")
    for idx, terms in enumerate(averaged_topics):
        if not terms:
            continue  # nothing to show for this topic index
        terms_str = " + ".join(f"{weight:.3f}*{id2word[term]}" for term, weight in terms)
        print(f"Topic {idx}: {terms_str}")

    return feature_vectors



def calculate_correlation(feature_vectors, external_metrics):
    """Return the Pearson correlation of each feature column with ``external_metrics``.

    Args:
        feature_vectors: 2-D array; each column is correlated separately.
        external_metrics: Sequence with one value per row of ``feature_vectors``.

    Returns:
        List with one correlation coefficient per column, in column order.
        The p-value computed by ``pearsonr`` is discarded.
    """
    return [
        pearsonr(feature_vectors[:, col], external_metrics)[0]
        for col in range(feature_vectors.shape[1])
    ]


# Candidate topic counts; one LDA model is trained, saved and scored per entry.
num_topics_list = [6, 10, 15, 20]
# Number of highest-coherence models to keep.
top_n_models = 2

# Train on the bag-of-words corpus; coherence is evaluated against the preprocessed token lists.
top_models, top_scores, top_model_paths = train_and_evaluate_models(bow_corpus, dictionary, texts=processed_docs_with_phrases, num_topics_list=num_topics_list, top_n_models=top_n_models)


Model with 6 topics saved at model/lda_model_6.model with coherence score 0.35446738165382485
Model with 10 topics saved at model/lda_model_10.model with coherence score 0.383880088600114
Model with 15 topics saved at model/lda_model_15.model with coherence score 0.3892610254017365
Model with 20 topics saved at model/lda_model_20.model with coherence score 0.3646888565521921


In [19]:
# Requested number of topic feature columns; the function caps it at the largest kept model's topic count.
chosen_number_of_topics=10
# One row per document in bow_corpus; also prints the averaged topics as a side effect.
feature_vectors = process_models_and_extract_features(top_models, corpus=bow_corpus, id2word=dictionary, num_topics=chosen_number_of_topics)



Averaged Topics:
Topic 0: 0.006*programme + 0.004*scientific + 0.004*creation + 0.003*professor + 0.003*organisation + 0.003*term + 0.003*strengthening + 0.003*promoting + 0.003*context + 0.003*addition + 0.003*therefore + 0.002*cooperation + 0.002*different + 0.002*office + 0.002*centre + 0.002*european + 0.002*pillar + 0.001*kpi + 0.001*employee + 0.001*internal
Topic 1: 0.012*universit + 0.010*iit + 0.010*center + 0.009*scientific + 0.007*iits + 0.007*paris + 0.005*european + 0.004*laboratory + 0.004*hospital + 0.003*table + 0.003*environmental + 0.003*industrial + 0.002*college + 0.002*unit + 0.002*robotics + 0.002*water + 0.002*company + 0.002*total + 0.002*startup + 0.002*programme
Topic 2: 0.005*establishment + 0.005*china + 0.004*semester + 0.004*msc + 0.004*humanity + 0.004*cooperation + 0.004*strengthening + 0.003*engineering + 0.003*beyond + 0.003*korea + 0.003*center + 0.003*centre + 0.003*programme + 0.003*construction + 0.003*startup + 0.002*creative + 0.002*overseas + 0

In [20]:
def calculate_correlation(feature_vectors, external_metrics):
    """Correlate every feature column with ``external_metrics`` (Pearson r).

    Returns a list holding one coefficient per column of ``feature_vectors``,
    in column order; the p-value returned by ``pearsonr`` is not kept.
    """
    n_columns = feature_vectors.shape[1]
    return [
        pearsonr(feature_vectors[:, column], external_metrics)[0]
        for column in range(n_columns)
    ]

# Pearson correlation of each topic feature column with the rank values, printed per topic index.
# NOTE(review): only the coefficient is reported; the p-value is discarded inside calculate_correlation.
correlations = calculate_correlation(feature_vectors, external_metrics)
for idx, corr in enumerate(correlations):
    print(f"Topic {idx}: Correlation = {corr}")

Topic 0: Correlation = -0.00113087570087576
Topic 1: Correlation = 0.05828495701050555
Topic 2: Correlation = 0.04057677178520706
Topic 3: Correlation = 0.0134694096797457
Topic 4: Correlation = -0.16785769975179315
Topic 5: Correlation = -0.031010928688843007
Topic 6: Correlation = -0.043958803283330954
Topic 7: Correlation = 0.15098224999109386
Topic 8: Correlation = 0.06783083357119717
Topic 9: Correlation = -0.1721703477981967


In [21]:
# Start with the identifying columns taken from documents_df.
final_df = pd.DataFrame(
    {'Name': documents_df['institution'], 'rank': documents_df['rank']}
)

# Append one column per topic feature, labelled feature_1 .. feature_N.
num_features = len(feature_vectors[0])
for i in range(num_features):
    final_df[f'feature_{i+1}'] = feature_vectors[:, i]

final_df

Unnamed: 0,Name,rank,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10
0,University of Wisconsin-Madison,102,0.102442,0.000180,0.000180,0.444332,0.000180,0.000180,0.000180,0.000180,0.000180,0.114312
1,The Pontifical Catholic University of Chile,103,0.591698,0.000017,0.000017,0.000790,0.000017,0.000017,0.329850,0.077522,0.000017,0.000017
2,The University of Sheffield,104,0.000121,0.000121,0.000121,0.000121,0.000121,0.000121,0.000121,0.013050,0.009960,0.656555
3,Uppsala University,105,0.504582,0.000070,0.000070,0.000070,0.000070,0.000070,0.000070,0.000070,0.011733,0.000070
4,University of Copenhagen,107,0.047874,0.000242,0.254257,0.000242,0.000242,0.498789,0.000242,0.000242,0.178663,0.018720
...,...,...,...,...,...,...,...,...,...,...,...,...
248,Belarusian State University,387,0.000391,0.000392,0.000391,0.000391,0.023160,0.000391,0.304012,0.194165,0.000391,0.000391
249,Institut National des Sciences AppliquÃ©es de ...,392,0.000107,0.000107,0.837890,0.000107,0.000107,0.000107,0.000107,0.012787,0.000107,0.000107
250,South China University of Technology,392,0.000684,0.000684,0.185460,0.000684,0.122238,0.000684,0.000684,0.169379,0.005998,0.273779
251,Justus-Liebig-University Giessen,396,0.481001,0.000030,0.000019,0.000019,0.018059,0.000019,0.000019,0.000871,0.000019,0.000019


In [22]:
# Destination of the exported feature table.
# NOTE(review): absolute, machine-specific path — consider making the output location configurable.
csv_file_path = '/Users/nachiketh/Library/CloudStorage/OneDrive-TrinityCollegeDublin/Text_analytics/project/final_file_bow.csv'

# Write final_df to CSV without the index column.
final_df.to_csv(csv_file_path, index=False)