In [1]:
import os
import pandas as pd

# Folder that holds the raw .txt documents referenced by the `filename` column.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
directory_path = '/Users/nachiketh/Library/CloudStorage/OneDrive-TrinityCollegeDublin/Text_analytics/project/data/raw-txt'

# Load the alignment table from the combined CSV; the code below reads its
# `filename`, `qs_ranking` and `institution` columns.
alignment_df = pd.read_csv('/Users/nachiketh/Library/CloudStorage/OneDrive-TrinityCollegeDublin/Text_analytics/project/data/combined_csv.csv')
def ensure_txt_extension(filename):
    """Return `filename` with a '.txt' suffix, or None when it is null.

    Null values (as judged by ``pd.isnull``) map to None. Names that already
    end in '.txt' are returned unchanged; anything else gets '.txt' appended.
    """
    if pd.isnull(filename):
        return None
    return filename if filename.endswith('.txt') else f"{filename}.txt"
# Normalise the `filename` column so every non-null entry ends in '.txt'.
alignment_df['filename'] = alignment_df['filename'].apply(ensure_txt_extension)

# Function to read text from a .txt file
def read_txt(file_path, encoding='utf-8'):
    """Read a text file and return its entire contents as one string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to 'utf-8',
            the value that was previously hard-coded, so existing callers
            behave exactly as before.

    Returns:
        The decoded contents of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file's bytes are not valid in `encoding`.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()

# Parallel accumulators: one entry per row whose document was found on disk.
documents_list, ranks, uni_names = [], [], []

for index, row in alignment_df.iterrows():
    filename = row['filename']

    # Rows with a null or non-string filename are reported and skipped.
    if pd.isna(filename) or not isinstance(filename, str):
        print(f"Invalid or missing filename at index {index}")
        continue

    file_path = os.path.join(directory_path, filename)

    # Rows pointing at a file that is not on disk are reported and skipped.
    if not os.path.exists(file_path):
        print(f"File does not exist: {file_path}")
        continue

    documents_list.append(read_txt(file_path))
    ranks.append(row['qs_ranking'])
    uni_names.append(row['institution'])

# Assemble the three accumulators into one table.
documents_df = pd.DataFrame(
    {
        'document': documents_list,
        'rank': ranks,
        'institution': uni_names,
    }
)



In [2]:
# Display the assembled corpus table (document text, rank, institution) as the cell output.
documents_df

Unnamed: 0,document,rank,institution
0,"FINAL : APPROVED DECEMBER 8 , 2022 \n \n20...",102,University of Wisconsin-Madison
1,PLAN\n2020 • 2025DEVELOPMENT\nPLAN\n2020 • 202...,103,The Pontifical Catholic University of Chile
2,GO FURTHER2021–2025\nTransforming lives \nthr...,104,The University of Sheffield
3,\n UFV 2018/641 \nUppsala University: \nMis...,105,Uppsala University
4,university of copenhagen\nSTRATEGY 2030\nCreat...,107,University of Copenhagen
...,...,...,...
248,BSU BSU ––an an \nexperienced experienced \nan...,387,Belarusian State University
249,July 2023 - Case Number : 2019FR470010 \n \n...,392,Institut National des Sciences AppliquÃ©es de ...
250,Prospective Student (http://sites.scut.edu.cn/...,392,South China University of Technology
251,THE DEVELOPMENT PLAN OF JUSTUS LIEBIG UNIVERSI...,396,Justus-Liebig-University Giessen


In [3]:
import gensim
from gensim.models.phrases import Phrases, Phraser
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
import re
from langdetect import detect

# English stop words held in a set; the preprocessing code tests tokens for membership in it.
stop_words = set(stopwords.words('english'))
# Single lemmatizer instance reused for every token in preprocess_with_phrases.
lemmatizer = WordNetLemmatizer()
# Function to detect phrases in the documents
def detect_phrases(docs, min_count=5, threshold=12):
    """Train a phrase model on tokenised documents and return its Phraser.

    Args:
        docs: Iterable of token lists, one list per document.
        min_count: Forwarded to ``Phrases`` as ``min_count``. Defaults to 5,
            the value that was previously hard-coded.
        threshold: Forwarded to ``Phrases`` as ``threshold``. Defaults to 12,
            the value that was previously hard-coded.

    Returns:
        A ``Phraser`` wrapping the trained ``Phrases`` model; callers apply it
        by indexing it with a token list.
    """
    # Exposed as parameters so they can be tuned per corpus without editing this function.
    phrases = Phrases(docs, min_count=min_count, threshold=threshold)
    return Phraser(phrases)

# Per-word language decisions, shared across calls so that each distinct word
# is passed to `detect` at most once per session.
_english_word_cache = {}


def _is_english_word(word):
    """Return True when `detect` labels `word` as 'en', caching the decision."""
    cached = _english_word_cache.get(word)
    if cached is not None:
        return cached

    # Imported locally so this block does not depend on edits to the import cell.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        verdict = detect(word) == 'en'
    except LangDetectException:
        # Raised when langdetect cannot classify the input at all; such a
        # token is treated as not English instead of aborting the whole run.
        verdict = False
    _english_word_cache[word] = verdict
    return verdict


def remove_non_english_words(words):
    """Keep only the words that `detect` classifies as English.

    Each distinct word is classified once and the decision is reused for every
    later occurrence, which avoids calling `detect` repeatedly on the same
    word. Words that `detect` cannot classify are dropped.

    Args:
        words: Iterable of word strings.

    Returns:
        A list of the English words, in their original order.
    """
    return [word for word in words if _is_english_word(word)]

# Update the preprocess function to integrate phrase detection
def preprocess_with_phrases(text, phraser):
    """Clean, filter, phrase-merge and lemmatize one document.

    Steps: lowercase the text; remove every character that is not a-z or
    whitespace; split on whitespace; drop words of two characters or fewer
    and stop words; keep only words classified as English; apply the phrase
    model; lemmatize each resulting token.

    Args:
        text: Raw document text.
        phraser: Trained phrase model, applied by indexing it with a token list.

    Returns:
        A list of lemmatized tokens.
    """
    text = text.lower()
    # Raw string: "\s" inside a plain string literal is an invalid escape
    # sequence. Deleting everything outside [a-z\s] gives the same text as
    # joining all runs that match [a-z\s]*.
    text = re.sub(r"[^a-z\s]", "", text)
    words = text.split()

    # Cheap per-word filters run first so the costly language check sees
    # fewer words. All filters act on single words, so the order does not
    # change which words are kept. No underscore check is needed because the
    # regex above has already removed every '_'.
    candidates = [word for word in words if len(word) > 2 and word not in stop_words]
    english_words = remove_non_english_words(candidates)

    # Apply phrase model
    phrased_text = phraser[english_words]
    return [lemmatizer.lemmatize(word) for word in phrased_text]

# Token lists used to train the phrase model: lowercased, whitespace-split,
# stop words removed.
# NOTE(review): this tokenisation keeps punctuation and digits, whereas
# preprocess_with_phrases strips them before applying the model — confirm
# that the mismatch is intended.
tokenized_docs = []
for document in documents_df['document']:
    tokens = document.lower().split()
    tokenized_docs.append([token for token in tokens if token not in stop_words])

# Train the phrase model on those token lists.
phraser = detect_phrases(tokenized_docs)

# Run the full preprocessing, including phrase merging, over every document.
processed_docs_with_phrases = []
for text in documents_df['document']:
    processed_docs_with_phrases.append(preprocess_with_phrases(text, phraser))

# Build the vocabulary, prune it with filter_extremes, then encode each
# processed document as a bag of words.
dictionary = gensim.corpora.Dictionary(processed_docs_with_phrases)
dictionary.filter_extremes(no_below=6, no_above=0.5, keep_n=100000)
bow_corpus = list(map(dictionary.doc2bow, processed_docs_with_phrases))


[nltk_data] Downloading package punkt to /Users/nachiketh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nachiketh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nachiketh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nachiketh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


KeyboardInterrupt: 

In [None]:


# tfidf = models.TfidfModel(bow_corpus)
# corpus_tfidf = tfidf[bow_corpus]


In [None]:
# Values of the 'rank' column of documents_df as a NumPy array; passed to
# calculate_correlation as the external metric.
external_metrics = documents_df['rank'].to_numpy()


In [None]:
from gensim.models import LdaMulticore
import numpy as np
from gensim.models.coherencemodel import CoherenceModel
from scipy.stats import pearsonr
import os

def evaluate_model_coherence(lda_model, texts, dictionary, coherence='c_v'):
    """Return the coherence score of `lda_model`.

    Builds a ``CoherenceModel`` from the model, the tokenised `texts`, the
    `dictionary` and the requested `coherence` measure, and returns the value
    of its ``get_coherence()``.
    """
    return CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=dictionary,
        coherence=coherence,
    ).get_coherence()

def train_and_evaluate_models(corpus, id2word, texts, num_topics_list, passes=10, random_state=42, top_n_models=3, checkpoint_dir="model/", workers=4):
    """Train one LDA model per topic count, save each, and keep the best ones.

    For every entry in `num_topics_list` a ``LdaMulticore`` model is trained,
    saved under `checkpoint_dir`, and scored with ``evaluate_model_coherence``.

    Args:
        corpus: Bag-of-words corpus passed to ``LdaMulticore``.
        id2word: Dictionary passed to ``LdaMulticore`` and to the coherence evaluation.
        texts: Tokenised documents passed to the coherence evaluation.
        num_topics_list: Iterable of topic counts to train.
        passes: Forwarded to ``LdaMulticore``.
        random_state: Forwarded to ``LdaMulticore``.
        top_n_models: How many of the highest-scoring models to return.
            Zero or a negative value returns none.
        checkpoint_dir: Directory the models are saved into; created if missing.
        workers: Forwarded to ``LdaMulticore``. Defaults to 4, the value that
            was previously hard-coded.

    Returns:
        A tuple ``(top_models, top_scores, top_model_paths)`` of three lists,
        each ordered by ascending coherence score (best model last).
    """
    # exist_ok makes this a no-op when the directory is already there and
    # removes the gap between checking for it and creating it.
    os.makedirs(checkpoint_dir, exist_ok=True)

    model_list = []
    coherence_scores = []
    model_paths = []

    for num_topics in num_topics_list:
        model = LdaMulticore(corpus=corpus, id2word=id2word, num_topics=num_topics,
                             passes=passes, random_state=random_state, workers=workers)
        model_path = os.path.join(checkpoint_dir, f"lda_model_{num_topics}.model")
        model.save(model_path)
        model_paths.append(model_path)

        coherence_score = evaluate_model_coherence(model, texts, id2word)
        model_list.append(model)
        coherence_scores.append(coherence_score)
        print(f"Model with {num_topics} topics saved at {model_path} with coherence score {coherence_score}")

    if top_n_models <= 0:
        # A slice of [-0:] would select every model, so "keep none" needs its own branch.
        top_indices = []
    else:
        top_indices = np.argsort(coherence_scores)[-top_n_models:]

    top_models = [model_list[i] for i in top_indices]
    top_scores = [coherence_scores[i] for i in top_indices]
    top_model_paths = [model_paths[i] for i in top_indices]

    return top_models, top_scores, top_model_paths

import numpy as np

def process_models_and_extract_features(top_models, corpus, id2word, num_topics):
    """Average per-document topic weights and per-topic terms across models.

    The number of topic slots used is `num_topics`, capped at the largest
    ``num_topics`` among `top_models`. For each document, every model's topic
    probabilities are divided by the number of models and summed per topic
    index. The same is done for each model's top 20 terms per topic, and the
    averaged topics are printed.

    NOTE(review): values are combined by topic index across models; confirm
    that index k denotes the same theme in every model before interpreting
    the averaged columns.

    Args:
        top_models: Models exposing ``num_topics``, ``get_document_topics``
            and ``get_topic_terms``.
        corpus: Sequence of bag-of-words documents.
        id2word: Mapping from term id to term string, used for printing.
        num_topics: Requested number of topic slots.

    Returns:
        Array of shape ``(len(corpus), num_topics)`` with the averaged weights.
    """
    # Never use more topic slots than the largest model provides.
    largest_model_size = max(model.num_topics for model in top_models)
    num_topics = min(largest_model_size, num_topics)
    n_models = len(top_models)

    # One row per document; each model adds its share straight into the row.
    feature_vectors = np.zeros((len(corpus), num_topics))
    for row, doc_bow in enumerate(corpus):
        for model in top_models:
            topic_probs = dict(model.get_document_topics(doc_bow, minimum_probability=0))
            for topic_num, prob in topic_probs.items():
                if topic_num >= num_topics:
                    continue  # topic index lies outside the slots being kept
                feature_vectors[row, topic_num] += prob / n_models

    # For each kept topic index, pool the term weights of every model that has that index.
    averaged_topics = []
    for topic_num in range(num_topics):
        term_weights = {}
        for model in top_models:
            if topic_num >= model.num_topics:
                continue  # this model has no topic at that index
            for term_id, weight in model.get_topic_terms(topic_num, topn=20):
                term_weights[term_id] = term_weights.get(term_id, 0) + weight / n_models
        ranked_terms = sorted(term_weights.items(), key=lambda item: -item[1])
        averaged_topics.append(ranked_terms[:20])

    # Report the averaged topics, skipping any topic that ended up with no terms.
    print("\nAveraged Topics:")
    for idx, terms in enumerate(averaged_topics):
        if not terms:
            continue
        terms_str = " + ".join(f"{weight:.3f}*{id2word[term]}" for term, weight in terms)
        print(f"Topic {idx}: {terms_str}")

    return feature_vectors



def calculate_correlation(feature_vectors, external_metrics):
    """Return the Pearson correlation of each feature column with `external_metrics`.

    Args:
        feature_vectors: 2-D array; one row per document, one column per feature.
        external_metrics: 1-D sequence with one value per document.

    Returns:
        A list with one correlation coefficient per column, in column order.
    """
    # pearsonr returns (coefficient, p-value); only the coefficient is kept.
    return [
        pearsonr(feature_vectors[:, column], external_metrics)[0]
        for column in range(feature_vectors.shape[1])
    ]


# Topic counts to train, and how many of the best-scoring models to keep.
num_topics_list = [6, 10, 15, 20]
top_n_models = 2

top_models, top_scores, top_model_paths = train_and_evaluate_models(
    corpus=bow_corpus,
    id2word=dictionary,
    texts=processed_docs_with_phrases,
    num_topics_list=num_topics_list,
    top_n_models=top_n_models,
)


Model with 6 topics saved at model/lda_model_6.model with coherence score 0.35797974727309206
Model with 10 topics saved at model/lda_model_10.model with coherence score 0.4398457492842652
Model with 15 topics saved at model/lda_model_15.model with coherence score 0.36244324559489505
Model with 20 topics saved at model/lda_model_20.model with coherence score 0.3888987775318332


In [None]:
# Number of topic slots requested for the averaged document features.
chosen_number_of_topics = 10
feature_vectors = process_models_and_extract_features(
    top_models,
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=chosen_number_of_topics,
)



Averaged Topics:
Topic 0: 0.021*msc + 0.020*engineering + 0.009*centre + 0.008*bsc + 0.008*del + 0.007*programme + 0.005*master + 0.005*aau + 0.005*pbl + 0.005*industrial + 0.004*energy + 0.004*agricultural + 0.004*architecture + 0.004*ear + 0.003*food + 0.003*option + 0.003*problem + 0.003*organisation + 0.003*mining + 0.003*feature
Topic 1: 0.007*strengthening + 0.005*center + 0.005*organization + 0.004*construction + 0.004*evaluation + 0.003*fall + 0.003*promoting + 0.003*joint + 0.003*table + 0.003*china + 0.002*scientific + 0.002*sdgs + 0.002*cooperation + 0.002*unit + 0.002*production + 0.002*total + 0.002*target + 0.002*improving + 0.002*term + 0.002*japan
Topic 2: 0.027*york + 0.012*indigenous + 0.012*queen + 0.008*scholarship + 0.007*centre + 0.007*scholar + 0.006*canada + 0.005*environmental + 0.005*equity + 0.005*fundamental + 0.005*engineering + 0.004*canadian + 0.004*material + 0.004*discovery + 0.004*art + 0.004*disease + 0.003*governance + 0.003*law + 0.003*creation + 0

In [None]:
def calculate_correlation(feature_vectors, external_metrics):
    """Pearson correlation between every feature column and `external_metrics`.

    NOTE: this repeats the definition given in the model-training cell; when
    the cells run in order, this later definition is the one in effect.

    Args:
        feature_vectors: 2-D array; one row per document, one column per feature.
        external_metrics: 1-D sequence with one value per document.

    Returns:
        A list holding one correlation coefficient per column, in column order.
    """
    n_columns = feature_vectors.shape[1]
    correlations = []
    column = 0
    while column < n_columns:
        # The p-value returned alongside the coefficient is not used.
        coefficient, _p_value = pearsonr(feature_vectors[:, column], external_metrics)
        correlations.append(coefficient)
        column += 1
    return correlations

# Correlate each topic feature with the external metric and print one line per topic.
correlations = calculate_correlation(feature_vectors, external_metrics)
for topic_index in range(len(correlations)):
    print(f"Topic {topic_index}: Correlation = {correlations[topic_index]}")

Topic 0: Correlation = 0.0694321376869563
Topic 1: Correlation = 0.10311634020106784
Topic 2: Correlation = -0.013423427241307568
Topic 3: Correlation = -0.13530900346238692
Topic 4: Correlation = -0.11804120977449338
Topic 5: Correlation = 0.11027110446022696
Topic 6: Correlation = 0.11324648981019557
Topic 7: Correlation = -0.02082309242622215
Topic 8: Correlation = 0.07808711264243345
Topic 9: Correlation = -0.03965137982762084


In [None]:
# Start from institution name and rank, then add one column per topic feature.
final_df = pd.DataFrame(
    {
        'Name': documents_df['institution'],
        'rank': documents_df['rank'],
    }
)

# The width of the first feature vector gives the number of feature columns.
num_features = len(feature_vectors[0])
for feature_number in range(1, num_features + 1):
    column_values = [vector[feature_number - 1] for vector in feature_vectors]
    final_df[f'feature_{feature_number}'] = column_values

final_df

Unnamed: 0,Name,rank,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10
0,University of Wisconsin-Madison,102,0.000164,0.072534,0.000164,0.067232,0.258236,0.000164,0.000164,0.000164,0.584896,0.000164
1,The Pontifical Catholic University of Chile,103,0.071030,0.211017,0.000016,0.000016,0.206379,0.000016,0.000016,0.073620,0.038851,0.000016
2,The University of Sheffield,104,0.000110,0.000110,0.000110,0.956485,0.000110,0.000110,0.000110,0.000110,0.042269,0.000110
3,Uppsala University,105,0.000064,0.000064,0.000064,0.000064,0.657056,0.000064,0.000064,0.002237,0.000064,0.000064
4,University of Copenhagen,107,0.012755,0.000221,0.000221,0.158821,0.375748,0.000221,0.000221,0.000221,0.000221,0.159480
...,...,...,...,...,...,...,...,...,...,...,...,...
248,Belarusian State University,387,0.000352,0.739682,0.000352,0.000352,0.000352,0.065905,0.000352,0.000352,0.000352,0.000352
249,Institut National des Sciences AppliquÃ©es de ...,392,0.000100,0.292336,0.000100,0.000100,0.072312,0.028619,0.001507,0.000100,0.105060,0.000100
250,South China University of Technology,392,0.062404,0.369316,0.000625,0.065985,0.000625,0.000625,0.496458,0.000625,0.000625,0.000625
251,Justus-Liebig-University Giessen,396,0.000018,0.007645,0.000018,0.000018,0.491169,0.000395,0.000018,0.000745,0.000018,0.000018


In [None]:
# Destination of the exported feature table.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
csv_file_path = '/Users/nachiketh/Library/CloudStorage/OneDrive-TrinityCollegeDublin/Text_analytics/project/final_file_bow.csv'

# Write final_df to CSV; index=False leaves the DataFrame index out of the file.
final_df.to_csv(csv_file_path, index=False)