# Install and Import Libraries

In [None]:
!pip install sentence-transformers

In [None]:
import re
import pandas as pd
import spacy
from joblib import Parallel, delayed
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import pairwise_distances_argmin_min,classification_report, accuracy_score
from textblob import TextBlob
from sentence_transformers import SentenceTransformer, util
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from transformers import BertModel,BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments,DataCollatorWithPadding
import torch
from torch.utils.data import Dataset, DataLoader
from keras.layers import GRU
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, SpatialDropout1D
from keras.preprocessing.sequence import pad_sequences
# from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.neural_network import MLPClassifier
import nltk
from textblob import TextBlob
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
# Visualize missing values
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from nltk.corpus import stopwords
import gensim.downloader as api

  from tqdm.autonotebook import tqdm, trange


# Download NLTK Packages

In [None]:
# Fetch the NLTK resources used below: tokenizer models, stop-word list,
# WordNet (for lemmatizing) and the POS tagger.
for resource in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)

In [None]:
# Load spaCy model (small English pipeline); `nlp` is reused by the NER and
# dependency-parsing helpers defined further down.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Initialize BERT tokenizer; `bert_embedding()` below reads this module-level name.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Initialize stopwords and lemmatizer.
# `stop_words` is a set so membership tests in remove_stopwords() are O(1);
# `lemmatizer` is shared by lemmatize_text().
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Load and Pre-process Data

In [None]:
# Load the raw CSV file (latin1-encoded) and preview the first rows.
# Note: `df` is reassigned from 'Impact_cleaned_data.csv' in a later cell.
df = pd.read_csv('Impact.csv',encoding='latin1')
df.head()

In [None]:
# Define text processing functions
def clean_text(text):
    """Normalize raw text to lowercase letters separated by single spaces.

    Bracketed spans, URLs, punctuation and digits are removed.

    Args:
        text: Any value; it is coerced with ``str()`` first.

    Returns:
        The cleaned, lowercased string without leading/trailing whitespace.
    """
    text = str(text)  # Ensure the input is a string
    # Collapse whitespace up front so bracketed spans that wrap across lines
    # sit on one line and are matched by the pattern below.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\[.*?\]', '', text)  # Remove text inside brackets
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs (http\S+ also covers https)
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation and numbers
    # The removals above leave gaps behind, so collapse whitespace again and
    # trim the ends; otherwise the result keeps double/trailing spaces.
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

In [None]:
def remove_stopwords(text):
    """Tokenize ``text`` and drop every token found in the module-level ``stop_words``.

    The comparison is exact (case-sensitive); the surviving tokens are
    re-joined with single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
def lemmatize_text(text):
    """Tokenize ``text``, lemmatize each token with the shared ``lemmatizer``
    and return the tokens re-joined with single spaces."""
    return ' '.join(map(lemmatizer.lemmatize, word_tokenize(text)))

In [None]:
def tokenize_text(text):
    """Return the list of tokens NLTK's ``word_tokenize`` produces for ``text``."""
    tokens = word_tokenize(text)
    return tokens

In [None]:
def named_entity_recognition(text):
    """Run the spaCy pipeline on ``text`` and return ``(entity_text, label)`` pairs."""
    entities = []
    for ent in nlp(text).ents:
        entities.append((ent.text, ent.label_))
    return entities

In [None]:
def pos_tagging(text):
    """Tokenize ``text`` with NLTK and return the result of ``nltk.pos_tag`` on the tokens."""
    return nltk.pos_tag(word_tokenize(text))

In [None]:
def vectorize_text(text, method='tfidf'):
    """Fit a fresh vectorizer on ``text`` and return the resulting matrix.

    Args:
        text: Iterable of documents passed to ``fit_transform``.
        method: ``'tfidf'`` for TfidfVectorizer, ``'bow'`` for CountVectorizer.

    Raises:
        ValueError: If ``method`` is neither ``'tfidf'`` nor ``'bow'``.
    """
    if method not in ('tfidf', 'bow'):
        raise ValueError("Method must be 'tfidf' or 'bow'")

    vectorizer_cls = TfidfVectorizer if method == 'tfidf' else CountVectorizer
    return vectorizer_cls().fit_transform(text)

In [None]:
# Cache of loaded BERT encoders keyed by checkpoint name, so the weights are
# loaded once instead of on every bert_embedding() call.
_BERT_MODELS = {}


def _get_bert_model(name='bert-base-uncased'):
    """Return a cached, eval-mode ``BertModel`` for checkpoint ``name``."""
    if name not in _BERT_MODELS:
        encoder = BertModel.from_pretrained(name)
        encoder.eval()
        _BERT_MODELS[name] = encoder
    return _BERT_MODELS[name]


def bert_embedding(text):
    """Return mean-pooled BERT embeddings for ``text`` as a NumPy array.

    Args:
        text: A string or list of strings, tokenized with the module-level
            ``bert_tokenizer`` (padded and truncated).

    Returns:
        Array with one row per input text, averaged over token positions.
    """
    inputs = bert_tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = _get_bert_model()(**inputs)
    hidden = outputs.last_hidden_state
    # Average over real tokens only; with batched input the padded positions
    # would otherwise drag the mean towards the padding vectors.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.numpy()

In [None]:
# Contraction suffix/word -> expansion. Whole-word entries come first so they
# win over the generic "n't" / "'t" suffixes.
_CONTRACTIONS = {
    "can't": "cannot",
    "won't": "will not",
    "n't": " not",
    "'re": " are",
    "'s": " is",
    "'d": " would",
    "'ll": " will",
    "'t": " not",
    "'ve": " have",
    "'m": " am"
}

# Compiled once at definition time. Longest keys first, escaped, and
# case-insensitive so that e.g. "Can't" is not split into "Ca" + " not".
_CONTRACTIONS_RE = re.compile(
    '(%s)' % '|'.join(
        re.escape(key) for key in sorted(_CONTRACTIONS, key=len, reverse=True)
    ),
    flags=re.IGNORECASE,
)


def expand_contractions(text):
    """Expand common English contractions in ``text``.

    Matching ignores case; when the matched contraction starts with an
    uppercase letter the expansion is capitalized too ("Can't" -> "Cannot").

    Args:
        text: Input string.

    Returns:
        The string with every known contraction replaced by its expansion.
    """
    def replace(match):
        found = match.group(0)
        expanded = _CONTRACTIONS[found.lower()]
        if found[0].isupper():
            expanded = expanded[0].upper() + expanded[1:]
        return expanded
    return _CONTRACTIONS_RE.sub(replace, text)

In [None]:
def correct_spelling(text):
    """Return ``text`` after TextBlob's ``correct()`` pass, as a plain ``str``."""
    corrected = TextBlob(text).correct()
    return str(corrected)

In [None]:
def handle_missing_data(text):
    """Return the placeholder ``"missing"`` for null values, otherwise ``text`` unchanged."""
    return "missing" if pd.isnull(text) else text

In [None]:
# Load the cleaned CSV file (latin1-encoded); this replaces the `df` loaded
# from 'Impact.csv' above and is the frame used by every later cell.
df = pd.read_csv('Impact_cleaned_data.csv',encoding='latin1')
df.head()

# Exploratory Data Analysis

In [None]:
# Word count per text column: new length column -> source text column
length_sources = {
    'Title_length': 'Title',
    'Summary_length': '1. Summary of the impact',
    'Research_length': '2. Underpinning research',
    'References_length': '3. References to the research',
    'Details_length': '4. Details of the impact',
    'Sources_length': '5. Sources to corroborate the impact',
}
for length_col, text_col in length_sources.items():
    # str() guards against non-string cells such as NaN
    df[length_col] = df[text_col].apply(lambda cell: len(str(cell).split()))

length_cols = list(length_sources)

# Summary statistics for text lengths
print(df[length_cols].describe())

# Visualize text length distributions
df[length_cols].hist(bins=30, figsize=(15, 10))
plt.suptitle('Text Length Distributions')
plt.show()

In [None]:
def get_vocabulary_size(text_series):
    """Count the distinct whitespace-separated tokens across all texts.

    Missing values (None/NaN) are skipped and other non-string cells are
    converted with ``str()``, so a column containing NaN no longer raises.

    Args:
        text_series: Iterable of texts (e.g. a pandas Series).

    Returns:
        Number of unique tokens.
    """
    vocabulary = set()
    for text in text_series:
        if pd.isnull(text):
            continue
        vocabulary.update(str(text).split())
    return len(vocabulary)

# Vocabulary size for each text column
(
    title_vocab_size,
    summary_vocab_size,
    research_vocab_size,
    references_vocab_size,
    details_vocab_size,
    sources_vocab_size,
) = (
    get_vocabulary_size(df[column])
    for column in (
        'Title',
        '1. Summary of the impact',
        '2. Underpinning research',
        '3. References to the research',
        '4. Details of the impact',
        '5. Sources to corroborate the impact',
    )
)

for label, size in (
    ("Vocabulary size in Titles:", title_vocab_size),
    ("Vocabulary size in Summaries:", summary_vocab_size),
    ("Vocabulary size in Research:", research_vocab_size),
    ("Vocabulary size in References:", references_vocab_size),
    ("Vocabulary size in Details:", details_vocab_size),
    ("Vocabulary size in Sources:", sources_vocab_size),
):
    print(label, size)

In [None]:
stop_words = set(stopwords.words('english'))

# Function to get word frequency
def get_word_frequency(text_series):
    """Count whitespace-separated tokens across all texts, ignoring stop words.

    Missing values (None/NaN) are skipped and other non-string cells are
    converted with ``str()``, so a column containing NaN no longer raises.
    Stop-word matching against the module-level ``stop_words`` is exact
    (case-sensitive).

    Args:
        text_series: Iterable of texts (e.g. a pandas Series).

    Returns:
        ``collections.Counter`` mapping token -> occurrence count.
    """
    word_freq = Counter()
    for text in text_series:
        if pd.isnull(text):
            continue
        word_freq.update(word for word in str(text).split() if word not in stop_words)
    return word_freq

# Word frequencies for each text column
(
    title_word_freq,
    summary_word_freq,
    research_word_freq,
    references_word_freq,
    details_word_freq,
    sources_word_freq,
) = (
    get_word_frequency(df[column])
    for column in (
        'Title',
        '1. Summary of the impact',
        '2. Underpinning research',
        '3. References to the research',
        '4. Details of the impact',
        '5. Sources to corroborate the impact',
    )
)

# Print the 10 most common words per column
for label, freq in (
    ("Top 10 common words in Titles:", title_word_freq),
    ("Top 10 common words in Summaries:", summary_word_freq),
    ("Top 10 common words in Research:", research_word_freq),
    ("Top 10 common words in References:", references_word_freq),
    ("Top 10 common words in Details:", details_word_freq),
    ("Top 10 common words in Sources:", sources_word_freq),
):
    print(label, freq.most_common(10))

In [None]:
# Function to get n-grams
def get_ngrams(text_series, n=2):
    """Return ``(ngram, total_count)`` pairs for ``text_series``, most frequent first.

    Only n-grams of exactly length ``n`` are counted (``ngram_range=(n, n)``).
    """
    counter = CountVectorizer(ngram_range=(n, n))
    doc_term = counter.fit_transform(text_series)
    totals = doc_term.sum(axis=0)  # column sums = corpus-wide count per n-gram
    pairs = []
    for term, column in counter.vocabulary_.items():
        pairs.append((term, totals[0, column]))
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs

# Bigrams for each text column
(
    title_bigrams,
    summary_bigrams,
    research_bigrams,
    references_bigrams,
    details_bigrams,
    sources_bigrams,
) = (
    get_ngrams(df[column], n=2)
    for column in (
        'Title',
        '1. Summary of the impact',
        '2. Underpinning research',
        '3. References to the research',
        '4. Details of the impact',
        '5. Sources to corroborate the impact',
    )
)

# Print the 10 most frequent bigrams per column
for label, bigrams in (
    ("Top 10 bigrams in Titles:", title_bigrams),
    ("Top 10 bigrams in Summaries:", summary_bigrams),
    ("Top 10 bigrams in Research:", research_bigrams),
    ("Top 10 bigrams in References:", references_bigrams),
    ("Top 10 bigrams in Details:", details_bigrams),
    ("Top 10 bigrams in Sources:", sources_bigrams),
):
    print(label, bigrams[:10])


In [None]:
def plot_word_freq(word_freq, title):
    """Draw a bar chart of the 10 most common entries of ``word_freq``.

    Args:
        word_freq: ``collections.Counter`` of token counts.
        title: Chart title.

    Returns:
        None. Nothing is drawn when ``word_freq`` is empty.
    """
    top_items = word_freq.most_common(10)
    if not top_items:
        # zip(*[]) yields nothing to unpack, so bail out instead of raising.
        print(f"No data to plot for '{title}'")
        return
    words, counts = zip(*top_items)
    plt.figure(figsize=(10, 6))
    plt.bar(words, counts)
    plt.title(title)
    plt.xticks(rotation=45)
    plt.show()

# Plot top 10 words for every text column
for freq, chart_title in (
    (title_word_freq, 'Top 10 Words in Titles'),
    (summary_word_freq, 'Top 10 Words in Summaries'),
    (research_word_freq, 'Top 10 Words in Research'),
    (references_word_freq, 'Top 10 Words in References'),
    (details_word_freq, 'Top 10 Words in Details'),
    (sources_word_freq, 'Top 10 Words in Sources'),
):
    plot_word_freq(freq, chart_title)

def plot_ngram_freq(ngram_freq, title):
    """Draw a bar chart of the first 10 ``(ngram, count)`` pairs in ``ngram_freq``.

    Args:
        ngram_freq: Sequence of ``(ngram, count)`` pairs, already sorted.
        title: Chart title.

    Returns:
        None. Nothing is drawn when ``ngram_freq`` is empty.
    """
    top_items = ngram_freq[:10]
    if not top_items:
        # zip(*[]) yields nothing to unpack, so bail out instead of raising.
        print(f"No data to plot for '{title}'")
        return
    ngrams, counts = zip(*top_items)
    plt.figure(figsize=(10, 6))
    plt.bar(ngrams, counts)
    plt.title(title)
    plt.xticks(rotation=45)
    plt.show()

# Plot top 10 bigrams for every text column
for bigrams, chart_title in (
    (title_bigrams, 'Top 10 Bigrams in Titles'),
    (summary_bigrams, 'Top 10 Bigrams in Summaries'),
    (research_bigrams, 'Top 10 Bigrams in Research'),
    (references_bigrams, 'Top 10 Bigrams in References'),
    (details_bigrams, 'Top 10 Bigrams in Details'),
    (sources_bigrams, 'Top 10 Bigrams in Sources'),
):
    plot_ngram_freq(bigrams, chart_title)

# Word Embeddings

In [None]:
# Combine all text columns into a single text column for feature extraction.
# A plain `+` concatenation turns the whole row into NaN as soon as one
# section is missing, and later `.split()` calls then fail on a float; treat
# missing sections as empty strings instead.
text_columns = [
    'Title',
    '1. Summary of the impact',
    '2. Underpinning research',
    '3. References to the research',
    '4. Details of the impact',
    '5. Sources to corroborate the impact',
]
df['combined_text'] = df[text_columns].fillna('').astype(str).agg(' '.join, axis=1)

In [None]:
# Load pre-trained word2vec model through the gensim downloader API.
# (presumably 300-dimensional, matching the name and the vector_size=300
# default used below — TODO confirm)
word2vec_model = api.load("word2vec-google-news-300")

def get_average_word2vec(text, model, vector_size=300):
    """Average the vectors of all whitespace tokens of ``text`` known to ``model``.

    Args:
        text: Input string.
        model: Mapping-like object supporting ``token in model`` and ``model[token]``.
        vector_size: Length of the zero vector returned when no token is known.

    Returns:
        The element-wise mean vector, or ``np.zeros(vector_size)`` if no token matched.
    """
    vectors = [model[token] for token in text.split() if token in model]
    if not vectors:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

# One averaged word vector per document, stored as an array in each cell
df['word2vec'] = df['combined_text'].apply(lambda x: get_average_word2vec(x, word2vec_model))

# Convert to numpy array (stack the per-row vectors into one 2-D matrix)
word2vec_features = np.vstack(df['word2vec'].values)

print("Word2Vec Features Shape:", word2vec_features.shape)

# Convert Word2Vec features into a DataFrame with columns w2v_0 .. w2v_299
# (the hard-coded 300 must match the vector length returned above)
word2vec_df = pd.DataFrame(df['word2vec'].to_list(), columns=[f'w2v_{i}' for i in range(300)])

# Display Word2Vec features
print("Word2Vec Features:\n", word2vec_df.head())

In [None]:
# Load pre-trained BERT model and tokenizer.
# Note: the module-level names `tokenizer` and `model` are rebound again by
# later cells (the Keras tokenizer and the BERT classification model).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embedding(text, tokenizer, model):
    """Return the mean of the model's last hidden states for ``text``.

    Args:
        text: Input passed to ``tokenizer`` (truncated and padded).
        tokenizer: Callable returning a dict of tensors accepted by ``model``.
        model: Callable whose output exposes ``last_hidden_state``.

    Returns:
        NumPy array holding the hidden states averaged over dim 1.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: without no_grad every call would build (and keep alive
    # until detach) an autograd graph for the whole forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state
    return torch.mean(last_hidden_states, dim=1).detach().numpy()

df['bert'] = df['combined_text'].apply(lambda x: get_bert_embedding(x, tokenizer, model))

# Each cell holds one embedding with a leading batch axis of size 1;
# stacking them vertically yields a 2-D (n_documents, hidden_size) array.
bert_features = np.vstack(df['bert'].values)

print("BERT Features Shape:", bert_features.shape)

# Convert BERT features into a DataFrame. Build it from the stacked 2-D
# array: the raw list of per-row arrays is 3-D, which DataFrame rejects.
bert_df = pd.DataFrame(bert_features, columns=[f'bert_{i}' for i in range(model.config.hidden_size)])

# Display BERT features
print("BERT Embeddings:\n", bert_df.head())

In [None]:
# Drop the singleton batch axis from the stored embeddings
stacked_embeddings = np.array(df['bert'].to_list())
bert_features = np.squeeze(stacked_embeddings, axis=1)

# Wrap the squeezed BERT features in a DataFrame with columns bert_0 .. bert_{hidden_size-1}
bert_columns = [f'bert_{i}' for i in range(model.config.hidden_size)]
bert_df = pd.DataFrame(bert_features, columns=bert_columns)

# Display BERT features
print("BERT Features Shape:", bert_df.shape)
print("BERT Features:\n", bert_df.head())

In [None]:
# Run the spaCy pipeline once per document and keep the parsed result, so the
# extract_* helpers below can reuse it.
df['doc'] = df['combined_text'].apply(nlp)

# Extract Named Entities, POS tags, and Dependency Parsing
def extract_ner(doc):
    """Return ``(entity_text, label)`` pairs for every entity in ``doc.ents``."""
    pairs = []
    for ent in doc.ents:
        pairs.append((ent.text, ent.label_))
    return pairs

def extract_pos_tags(doc):
    """Return ``(token_text, pos)`` pairs for every token in ``doc``."""
    tagged = []
    for token in doc:
        tagged.append((token.text, token.pos_))
    return tagged

def extract_dependencies(doc):
    """Return ``(token_text, dependency_label, head_text)`` triples for every token in ``doc``."""
    triples = []
    for token in doc:
        triples.append((token.text, token.dep_, token.head.text))
    return triples

# Derive the three linguistic feature columns from the cached spaCy docs
df['named_entities'] = df['doc'].apply(extract_ner)
df['pos_tags'] = df['doc'].apply(extract_pos_tags)
df['dependencies'] = df['doc'].apply(extract_dependencies)

In [None]:
# TF-IDF over the combined text, limited to the 1000 highest-ranked terms
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_text'])

# Convert TF-IDF matrix to dense format once and wrap it in a DataFrame
dense_matrix = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(dense_matrix, columns=tfidf_vectorizer.get_feature_names_out())

# Display TF-IDF features
print("TF-IDF Features:\n", tfidf_df.head())

# Add the target variable 'overall_rating'.
# NOTE(review): in this file 'overall_rating' is assigned in the
# "Assign Proxy Ratings" section further down; unless the CSV already has
# that column, this cell must run after it — confirm execution order.
tfidf_df['overall_rating'] = df['overall_rating']

# Compute correlation matrix
correlation_matrix = tfidf_df.corr()

# Extract correlations of 'overall_rating' with the other features. The
# target's correlation with itself is always 1.0, so drop it; otherwise it
# takes one of the ten slots in the chart.
overall_rating_correlations = (
    correlation_matrix['overall_rating']
    .drop(labels='overall_rating')
    .sort_values(ascending=False)
)

# Plotting the top correlations
top_correlations = overall_rating_correlations.head(10)

plt.figure(figsize=(10, 6))
sns.barplot(x=top_correlations.values, y=top_correlations.index)
plt.title('Top 10 Correlations with Overall Rating')
plt.xlabel('Correlation Coefficient')
plt.ylabel('Features')
plt.show()

In [None]:
# Combine all features into a single DataFrame (column-wise concat; rows are
# matched on the index of each frame)
features_df = pd.concat([tfidf_df, word2vec_df, bert_df], axis=1)

# The combined features_df now contains TF-IDF, Word2Vec, and BERT embeddings.
# (tfidf_df also carries the 'overall_rating' column added in the TF-IDF cell.)
print("Combined Feature DataFrame:\n", features_df.head())


In [None]:
# Add REF scores to the features DataFrame for correlation analysis
features_df['overall_rating'] = df['overall_rating']

# Compute correlation matrix
correlation_matrix = features_df.corr()

# Visualize the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix of Features with REF Scores')
plt.show()

# Identify top features correlated with REF score. The target correlates
# perfectly with itself, so exclude it before taking the top 10.
top_correlated_features = (
    correlation_matrix['overall_rating']
    .drop(labels='overall_rating')
    .sort_values(ascending=False)
    .head(10)
)
print("Top 10 Features Correlated with REF Score:\n", top_correlated_features)

In [None]:
# Reuses `correlation_matrix` computed from features_df in the previous cell
# (seaborn / matplotlib are already imported at the top of the notebook).

# Compute correlation matrix
# correlation_matrix = features_df.corr()

# Rank features by absolute correlation with the REF score. head(11) is used
# because the target itself (correlation 1.0) occupies the first slot, leaving
# the 10 strongest positively or negatively correlated features.
top_correlated_features = correlation_matrix['overall_rating'].abs().sort_values(ascending=False).head(11).index

# Create a focused correlation matrix for these features
focused_corr_matrix = correlation_matrix.loc[top_correlated_features, top_correlated_features]

# Plot the focused correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(focused_corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, cbar_kws={"shrink": 0.8})
plt.title('Top Correlated Features with REF Score')
plt.show()

# Assign Proxy Ratings

In [None]:
# Topic Modeling
def topic_modeling(text, num_topics=5):
    """Fit an LDA model on a single document and derive a coherence-style score.

    Args:
        text: The document to model (stop words are removed by the vectorizer).
        num_topics: Number of LDA components.

    Returns:
        Tuple ``(lda_model, vectorizer, score)`` where ``score`` is the mean,
        over topics, of the average distance between each of the topic's top
        10 terms (as one-term count vectors) and the document vector.
    """
    vectorizer = CountVectorizer(stop_words='english')
    doc_term_matrix = vectorizer.fit_transform([text])
    LDA = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    LDA.fit(doc_term_matrix)
    # Look the vocabulary up once instead of once per top term.
    feature_names = vectorizer.get_feature_names_out()
    coherence_scores = []
    for topic in LDA.components_:
        top_indices = topic.argsort()[-10:]
        top_terms = [feature_names[i] for i in top_indices]
        term_vectors = vectorizer.transform(top_terms).toarray()
        # pairwise_distances_argmin_min returns (argmin, distances). The first
        # element is the index of the nearest row — always 0 here because
        # there is a single document — so the score must use the distances.
        _, distances = pairwise_distances_argmin_min(term_vectors, doc_term_matrix)
        coherence_scores.append(distances.mean())
    return LDA, vectorizer, sum(coherence_scores) / len(coherence_scores)

In [None]:
# Keyword Extraction using TF-IDF
def keyword_extraction(text):
    """Return the 10 highest-scoring terms of ``text`` as a DataFrame.

    The TF-IDF vectorizer is fitted on this one document only; the result has
    the columns ``'Word'`` and ``'TF-IDF Score'``, sorted by score descending.
    """
    vec = TfidfVectorizer(stop_words='english')
    weights = vec.fit_transform([text])
    mean_weights = np.asarray(weights.mean(axis=0)).ravel().tolist()
    rows = list(zip(vec.get_feature_names_out(), mean_weights))
    table = pd.DataFrame(rows, columns=['Word', 'TF-IDF Score'])
    ranked = table.sort_values(by='TF-IDF Score', ascending=False)
    return ranked.head(10)

In [None]:
# Dependency Parsing
def dependency_parsing(text):
    """Parse ``text`` with spaCy and summarize its dependency structure.

    Returns:
        Tuple ``(triples, n_tokens, n_distinct_heads)`` where ``triples`` is a
        list of ``(token_text, dependency_label, head_text)``.
    """
    triples = []
    head_texts = set()
    for token in nlp(text):
        head_text = token.head.text
        triples.append((token.text, token.dep_, head_text))
        head_texts.add(head_text)
    return triples, len(triples), len(head_texts)

In [None]:
# Sentiment Analysis
def sentiment_analysis(text):
    """Return TextBlob's ``sentiment`` value for ``text`` (its ``polarity`` is used downstream)."""
    return TextBlob(text).sentiment

In [None]:
# Semantic Similarity
def semantic_similarity(main_text, comparison_texts):
    """Cosine similarity between ``main_text`` and each of ``comparison_texts``.

    Both inputs are encoded with the module-level ``embedder``.
    NOTE(review): no assignment to ``embedder`` is visible in this part of the
    file — confirm it is created (e.g. a SentenceTransformer) before calling.

    Returns:
        Flat NumPy array of similarity scores.
    """
    query_vector = embedder.encode(main_text, convert_to_tensor=True)
    candidate_vectors = embedder.encode(comparison_texts, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(query_vector, candidate_vectors)
    return scores.cpu().numpy().flatten()

In [None]:
# Simple Summarization
def simple_summarization(text, num_sentences=3):
    """Return the first ``num_sentences`` chunks of ``text``, splitting on ``'. '``."""
    leading = text.split('. ')[:num_sentences]
    summary = '. '.join(leading)
    return summary

In [None]:
# Pre-compute per-document sentiment and a short lead summary; both are
# passed to combine_results() via process_paper() below.
df['sentiment_value'] = df['combined_text'].apply(sentiment_analysis)
df['summarization'] = df['combined_text'].apply(simple_summarization)

In [None]:
# Normalize score function
def normalize_score(score, min_val, max_val):
    """Min-max scale ``score`` into the range spanned by ``min_val``..``max_val``.

    Args:
        score: Value (or NumPy array of values) to scale.
        min_val: Lower bound of the source range.
        max_val: Upper bound of the source range.

    Returns:
        ``(score - min_val) / (max_val - min_val)``; ``0.0`` when the range is
        degenerate (``max_val == min_val``) and the inputs are scalars, instead
        of raising ZeroDivisionError.
    """
    span = max_val - min_val
    # Only short-circuit for scalar ranges; array inputs keep NumPy semantics.
    if np.isscalar(span) and span == 0:
        return 0.0
    return (score - min_val) / span

In [None]:
# Combine all features for the final rating
def combine_results(text,named_entities,dependencies,sentiment_value,summarization):
    """Compute the raw originality, significance and rigour scores for one paper.

    Args:
        text: Combined document text; it is re-parsed, topic-modelled and
            keyword-scored here.
        named_entities: Pre-extracted entities; only their count is used.
        dependencies: Accepted for interface compatibility; not read.
        sentiment_value: Object exposing ``polarity``.
        summarization: Summary string; only its word count is used.

    Returns:
        Dict with keys ``'originality_score'``, ``'significance_score'`` and
        ``'rigour_score'``, each holding a one-element list with the raw score.
    """
    # Text-derived measurements
    _, dependency_count, head_count = dependency_parsing(text)
    _, _, coherence = topic_modeling(text)
    keywords = keyword_extraction(text)

    # Measurements from the pre-computed inputs
    summary_words = len(summarization.split())
    entity_count = len(named_entities)
    if keywords.empty:
        best_keyword_score = 0
    else:
        best_keyword_score = keywords.iloc[0]['TF-IDF Score']

    # Raw (un-normalized) scores; each combines sentiment polarity with
    # scaled counts.
    polarity = sentiment_value.polarity
    originality = polarity + (entity_count / 10) + (best_keyword_score / 100) + (coherence / 5)
    significance = polarity + (dependency_count / 50) + (head_count / 10) + (summary_words / 50)
    rigour = polarity + (entity_count / 10) + (dependency_count / 50) + (summary_words / 50)

    # Callers unwrap these with score[0], so keep the one-element lists.
    return {
        'originality_score': [originality],
        'significance_score': [significance],
        'rigour_score': [rigour],
    }

In [None]:
# Process each paper in parallel
def process_paper(row):
    """Score one DataFrame row by forwarding its pre-computed columns to combine_results()."""
    fields = ('combined_text', 'named_entities', 'doc', 'sentiment_value', 'summarization')
    return combine_results(*(row[field] for field in fields))

In [None]:
# Score every paper. `results_list` was referenced here without ever being
# built, so construct it explicitly: one rating dict per row, in row order.
# (joblib's Parallel/delayed could be used instead, but every row carries a
# spaCy Doc that would have to be pickled to the workers.)
results_list = [process_paper(row) for _, row in df.iterrows()]

# Adding results to the DataFrame. Reuse df's index so the column assignment
# lines up row-for-row even when df does not have a default RangeIndex.
ratings_df = pd.DataFrame(results_list, index=df.index)
score_columns = ['originality_score', 'significance_score', 'rigour_score']
df[score_columns] = ratings_df[score_columns]

In [None]:
# combine_results() wraps every score in a one-element list. Unwrap it first
# (as done for rigour and significance); min/max and the subtraction inside
# normalize_score() do not work on lists. The isinstance check keeps the cell
# safe to re-run after the column has already been unwrapped.
df['originality_score'] = [
    score[0] if isinstance(score, (list, tuple)) else score
    for score in df['originality_score']
]
# Normalize scores to the 0-1 range
min_orginality_score = df['originality_score'].min()
max_score_orginality = df['originality_score'].max()
df['orginality_normalized'] = [normalize_score(score,min_orginality_score,max_score_orginality) for score in df['originality_score']]

In [None]:
# Unwrap the one-element lists produced by combine_results(). The isinstance
# check keeps the cell safe to re-run once the column holds plain numbers.
rigour_score = [
    score[0] if isinstance(score, (list, tuple)) else score
    for score in df['rigour_score']
]
df['rigour_score']=rigour_score
# Normalize scores to the 0-1 range using the rigour column's OWN min/max.
# (Previously the originality bounds were passed here, so the result was not
# a 0-1 scaling of the rigour scores.)
min_rigour_score = df['rigour_score'].min()
max_rigour_orginality = df['rigour_score'].max()
df['rigour_normalized'] = [normalize_score(score,min_rigour_score,max_rigour_orginality) for score in df['rigour_score']]

In [None]:
# Unwrap the one-element lists produced by combine_results(). The isinstance
# check keeps the cell safe to re-run once the column holds plain numbers.
significance_score = [
    score[0] if isinstance(score, (list, tuple)) else score
    for score in df['significance_score']
]
df['significance_score']=significance_score
# Normalize scores to the 0-1 range using the significance column's OWN
# min/max. (Previously the originality bounds were passed here, so the result
# was not a 0-1 scaling of the significance scores.)
min_significance_score = df['significance_score'].min()
max_significance_orginality = df['significance_score'].max()
df['significance_normalized'] = [normalize_score(score,min_significance_score,max_significance_orginality) for score in df['significance_score']]

In [None]:
# Determine ratings based on normalized scores
def determine_originality_rating(score):
    """Map a normalized originality score to a 1-4 rating.

    Cut-offs are strict: > 0.2 -> 4, > 0.15 -> 3, > 0.1 -> 2, otherwise 1.
    """
    for rating, cutoff in ((4, 0.2), (3, 0.15), (2, 0.1)):
        if score > cutoff:
            return rating
    return 1

In [None]:
# Determine ratings based on normalized scores
def determine_significance_rating(score):
    """Map a normalized significance score to a 1-4 rating.

    Cut-offs are strict: > 0.15 -> 4, > 0.1 -> 3, > 0.05 -> 2, otherwise 1.
    """
    for rating, cutoff in ((4, 0.15), (3, 0.1), (2, 0.05)):
        if score > cutoff:
            return rating
    return 1

In [None]:
# Determine ratings based on normalized scores
def determine_rigour_rating(score):
    """Map a normalized rigour score to a 1-4 rating.

    Cut-offs are strict: > 0.4 -> 4, > 0.3 -> 3, > 0.2 -> 2, otherwise 1.
    """
    for rating, cutoff in ((4, 0.4), (3, 0.3), (2, 0.2)):
        if score > cutoff:
            return rating
    return 1

In [None]:
# Convert each normalized score into a 1-4 rating using its own thresholds
df['originality']=df['orginality_normalized'].apply(determine_originality_rating)
df['significance']=df['significance_normalized'].apply(determine_significance_rating)
df['rigour']=df['rigour_normalized'].apply(determine_rigour_rating)

In [None]:
# Calculate the average of the three rating columns per row, rounded to an
# integer; this proxy label is the target used by all models below.
df['overall_rating'] = df[['originality', 'significance', 'rigour']].mean(axis=1).round().astype(int)
# Show how many papers fall into each overall rating
print(df['overall_rating'].value_counts())

# Split data for Train and Test

In [None]:
# Extract TF-IDF features from the text.
# Note: the vectorizer is fitted on ALL rows before the split below, so its
# vocabulary and IDF weights also reflect the test documents.
vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features based on your dataset
X = vectorizer.fit_transform(df['combined_text'])
y = df['overall_rating']
# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest

In [None]:
# Random Forest: 5-fold cross-validated accuracy on the full X / y
print("Evaluating Random Forest")
rf_model = RandomForestClassifier()
cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring='accuracy')
print(f"Cross-validation results for Random Forest:")
print(f"Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}\n")

# NOTE(review): the metrics printed below come from cross_val_predict on the
# full X / y, so this fit on the training split does not appear to feed into
# them — confirm whether a held-out evaluation on X_test was intended.
rf_model.fit(X_train, y_train)
y_pred = cross_val_predict(rf_model, X, y, cv=5)
print(f"Results for Random Forest:")
print("Accuracy:", accuracy_score(y, y_pred))
print(classification_report(y, y_pred))
print("\n")

In [None]:
# Hyperparameter optimization using GridSearchCV for Random Forest
# (3 x 4 x 3 = 36 parameter combinations, each evaluated with 5-fold CV)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10]
}

grid_search_rf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)

# Best parameters
print("Best parameters for Random Forest: ", grid_search_rf.best_params_)

# Train the best model on the training split
# NOTE(review): GridSearchCV refits the best estimator by default, so this
# extra fit is presumably redundant — confirm before removing.
best_model_rf = grid_search_rf.best_estimator_
best_model_rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rf = best_model_rf.predict(X_test)

# Evaluate the model on the held-out test split
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

In [None]:
# Importance score of every TF-IDF feature according to the tuned forest
feature_importance = best_model_rf.feature_importances_
# Get feature names corresponding to TF-IDF features
feature_names = vectorizer.get_feature_names_out()

# Create a DataFrame to hold feature names and their importance scores,
# sorted with the most important feature first.
# (X_train is a matrix returned by the vectorizer, not a DataFrame, so the
# names come from the vectorizer rather than from X_train.columns.)

importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

# Plot feature importances (top 20, largest at the top of the chart)
plt.figure(figsize=(10, 6))
plt.barh(importance_df['Feature'].head(20), importance_df['Importance'].head(20), color='royalblue')
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Top 20 Random Forest Feature Importances')
plt.gca().invert_yaxis()
plt.show()

# Print the feature importance
print(importance_df.head(20))  # Show top 20 features for brevity

In [None]:
# precision_score / recall_score / f1_score are not part of the notebook's
# top-level sklearn.metrics import, so bring them in here.
from sklearn.metrics import precision_score, recall_score, f1_score

# Evaluation metrics for Random Forest (weighted by class support)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, average='weighted')
recall_rf = recall_score(y_test, y_pred_rf, average='weighted')
f1_rf = f1_score(y_test, y_pred_rf, average='weighted')

print("Random Forest Metrics:")
print(f"Accuracy: {accuracy_rf}")
print(f"Precision: {precision_rf}")
print(f"Recall: {recall_rf}")
print(f"F1 Score: {f1_rf}")

# Neural Network LSTM

In [None]:
# The Tokenizer import is commented out in the notebook's import cell, so the
# name is undefined here; import it locally.
# NOTE(review): this module path exists in Keras 2.x — confirm it against the
# installed Keras version.
from keras.preprocessing.text import Tokenizer

max_words = 1000  # vocabulary size kept by the tokenizer
max_len = 100     # every sequence is padded/truncated to this length
tokenizer = Tokenizer(num_words=max_words, lower=True, split=' ')
tokenizer.fit_on_texts(df['combined_text'].values)
X_seq = tokenizer.texts_to_sequences(df['combined_text'].values)
X_pad = pad_sequences(X_seq, maxlen=max_len)
# One-hot encode the ratings; column order follows the sorted rating values
y_cat = pd.get_dummies(df['overall_rating']).values

X_train_pad, X_test_pad, y_train_cat, y_test_cat = train_test_split(X_pad, y_cat, test_size=0.2, random_state=42)

# Define LSTM model
def create_lstm_model():
    """Build and compile the LSTM classifier.

    Layers: embedding (``max_words`` x 128) -> spatial dropout -> LSTM(100)
    -> softmax over ``y_cat.shape[1]`` classes. Reads the module-level
    ``max_words``, ``max_len`` and ``y_cat``.
    """
    network = Sequential()
    for layer in (
        Embedding(max_words, 128, input_length=max_len),
        SpatialDropout1D(0.2),
        LSTM(100, dropout=0.2, recurrent_dropout=0.2),
        Dense(y_cat.shape[1], activation='softmax'),
    ):
        network.add(layer)
    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

# Train and evaluate LSTM model (10% of the training split is held out for
# validation during fitting)
print("Evaluating LSTM")
lstm_model = create_lstm_model()
lstm_model.fit(X_train_pad, y_train_cat, epochs=10, batch_size=64, validation_split=0.1)
lstm_y_pred = lstm_model.predict(X_test_pad)
# Convert predicted probabilities and one-hot targets back to class indices
lstm_y_pred_classes = lstm_y_pred.argmax(axis=-1)
lstm_y_test_classes = y_test_cat.argmax(axis=-1)

print("Results for LSTM:")
print("Accuracy:", accuracy_score(lstm_y_test_classes, lstm_y_pred_classes))
print(classification_report(lstm_y_test_classes, lstm_y_pred_classes))
print("\n")

In [None]:
from sklearn.metrics import accuracy_score

def permutation_importance(model, X_test_pad, y_test_cat, n_repeats=10):
    """Estimate how much each input position contributes to accuracy.

    For every column of ``X_test_pad`` the column is shuffled ``n_repeats``
    times and the mean drop in accuracy relative to the unshuffled baseline
    is recorded. Shuffling uses NumPy's global random state.

    Args:
        model: Object with ``predict(X)`` returning per-class scores.
        X_test_pad: 2-D array of padded sequences (rows = samples).
        y_test_cat: One-hot encoded targets.
        n_repeats: Number of shuffles per column.

    Returns:
        1-D array with one importance value per column of ``X_test_pad``.
    """
    # The true labels never change, so decode them once.
    y_true = y_test_cat.argmax(axis=-1)
    baseline_score = accuracy_score(y_true, model.predict(X_test_pad).argmax(axis=-1))
    n_positions = X_test_pad.shape[1]
    importance = np.zeros(n_positions)
    for i in range(n_positions):
        score_diffs = []
        for _ in range(n_repeats):
            X_permuted = X_test_pad.copy()
            np.random.shuffle(X_permuted[:, i])  # Shuffle the i-th feature (word position) in place
            permuted_score = accuracy_score(y_true, model.predict(X_permuted).argmax(axis=-1))
            score_diffs.append(baseline_score - permuted_score)
        importance[i] = np.mean(score_diffs)

    return importance

# Compute permutation importance of each padded word position for the LSTM
# (runs model.predict once per position per repeat, so this can be slow)
perm_importance = permutation_importance(lstm_model, X_test_pad, y_test_cat)

# Plot permutation importance: one bar per word position
plt.figure(figsize=(10, 6))
plt.bar(range(len(perm_importance)), perm_importance)
plt.xlabel('Feature (Word Position)')
plt.ylabel('Importance')
plt.title('Permutation Feature Importance')
plt.show()

In [None]:
# precision_score / recall_score / f1_score are not part of the notebook's
# top-level sklearn.metrics import, so bring them in here.
from sklearn.metrics import precision_score, recall_score, f1_score

# Evaluation metrics for LSTM (weighted by class support)
accuracy_lstm = accuracy_score(lstm_y_test_classes, lstm_y_pred_classes)
precision_lstm = precision_score(lstm_y_test_classes, lstm_y_pred_classes, average='weighted')
recall_lstm = recall_score(lstm_y_test_classes, lstm_y_pred_classes, average='weighted')
f1_lstm = f1_score(lstm_y_test_classes, lstm_y_pred_classes, average='weighted')

print("\nLSTM Metrics:")
print(f"Accuracy: {accuracy_lstm}")
print(f"Precision: {precision_lstm}")
print(f"Recall: {recall_lstm}")
print(f"F1 Score: {f1_lstm}")

# BERT Model

In [None]:
# Split the data (80/20, same seed as the other models)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['combined_text'], df['overall_rating'], test_size=0.2, random_state=42)

# Load tokenizer (rebinds `tokenizer`, which previously held the Keras tokenizer)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data; truncation cuts texts longer than the model's maximum
# input length and padding equalizes lengths within each set
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True)

In [None]:
class TextDataset(Dataset):
    """Torch dataset pairing tokenizer encodings with float labels.

    Each item is a dict holding one tensor per encoding key plus a
    ``'labels'`` tensor of dtype float.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for key, values in self.encodings.items():
            sample[key] = torch.tensor(values[idx])
        # Ensure labels are float
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# AdamW is not imported anywhere in the notebook; take it from torch.optim.
from torch.optim import AdamW

train_dataset = TextDataset(train_encodings, train_labels.tolist())
test_dataset = TextDataset(test_encodings, test_labels.tolist())

# Load pre-trained model with a single output; together with the float labels
# produced by TextDataset, the rating is predicted as one continuous value
# (it is rounded back to a label at evaluation time).
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

# Define the optimizer.
# NOTE(review): the Trainer instances created below are not given this
# optimizer, so it is presumably unused — confirm whether it should be passed in.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
from transformers import Trainer, TrainingArguments

# Baseline fine-tuning configuration for the Hugging Face Trainer.
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # 10 epochs with batches of 16 for both training and evaluation
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimisation schedule
    warmup_steps=500,
    weight_decay=0.01,
)

# Train on the training split; the held-out split is registered for evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

trainer.train()


In [None]:
# Evaluate the model on the eval dataset registered with the trainer
# (the returned metrics dict is not stored here)
trainer.evaluate()

# Make predictions on the test split; `predictions` is the trainer's output
# object and `predicted_ratings` keeps only its raw model outputs
predictions = trainer.predict(test_dataset)
predicted_ratings = predictions.predictions

In [None]:
# Drop singleton dimensions from the raw outputs, round each prediction to
# the nearest whole rating, and measure the share of exact matches.
predicted_ratings = np.squeeze(predicted_ratings)
predicted_labels = np.round(predicted_ratings)
accuracy = (predicted_labels == test_labels.to_numpy()).mean()

print(f'Accuracy: {accuracy * 100:.2f}%')

# Persist the fine-tuned weights and the tokenizer side by side so they can
# be reloaded together.
model.save_pretrained(save_directory='saved_model')
tokenizer.save_pretrained(save_directory='saved_model')

In [None]:
# from transformers import Trainer, TrainingArguments

# Second training configuration: same epochs and batch sizes as the baseline,
# plus an explicit learning-rate schedule and best-checkpoint selection.
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # 10 epochs with batches of 16 for both training and evaluation
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimisation schedule: linear scheduler with 500 warmup steps
    learning_rate=3e-5,
    lr_scheduler_type='linear',
    warmup_steps=500,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same step-based schedule so the best
    # checkpoint (judged by eval_loss) can be restored when training ends
    evaluation_strategy='steps',
    save_strategy='steps',
    metric_for_best_model='eval_loss',
    load_best_model_at_end=True,
)

# Reuses the `model` object already in memory together with the new arguments.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

trainer.train()


In [None]:
# EarlyStoppingCallback is not part of the file's main import cell, so import
# it here (cell-local imports are already used elsewhere in this file).
from transformers import EarlyStoppingCallback

# Initialize Trainer with early stopping: training halts once the monitored
# metric configured in `training_args` fails to improve for 3 consecutive
# evaluations.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train and evaluate the model
trainer.train()
evaluation_results = trainer.evaluate()

# Make predictions on the test split and keep the raw model outputs
predictions = trainer.predict(test_dataset)
predicted_ratings = predictions.predictions

# Calculate accuracy: squeeze singleton dimensions, round each prediction to
# the nearest whole rating, and count exact matches with the true ratings
predicted_ratings = np.squeeze(predicted_ratings)
predicted_labels = np.round(predicted_ratings)
accuracy = np.mean(predicted_labels == test_labels.to_numpy())

print(f'Accuracy: {accuracy * 100:.2f}%')

# Save the model and tokenizer together so they can be reloaded as a pair
model.save_pretrained('saved_model_2')
tokenizer.save_pretrained('saved_model_2')

In [None]:
# Reload the tokenizer and the fine-tuned weights from the same directory
# they were saved to.
tokenizer = BertTokenizer.from_pretrained('saved_model_2')
model = BertForSequenceClassification.from_pretrained('saved_model_2')

In [None]:
# Take the first row by position: a label lookup (`[0]`) raises KeyError when
# the row labelled 0 is no longer present in the frame's index.
sample_text = df['combined_text'].iloc[0]
# Encode the single text as a PyTorch batch of size one
inputs = tokenizer(sample_text, return_tensors='pt', truncation=True, padding=True)

In [None]:
# Switch to evaluation mode and run one forward pass without tracking
# gradients.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
predictions = outputs.logits

# The single logit is the predicted rating; round it to get a whole label.
predicted_rating = predictions.squeeze().item()
predicted_label = round(predicted_rating)
print(
    f"Predicted rating: {predicted_rating}",
    f"Predicted label (rounded rating): {predicted_label}",
    sep="\n",
)

In [None]:
# Forward pass to get attention scores. This is inference only, so run it
# without gradient tracking, like the prediction cell does.
with torch.no_grad():
    outputs = model(**inputs, output_attentions=True)
attentions = outputs.attentions  # Attention scores from each layer

# For simplicity, use the attention scores from the last layer
last_layer_attentions = attentions[-1]  # Shape: (batch_size, num_heads, seq_len, seq_len)

# Average over the attention heads only (dim=1), then drop the batch
# dimension, leaving one token-by-token matrix
avg_attention = last_layer_attentions.mean(dim=1).squeeze(0).numpy()

# Both axes show the same token sequence, so build the tick labels once
token_labels = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze().tolist())

# Plot attention heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(avg_attention, xticklabels=token_labels, yticklabels=token_labels, cmap='viridis')
plt.title('BERT Attention Heatmap')
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# import numpy as np

# Convert true labels to a NumPy integer array
true_labels = test_labels.to_numpy().astype(int)

# Score the per-sample test-set predictions (`predicted_labels`), not the
# single-sample scalar `predicted_label`. They are rounded floats, so cast to
# int to match the dtype of the true labels.
bert_pred_labels = predicted_labels.astype(int)

# Calculate accuracy
accuracy_BERT = accuracy_score(true_labels, bert_pred_labels)
print(f'Accuracy: {accuracy_BERT * 100:.2f}%')

# Calculate precision, recall, and F1-score.
# 'weighted' matches the averaging used for the LSTM metrics, so the models are
# compared on the same footing; 'binary' also fails when the labels (or the
# rounded predictions) take more than two distinct values.
precision_BERT = precision_score(true_labels, bert_pred_labels, average='weighted')
recall_BERT = recall_score(true_labels, bert_pred_labels, average='weighted')
f1_BERT = f1_score(true_labels, bert_pred_labels, average='weighted')

print(f'Precision: {precision_BERT * 100:.2f}%')
print(f'Recall: {recall_BERT * 100:.2f}%')
print(f'F1 Score: {f1_BERT * 100:.2f}%')

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Data
models = ['Random Forest', 'LSTM', 'BERT']
# The LSTM and BERT metrics computed in this file are fractions in [0, 1];
# scale everything to percent so the bars match the 0-100 % axis below.
# NOTE(review): assumes the *_rf values are fractions too, and that
# recall_rf / f1_rf exist alongside accuracy_rf / precision_rf - confirm.
accuracy = [100 * score for score in (accuracy_rf, accuracy_lstm, accuracy_BERT)]
precision = [100 * score for score in (precision_rf, precision_lstm, precision_BERT)]
# Each metric gets its own values (these two previously repeated precision)
recall = [100 * score for score in (recall_rf, recall_lstm, recall_BERT)]
# Named f1_scores so that sklearn's f1_score function is not shadowed
f1_scores = [100 * score for score in (f1_rf, f1_lstm, f1_BERT)]

x = np.arange(len(models))  # The label locations
width = 0.2  # The width of the bars

# Create the figure and the grouped bar chart: four bars per model, centred
# on each model's tick position
fig, ax = plt.subplots(figsize=(8, 6))
bars1 = ax.bar(x - 1.5*width, accuracy, width, label='Accuracy', color='mediumpurple')
bars2 = ax.bar(x - 0.5*width, precision, width, label='Precision', color='cornflowerblue')
bars3 = ax.bar(x + 0.5*width, recall, width, label='Recall', color='orange')
bars4 = ax.bar(x + 1.5*width, f1_scores, width, label='F1-Score', color='mediumseagreen')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_xlabel('Models')
ax.set_ylabel('Scores (%)')
ax.set_title('Model Performance Comparison')
ax.set_xticks(x)
ax.set_xticklabels(models, rotation=25, ha='center')
# Create the legend once, after the labelled bars exist, and apply the
# intended placement here (a legend requested before plotting has no entries)
ax.legend(loc='upper right', bbox_to_anchor=(1.15, 1))
yticks = np.arange(0, 101, 25)  # Ticks every 25 percentage points
ax.set_yticks(yticks)
ax.set_yticklabels([f'{tick}%' for tick in yticks])

def add_labels(bars):
    """Write each bar's height, to two decimals, just above the bar on `ax`."""
    for rect in bars:
        bar_height = rect.get_height()
        # Horizontal centre of the bar
        x_center = rect.get_x() + rect.get_width() / 2
        ax.annotate(
            f'{bar_height:.2f}',
            xy=(x_center, bar_height),
            xytext=(0, 3),  # nudge the text 3 points upwards
            textcoords="offset points",
            ha='center',
            va='bottom',
        )

# Annotate the four metric groups in plotting order.
for bar_group in (bars1, bars2, bars3, bars4):
    add_labels(bar_group)

# Tighten the layout, then render the figure.
fig.tight_layout()

plt.show()