In [None]:
!pip install transformers datasets rouge-score gensim networkx matplotlib

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
from gensim.models import Word2Vec
import itertools
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import matplotlib.pyplot as plt
from rouge_score import rouge_scorer
import torch

# Fetch the NLTK corpora used by the stop-word filter and the WordNet
# lemmatizer created at the end of this section.
nltk.download('stopwords')
nltk.download('wordnet')

csv_file_path = "/content/impression_300_llm.csv"
df = pd.read_csv(csv_file_path)

# Build one free-text field per report. Missing cells are replaced with ''
# (and everything is coerced to str) before concatenating: with plain `+`,
# a single NaN cell turns the whole row's text into NaN, which then breaks
# the string processing and tokenization applied to this column later.
report_name, history, observation = (
    df[column].fillna('').astype(str)
    for column in ('Report Name', 'History', 'Observation')
)
df['text'] = report_name + ' ' + history + ' ' + observation

# .copy() makes the two-column frame independent of the original so that
# adding columns to it later does not raise pandas' SettingWithCopyWarning.
df = df[['text', 'Impression']].copy()

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def process_text(text):
    """Drop English stop words from *text* and lemmatize the remaining words.

    The text is split on whitespace. A word is discarded when its lower-cased
    form is in the module-level ``stop_words`` set; every surviving word is
    passed (in its original casing) to the module-level ``lemmatizer`` using
    its default settings.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The processed words joined by single spaces ('' when nothing is left).
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    kept_words = (word for word in text.split() if word.lower() not in stop_words)
    return ' '.join(lemmatizer.lemmatize(word) for word in kept_words)

# Stop-word-free, lemmatized version of every report's text.
df['processed_text'] = df['text'].map(process_text)

# Load the pretrained GPT-2 tokenizer and causal language model.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token so that padded
# tokenization (padding="max_length") can be used with this tokenizer.
tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    """Tokenize the ``'text'`` field and attach causal-LM training labels.

    The text is padded/truncated to exactly 512 tokens with the module-level
    ``tokenizer``. A ``labels`` entry is added that mirrors ``input_ids``;
    without it the causal language model returns no loss and ``Trainer``
    cannot train. Positions whose attention mask is 0 (padding) get the label
    -100 so that padding does not contribute to the loss.

    Args:
        examples: A mapping with a ``'text'`` entry holding either a list of
            strings (as passed by ``Dataset.map(..., batched=True)``) or a
            single string.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) extended
        with ``labels`` of the same shape.
    """
    texts = examples['text']
    encoded = tokenizer(texts, padding="max_length", truncation=True, max_length=512)

    def mask_padding(token_ids, attention_mask):
        # -100 is the label value the language-modeling loss skips.
        return [
            token_id if is_real else -100
            for token_id, is_real in zip(token_ids, attention_mask)
        ]

    if isinstance(texts, str):
        # Single example: the tokenizer returns flat lists.
        encoded["labels"] = mask_padding(encoded["input_ids"], encoded["attention_mask"])
    else:
        # Batch: the tokenizer returns one list per example.
        encoded["labels"] = [
            mask_padding(token_ids, attention_mask)
            for token_ids, attention_mask in zip(
                encoded["input_ids"], encoded["attention_mask"]
            )
        ]
    return encoded

# Convert the DataFrame to a Hugging Face dataset and tokenize it in batches.
dataset = Dataset.from_pandas(df)
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Use the first 300 rows for training and the next 30 for evaluation, but
# clamp both bounds to the number of rows actually present: selecting indices
# past the end of the dataset raises an error when the CSV is shorter than
# 330 rows.
num_rows = len(tokenized_dataset)
train_end = min(300, num_rows)
eval_end = min(330, num_rows)

train_dataset = tokenized_dataset.select(range(train_end))
# With no rows left over there is nothing to evaluate on; Trainer accepts None.
eval_dataset = (
    tokenized_dataset.select(range(train_end, eval_end))
    if eval_end > train_end
    else None
)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Fine-tune the model (optional, uncomment if you want to fine-tune)
# trainer.train()

# Save the model (optional, uncomment if you want to save)
# model.save_pretrained("fine_tuned_model")
# tokenizer.save_pretrained("fine_tuned_model")

def calculate_perplexity(text):
    """Return the perplexity of *text* under the module-level ``model``.

    The text is encoded with the module-level ``tokenizer`` and fed to the
    model with the input ids as labels; perplexity is ``exp`` of the returned
    language-modeling loss.

    Args:
        text: The string to score. Inputs longer than the tokenizer's maximum
            length are truncated to it.

    Returns:
        A zero-dimensional ``torch.Tensor`` (call ``.item()`` for a float),
        located on the model's device.

    Raises:
        ValueError: If *text* encodes to no tokens.
    """
    # Truncate so long inputs do not exceed the length the model supports.
    input_ids = tokenizer.encode(text, return_tensors='pt', truncation=True)
    if input_ids.numel() == 0:
        raise ValueError("calculate_perplexity requires text with at least one token")

    # The model may have been moved to an accelerator (e.g. when Trainer was
    # constructed); the inputs must be on the same device.
    input_ids = input_ids.to(model.device)

    # Score in evaluation mode so dropout cannot perturb the loss, then
    # restore whatever mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids, labels=input_ids)
    finally:
        model.train(was_training)

    return torch.exp(outputs.loss)

# --- Perplexity demo on a placeholder string ---
example_text = "Your sample input text here"
perplexity = calculate_perplexity(example_text)
print(f"Perplexity: {perplexity.item()}")

# --- ROUGE-1 / ROUGE-L demo on placeholder reference and generated text ---
reference = "Your reference impression"
generated = "Your generated impression"
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
scores = scorer.score(target=reference, prediction=generated)
print(f"ROUGE scores: {scores}")

# Train Word2Vec on the processed reports: one whitespace-split token list
# per report; min_count=1 keeps every word in the vocabulary.
sentences = [text.split() for text in df['processed_text']]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

word_vectors = word2vec_model.wv
words = list(word_vectors.index_to_key)

# Score every unordered pair of vocabulary words by cosine similarity.
# The keyed vectors' own similarity() is used instead of wrapping each pair
# of vectors in 2-D arrays for sklearn's cosine_similarity: it computes the
# same quantity without the per-call array handling, which matters because
# the number of pairs grows quadratically with the vocabulary size.
pairs = list(itertools.combinations(words, 2))
similarities = [
    (first_word, second_word, word_vectors.similarity(first_word, second_word))
    for first_word, second_word in pairs
]

# Keep the 100 most similar pairs (fewer if the vocabulary is tiny).
top_100_pairs = sorted(similarities, key=lambda x: x[2], reverse=True)[:100]

# Graph whose edges are the top word pairs, weighted by their similarity.
G = nx.Graph()
G.add_weighted_edges_from(top_100_pairs)

# Draw the graph with a spring layout and labelled nodes.
pos = nx.spring_layout(G)
plt.figure(figsize=(10, 10))
nx.draw(G, pos=pos, with_labels=True, node_size=50, font_size=10)
plt.show()

# Persist the same pairs as a CSV table (no index column).
pair_columns = ["Word1", "Word2", "Similarity"]
top_100_df = pd.DataFrame(top_100_pairs, columns=pair_columns)
top_100_df.to_csv("top_100_word_pairs.csv", index=False)




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Map:   0%|          | 0/330 [00:00<?, ? examples/s]

Perplexity: 1488.3973388671875
ROUGE scores: {'rouge1': Score(precision=0.6666666666666666, recall=0.6666666666666666, fmeasure=0.6666666666666666), 'rougeL': Score(precision=0.6666666666666666, recall=0.6666666666666666, fmeasure=0.6666666666666666)}
