CONVERTING MENTAL HEALTH PDFs TO CSVs

In [24]:
import PyPDF2
import csv

def extract_text_from_pdf(pdf_file):
    text = ""
    with open(pdf_file, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        num_pages = len(reader.pages)
        for page_num in range(num_pages):
            text += reader.pages[page_num].extract_text()
    return text

def format_to_csv(text):
    lines = text.split("\n")
    csv_data = [["Question", "Answer"]]
    current_question = ""
    current_answer = ""
    for line in lines:
        line = line.strip()
        if not line:
            continue
        if line.endswith("?"):
            # New question
            if current_question and current_answer:
                csv_data.append([current_question, current_answer])
            current_question = line
            current_answer = ""
        else:
            # Append to answer
            current_answer += line + " "
    # Add the last question-answer pair
    if current_question and current_answer:
        csv_data.append([current_question, current_answer])
    return csv_data

import csv

def write_to_csv(csv_data, csv_file):
    with open(csv_file, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerows(csv_data)


def main():
    pdf_file = "mental_health_pdf.pdf"
    csv_file = "mental_health_csv.csv"
    text = extract_text_from_pdf(pdf_file)
    csv_data = format_to_csv(text)
    write_to_csv(csv_data, csv_file)
    print(f"CSV file '{csv_file}' has been created successfully.")

if __name__ == "__main__":
    main()


CSV file 'mental_health_csv.csv' has been created successfully.


In [25]:
import pdfplumber
import csv
import re

def extract_text_from_pdf(pdf_file):
    text = ""
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    return text

def extract_questions_and_answers(text):
    questions = []
    answers = []
    # Use regular expression to match question numbers and their respective answers
    matches = re.findall(r'(\d+\))\s*(.*?)\s*(\d+)\)', text, re.DOTALL)
    for match in matches:
        question_number = match[0]
        question = match[1]
        answer_number = match[2]
        answer = re.search(rf"{answer_number}\)(.*?)\d+\)", text, re.DOTALL).group(1)
        questions.append(question.strip())
        answers.append(answer.strip())
    return questions, answers

def write_to_csv(questions, answers, csv_file):
    with open(csv_file, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Question", "Answer"])
        writer.writerows(zip(questions, answers))

def main():
    pdf_file = "Mental_health_T_F.pdf"
    pdf_text = extract_text_from_pdf(pdf_file)
    questions, answers = extract_questions_and_answers(pdf_text)
    csv_file = "mental_health_T_F_csv.csv"
    write_to_csv(questions, answers, csv_file)
    print(f"CSV file '{csv_file}' has been created successfully.")

if __name__ == "__main__":
    main()


CSV file 'mental_health_T_F_csv.csv' has been created successfully.


In [26]:
import pdfplumber
import csv
import re

def extract_text_from_pdf(pdf_file):
    text = ""
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    return text

def extract_questions_and_answers(text):
    questions = []
    answers = []
    # Use regular expression to match questions ending with a question mark and their respective answers
    matches = re.findall(r'(\d+-.*?)(?:\s*Answer.*?)\s*(.*?)(?=(?:\d+-|$))', text, re.DOTALL)
    for match in matches:
        question = match[0].strip()
        answer = match[1].strip()
        questions.append(question)
        answers.append(answer)
    return questions, answers

def write_to_csv(questions, answers, csv_file):
    with open(csv_file, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Question", "Answer"])
        writer.writerows(zip(questions, answers))

def main():
    pdf_file = "Quiz.pdf"
    pdf_text = extract_text_from_pdf(pdf_file)
    questions, answers = extract_questions_and_answers(pdf_text)
    csv_file = "mental_health_quiz_csv.csv"
    write_to_csv(questions, answers, csv_file)
    print(f"CSV file '{csv_file}' has been created successfully.")

if __name__ == "__main__":
    main()


CSV file 'mental_health_quiz_csv.csv' has been created successfully.


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Load the merged CSV file
df = pd.read_csv("merged_csv.csv")

# Handle missing values by replacing NaNs with empty strings
df['Question'] = df['Question'].fillna('')
df['Answer'] = df['Answer'].fillna('')

# Preprocess the data
corpus = df['Question'].tolist()
answers = df['Answer'].tolist()

# Vectorize the corpus
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

def get_response(query):
    # Vectorize the query
    query_vec = vectorizer.transform([query])
    
    # Calculate cosine similarity between query vector and corpus vectors
    similarities = cosine_similarity(query_vec, X)
    
    # Get index of most similar question
    idx = similarities.argmax()
    
    # Return corresponding answer
    return answers[idx]

# Evaluation function
def evaluate_model():
    predicted_answers = [get_response(query) for query in corpus]
    # Calculate accuracy
    accuracy = sum(1 for pred, actual in zip(predicted_answers, answers) if pred == actual) / len(answers)
    # Calculate precision and recall (assuming binary classification)
    true_positives = sum(1 for pred, actual in zip(predicted_answers, answers) if pred == 'relevant' and actual == 'relevant')
    false_positives = sum(1 for pred, actual in zip(predicted_answers, answers) if pred == 'relevant' and actual == 'irrelevant')
    false_negatives = sum(1 for pred, actual in zip(predicted_answers, answers) if pred == 'irrelevant' and actual == 'relevant')
    
    # Add a small epsilon value to avoid division by zero
    epsilon = 1e-9
    
    # Calculate precision
    precision_denominator = true_positives + false_positives + epsilon
    precision = true_positives / precision_denominator
    
    # Calculate recall
    recall_denominator = true_positives + false_negatives + epsilon
    recall = true_positives / recall_denominator
    
    return accuracy, precision, recall

# Evaluate the model on the entire dataset
accuracy, precision, recall = evaluate_model()

print("Accuracy:", accuracy)


Accuracy: 0.40821917808219177


MERGING ALL FIVE CSVs INTO ONE DATASET CSV 

In [5]:
import csv

def merge_csv_files(csv_files, merged_csv_file):
    with open(merged_csv_file, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        for index, csv_file in enumerate(csv_files):
            with open(csv_file, 'r', newline='', encoding='utf-8') as infile:
                reader = csv.reader(infile)
                if index > 0:
                    # Skip the header from the second file onwards
                    next(reader)
                for row in reader:
                    writer.writerow(row)

def main():
    csv_files = ["mental_health_csv.csv", "mental_health_quiz_csv.csv", "mental_health_T_F_csv.csv","intents.csv","mentalhealth.csv","transformed_csv.csv"]  # Replace with your CSV file names
    merged_csv_file = "merged_csv.csv"  # Output merged CSV file name
    merge_csv_files(csv_files, merged_csv_file)
    print(f"Merged CSV file '{merged_csv_file}' has been created successfully.")

if __name__ == "__main__":
    main()


Merged CSV file 'merged_csv.csv' has been created successfully.


ROUGH WORK

To check models report its avg accuracy and stuff

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import pandas as pd

# Load the merged CSV file
df = pd.read_csv("merged_csv.csv")

# Handle missing values by replacing NaNs with empty strings
df['Question'] = df['Question'].fillna('')
df['Answer'] = df['Answer'].fillna('')

# Preprocess the data
corpus = df['Question'].tolist()
answers = df['Answer'].tolist()

# Vectorize the corpus
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Evaluation function
def evaluate_model():
    predicted_answers = [get_response(query) for query in corpus]
    return classification_report(answers, predicted_answers)

# Evaluate the model on the entire dataset
report = evaluate_model()

print(report)


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [30]:
query = "What are mental health problems?"
response = get_response(query)
print("Bot:", response)

Bot: In many ways, mental health is just like physical health: everybody has it and we need to take care of it. Good mental health means being generally able to think, feel and react in the ways that you need and want to live your life. But if you go through a period of poor mental health you might find the ways you're frequently thinking, feeling or reacting become diff icult, or even impossible, to cope with. This can feel just as bad as a p hysical illness, or even worse. Mental health problems affect around one in four people in any given year. They range from common problems, such as  depression  and anxiet y, to rarer problems such as schizophrenia  and bipolar disorder . “I now know that if I felt there was something wrong, it's because there was, but I didn't understand mental health fully... it's a spectrum and you should feel able to decide where and whe n you [are] on that spectrum. ” 


In [23]:
import PyPDF2
import csv

def extract_text_from_pdf(pdf_file):
    text = ""
    with open(pdf_file, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text += page.extract_text()
    return text

def pdf_to_csv(pdf_file, csv_file):
    text = extract_text_from_pdf(pdf_file)
    lines = text.split('\n')
    questions = []
    answers = []
    current_question = ""
    for line in lines:
        if line.strip().endswith("?"):
            if current_question:
                questions.append(current_question.strip())
                current_question = ""
            current_question = line.strip()
        elif line.startswith("Answer"):
            answers.append(line[7:].strip())
    
    #print("Extracted Questions:")
   # for q in questions:
    #    print(q)
    
   # print("\nExtracted Answers:")
   # for a in answers:
   #     print(a)

    with open(csv_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Question", "Answer"])
        for q, a in zip(questions, answers):
            writer.writerow([q, a])

# Usage
pdf_file = "intents.pdf"  # Replace with your PDF file path
csv_file = "intents.csv"  # Output CSV file name
pdf_to_csv(pdf_file, csv_file)
print("CSV CREATED!!!")


CSV CREATED!!!


In [33]:
import csv
from PyPDF2 import PdfReader

def extract_text_from_pdf(pdf_file):
    text = ""
    with open(pdf_file, 'rb') as file:
        reader = PdfReader(file)
        for page in reader.pages:
            text += page.extract_text()
    return text

def pdf_to_csv(pdf_file, csv_file):
    text = extract_text_from_pdf(pdf_file)
    lines = text.split('\n')
    questions = []
    answers = []
    current_question = ""
    for line in lines:
        if line.strip().endswith("?"):
            if current_question:
                questions.append(current_question.strip())
                current_question = ""
            current_question = line.strip()
        elif line.startswith("Answer"):
            answers.append(line[7:].strip())

    with open(csv_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Question", "Answer"])
        for q, a in zip(questions, answers):
            writer.writerow([q, a])

# Usage
pdf_file = "intents.pdf"  # Replace with your PDF file path
csv_file = "intents.csv"  # Output CSV file name
pdf_to_csv(pdf_file, csv_file)
print("CSV CREATED!!!")


CSV CREATED!!!
