In [1]:
import pandas as pd
import os
import re
import fitz  # PyMuPDF
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

nltk.download('stopwords')
nltk.download('wordnet')

def extract_what_happened(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
       
    complaint_pattern = r'The complaint\s*((?:[\s\S](?!What happened))*.)'
    complaint_match = re.search(complaint_pattern, text)
    
    happened_pattern = r'What happened\s*([\s\S]*?)(?=What I’ve decided – and why|What I provisionally said|What I provisionally decided – and why)'
    happened_match = re.search(happened_pattern, text)
    
    complaint_text = complaint_match.group(1).strip() if complaint_match else ""
    happened_text = happened_match.group(1).strip() if happened_match else ""
    
    combined_text = complaint_text + " " + happened_text
    
    return combined_text

df = pd.read_csv('D:/dissertation/session 2/merged_output_claim_1.csv', encoding='ISO-8859-1')

base_path = 'D:/dissertation/session 2/decisions_half'
df['pdf_path'] = df['decision_id'].apply(lambda x: os.path.join(base_path, f"{x}.pdf"))

df['text'] = df['pdf_path'].apply(extract_what_happened)

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer() 

def preprocess_text(text):
    words = text.lower().split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(words)

df['processed_text'] = df['text'].apply(preprocess_text)

vectorizer = TfidfVectorizer(max_features=1000)
X_text = vectorizer.fit_transform(df['processed_text'])

encoder = OneHotEncoder()
X_categorical = encoder.fit_transform(df[['field', 'category']])

X = hstack([X_text, X_categorical])

y = df['decision'].str.strip().str.lower().apply(lambda x: 1 if 'upheld' == x else 0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train logistic regression models
model = LogisticRegression(max_iter=1000)  # Increase the number of iterations to ensure convergence
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

print(f"Overall Accuracy: {accuracy:.2f}")
print(f"Overall Precision: {precision:.2f}")
print(f"Overall Recall: {recall:.2f}")
print(f"Overall F1-Score: {f1:.2f}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Confusion Matrix:
[[270  66]
 [ 89 238]]
Overall Accuracy: 0.77
Overall Precision: 0.78
Overall Recall: 0.73
Overall F1-Score: 0.75
