In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import joblib
import re
from lbl2vec import Lbl2Vec
from gensim.models import Word2Vec

# Download NLTK resources
nltk.download('wordnet')
nltk.download('stopwords')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Load the dataset
df = pd.read_csv('legal_texts.csv')

# Drop rows with missing values in 'case_text' or 'case_outcome'
df = df.dropna(subset=['case_text', 'case_outcome'])

# Function to preprocess text
def preprocess_text(text):
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words and len(word) > 2]
    return ' '.join(tokens)

# Apply the preprocessing function to the 'case_text' column
df['case_text'] = df['case_text'].apply(preprocess_text)

# Print the first few rows to ensure preprocessing is correct
print(df.head())

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 5), max_features=100000)

# Fit and transform the data
X_tfidf = tfidf_vectorizer.fit_transform(df['case_text'])

# Train Word2Vec model
documents = [text.split() for text in df['case_text']]
word2vec_model = Word2Vec(sentences=documents, vector_size=100, window=5, min_count=1, workers=4)

# Define labels (for demonstration, using case outcomes as labels)
labels = df['case_outcome'].unique().tolist()

# Create label descriptions (assuming each label has its own text description)
label_descriptions = {label: label for label in labels}

# Define keywords for each label (mock example)
keywords_list = [[label] for label in labels]

# Train Lbl2Vec model
lbl2vec_model = Lbl2Vec(keywords_list=keywords_list)
lbl2vec_model.fit(documents=documents, model=word2vec_model, epochs=10)

# Predict labels for documents
predicted_labels = lbl2vec_model.predict(documents)

# Add the predicted labels to the dataframe
df['predicted_label'] = predicted_labels

# Calculate silhouette score to evaluate the clustering
silhouette_avg = silhouette_score(X_tfidf, predicted_labels)
print(f'Silhouette Score: {silhouette_avg:.2f}')

# Save the model and vectorizer
joblib.dump(lbl2vec_model, 'lbl2vec_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

# Display the first few rows with the predicted labels
print(df.head())


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gigah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gigah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  case_id case_outcome                                         case_title  \
0   Case1        cited  Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...   
1   Case2        cited  Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...   
2   Case3        cited  Colgate Palmolive Co v Cussons Pty Ltd (1993) ...   
3   Case4        cited  Dais Studio Pty Ltd v Bullett Creative Pty Ltd...   
4   Case5        cited  Dr Martens Australia Pty Ltd v Figgins Holding...   

                                           case_text  
0  ordinarily discretion exercised cost follow ev...  
1  general principle governing exercise discretio...  
2  ordinarily discretion exercised cost follow ev...  
3  general principle governing exercise discretio...  
4  preceding general principle inform exercise di...  


ValueError: Either provide a pre-trained Doc2Vec model in the "doc2vec_model" paramater or provide tagged documents in the "tagged_documents" parameter to train a new Doc2Vec model. This is a logical XOR condition.