In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
import joblib


# Download NLTK stopwords
nltk.download('stopwords')

# Load the dataset
df = pd.read_csv('legal_texts.csv')

# Check for any missing values
print(df.isnull().sum())

# Drop rows with missing values in 'case_text' or 'case_outcome'
df = df.dropna(subset=['case_text', 'case_outcome'])

# Define the stopwords
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    tokens = [word for word in text.split() if word.lower() not in stop_words]
    return ' '.join(tokens)

# Apply the preprocessing function to the 'case_text' column
df['case_text'] = df['case_text'].apply(preprocess_text)

# Print the first few rows to ensure preprocessing is correct
print(df.head())

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['case_text'], df['case_outcome'], test_size=0.2, random_state=42)

# Print the shape of the train and test sets
print(f'Training set shape: {X_train.shape}, {y_train.shape}')
print(f'Test set shape: {X_test.shape}, {y_test.shape}')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gigah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


case_id           0
case_outcome      0
case_title        0
case_text       176
dtype: int64
  case_id case_outcome                                         case_title  \
0   Case1        cited  Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...   
1   Case2        cited  Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...   
2   Case3        cited  Colgate Palmolive Co v Cussons Pty Ltd (1993) ...   
3   Case4        cited  Dais Studio Pty Ltd v Bullett Creative Pty Ltd...   
4   Case5        cited  Dr Martens Australia Pty Ltd v Figgins Holding...   

                                           case_text  
0  Ordinarily discretion exercised costs follow e...  
1  general principles governing exercise discreti...  
2  Ordinarily discretion exercised costs follow e...  
3  general principles governing exercise discreti...  
4  preceding general principles inform exercise d...  
Training set shape: (19847,), (19847,)
Test set shape: (4962,), (4962,)


In [11]:
# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the training data, transform the test data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize the Naive Bayes classifier
nb_classifier = MultinomialNB()

# Train the classifier
nb_classifier.fit(X_train_tfidf, y_train)

# Predict the outcomes for the test data
y_pred = nb_classifier.predict(X_test_tfidf)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print the classification report
print(classification_report(y_test, y_pred))


Accuracy: 49.74%
               precision    recall  f1-score   support

     affirmed       0.00      0.00      0.00        23
      applied       0.25      0.01      0.01       496
     approved       0.00      0.00      0.00        21
        cited       0.50      0.99      0.66      2440
   considered       0.25      0.01      0.02       353
    discussed       0.33      0.00      0.01       206
distinguished       0.00      0.00      0.00       110
     followed       0.00      0.00      0.00       437
  referred to       0.67      0.05      0.09       855
      related       0.00      0.00      0.00        21

     accuracy                           0.50      4962
    macro avg       0.20      0.11      0.08      4962
 weighted avg       0.42      0.50      0.34      4962



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# Assuming nb_classifier and tfidf_vectorizer are your trained model and vectorizer

# Save the Naive Bayes model
joblib.dump(nb_classifier, 'nb_classifier_model.pkl')

# Save the TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']