In [1]:
import sys
import bz2

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

# Dataset of Australian Legal Court Cases and General Wikipedia Pages
You should download and look at the Court-Wiki-Dataset.txt

file before you begin. You’ll see that the contents are sort of a pseudo-XML, where each text document begins with a 
 tag, and ends with 
.

Note that all of the Australia legal cases begin with something like 
 that is, the doc id for an Australian legal case always starts with AU. You will be trying to figure out if the document is an Australian legal case by looking only at the contents of the document.

# Step-1: Read the data source. 

In [2]:
# Download the file from here 
# https://github.com/kiat/Elements-of-Data-Analytics/blob/main/datasets/Court-Wiki-Dataset.txt.bz2

file = bz2.open("./datasets/Court-Wiki-Dataset.txt.bz2", "r")

mlist = []


count = 0 
for line in file:
    count +=1

    line = (lambda x : (x[x.index('id="') + 4 : x.index('" url=')], x[x.index('">') + 2:][:-6]))(str(line))
    mlist.append({'text' : line[1], 'label' : (lambda x: 0 if 'AU' in x else 1 )(line[0])})

    if(count%100==0):
        print("Line number: " , count)


# text_list
print("Line number: " , count)

data = pd.DataFrame.from_dict(mlist)

data 

Line number:  100
Line number:  200
Line number:  300
Line number:  400
Line number:  500
Line number:  600
Line number:  700
Line number:  800
Line number:  800


Unnamed: 0,text,label
0,purported appeal from orders made by federal m...,0
1,bankruptcy noticefailure by creditor to attach...,0
2,where documents produced to commission of inqu...,0
3,leave to appealinterlocutory judgmentwhether s...,0
4,"applicant, a married person who had undergone ...",0
...,...,...
795,Le LanderonLe Landeron is a municipality in th...,1
796,"Saint-Blaise, SwitzerlandSaint-Blaise is a mun...",1
797,Tahirih Justice CenterThe Tahirih Justice Cent...,1
798,The Ladies of Grace Adieu and Other StoriesThe...,1


In [3]:
labels = data['label']

unique_labels, category_sizes = np.unique(labels, return_counts=True)
category_sizes

array([377, 423])

# Step-2: Vectorize the text dataset using Tfidf Vectorizer. 

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_df = 0.8,
    min_df = 0.20,
    stop_words="english",
)

X_tfidf = vectorizer.fit_transform(list(data['text']))

print(f"n_samples: {X_tfidf.shape[0]}, n_features: {X_tfidf.shape[1]}")

n_samples: 800, n_features: 510


In [30]:
# To see what is inside it
# X_tfidf.toarray()[0]

# Step 3: Classification using Logistic Regression

In [12]:
# Create Test/Train
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, labels, test_size=0.2, random_state=123)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression( fit_intercept=True)
model.fit(X_train,y_train)


y_pred = model.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.9875


In [14]:
# Classification Report
from sklearn.metrics import classification_report

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99        79
           1       0.99      0.99      0.99        81

    accuracy                           0.99       160
   macro avg       0.99      0.99      0.99       160
weighted avg       0.99      0.99      0.99       160



# Step 4: Classification using kNN


In [28]:

from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=9)

model.fit(X_train,y_train)


y_pred = model.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.9875
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        79
           1       1.00      0.98      0.99        81

    accuracy                           0.99       160
   macro avg       0.99      0.99      0.99       160
weighted avg       0.99      0.99      0.99       160



# Step 5: Classification using GaussianNB


In [35]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model.fit(X_train.toarray(),y_train)


y_pred = model.predict(X_test.toarray())
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        79
           1       1.00      1.00      1.00        81

    accuracy                           1.00       160
   macro avg       1.00      1.00      1.00       160
weighted avg       1.00      1.00      1.00       160

