In [1]:
import sys
import bz2

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

# Dataset of Australian Legal Court Cases and General Wikipedia Pages
You should download and look at the Court-Wiki-Dataset.txt file before you begin. You'll see that the contents are a sort of pseudo-XML, where each text document begins with a `<doc id="..." url="..." ...>` tag and ends with `</doc>`.

Note that all of the Australian legal cases begin with something like `<doc id="AU1222" ...>`; that is, the doc id for an Australian legal case always starts with AU. You will be trying to figure out whether a document is an Australian legal case by looking only at the contents of the document.

# Step-1: Read the data source. 

In [2]:
# Download the file from here 
# https://github.com/kiat/Elements-of-Data-Analytics/blob/main/datasets/Court-Wiki-Dataset.txt.bz2

# One record per document: {'text': <document body>, 'label': 0 or 1}.
mlist = []

# Stays 0 if the archive turns out to be empty, so the final print still works.
count = 0

# Use a context manager so the compressed file handle is closed even if
# parsing a line raises.
with bz2.open("./datasets/Court-Wiki-Dataset.txt.bz2", "r") as file:
    for count, line in enumerate(file, start=1):
        # The archive is opened in binary mode, so `line` is a bytes object and
        # str(line) is its repr (b'...'); all slicing below works on that string.
        raw = str(line)

        # The doc id sits between 'id="' and '" url=' in the opening tag.
        doc_id = raw[raw.index('id="') + 4 : raw.index('" url=')]

        # The body is everything after the first '">', minus the last six
        # characters of the string (trailing closing-tag residue).
        text = raw[raw.index('">') + 2:][:-6]

        # Label 0 when the id contains 'AU' (Australian legal case), else 1.
        mlist.append({'text': text, 'label': 0 if 'AU' in doc_id else 1})

        if count % 100 == 0:
            print("Line number: ", count)

# Total number of lines read.
print("Line number: ", count)

data = pd.DataFrame.from_dict(mlist)

data

Line number:  100
Line number:  200
Line number:  300
Line number:  400
Line number:  500
Line number:  600
Line number:  700
Line number:  800
Line number:  800


Unnamed: 0,text,label
0,purported appeal from orders made by federal m...,0
1,bankruptcy noticefailure by creditor to attach...,0
2,where documents produced to commission of inqu...,0
3,leave to appealinterlocutory judgmentwhether s...,0
4,"applicant, a married person who had undergone ...",0
...,...,...
795,Le LanderonLe Landeron is a municipality in th...,1
796,"Saint-Blaise, SwitzerlandSaint-Blaise is a mun...",1
797,Tahirih Justice CenterThe Tahirih Justice Cent...,1
798,The Ladies of Grace Adieu and Other StoriesThe...,1


In [3]:
# Class labels as a Series (0 = id contains 'AU', 1 = everything else).
labels = data.loc[:, 'label']

# Distinct label values and how many documents carry each of them.
unique_labels, category_sizes = np.unique(labels.to_numpy(), return_counts=True)
category_sizes

array([377, 423])

# Step-2: Vectorize the text dataset using Tfidf Vectorizer. 

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary pruning: skip terms found in more than 80% of the documents or in
# fewer than 20% of them, and remove English stop words.
tfidf_options = dict(max_df=0.8, min_df=0.20, stop_words="english")
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and build the document-term matrix in one pass.
documents = data['text'].tolist()
X_tfidf = vectorizer.fit_transform(documents)

print(f"n_samples: {X_tfidf.shape[0]}, n_features: {X_tfidf.shape[1]}")

n_samples: 800, n_features: 510


In [5]:
# To see what is inside it
# X_tfidf.toarray()[0]

# Step 3: Classification using Logistic Regression

In [6]:
# Hold out 20% of the documents for testing; the fixed random_state keeps the
# partition identical between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    labels,
    test_size=0.2,
    random_state=123,
)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline classifier trained on the full TF-IDF feature set.
model = LogisticRegression(fit_intercept=True)
model.fit(X_train, y_train)

# Accuracy on the held-out test split.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

0.9875


In [8]:
# Classification Report
from sklearn.metrics import classification_report

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99        79
           1       0.99      0.99      0.99        81

    accuracy                           0.99       160
   macro avg       0.99      0.99      0.99       160
weighted avg       0.99      0.99      0.99       160



# Step 4: Random Feature Selection
We can randomly drop a large share of the features and see only a small drop in classification accuracy.

# Dropping 10% of Features

In [21]:
def drop_columns(arr, drop_rate, seed=None):
    """Randomly remove a fraction of the columns of a 2-D array.

    Parameters
    ----------
    arr : numpy.ndarray
        Input array with at least two dimensions; columns are axis 1.
    drop_rate : float
        Fraction of columns to remove, between 0 and 1 inclusive. The number
        of dropped columns is ``int(num_columns * drop_rate)`` (rounded down).
    seed : int, optional
        When given, the columns to drop are chosen with a dedicated
        ``np.random.default_rng(seed)`` generator, so the same seed always
        selects the same columns. When omitted, NumPy's global random state
        is used, so ``np.random.seed(...)`` still controls the selection.

    Returns
    -------
    numpy.ndarray
        A new array without the selected columns; the remaining columns keep
        their original relative order. ``arr`` itself is not modified.

    Raises
    ------
    ValueError
        If ``drop_rate`` is outside the range [0, 1].
    """
    if not 0 <= drop_rate <= 1:
        raise ValueError(f"drop_rate must be between 0 and 1, got {drop_rate!r}")

    num_columns = arr.shape[1]
    # Number of columns to remove for the requested rate (truncated toward zero)
    num_drop = int(num_columns * drop_rate)

    # Pick distinct column indices to drop
    if seed is None:
        drop_idx = np.random.choice(num_columns, num_drop, replace=False)
    else:
        drop_idx = np.random.default_rng(seed).choice(num_columns, num_drop, replace=False)

    # Remove the selected columns (np.delete returns a new array)
    new_arr = np.delete(arr, drop_idx, axis=1)

    return new_arr

In [10]:
# Drop 10% of the data features and repeat the classification steps.
# Seed NumPy's global RNG first: drop_columns picks the columns at random, and
# without a fixed seed the printed accuracy changes from run to run.
np.random.seed(123)
X_new = drop_columns(X_tfidf.toarray(), 0.10)

X_train, X_test, y_train, y_test = train_test_split(X_new, labels, test_size=0.2, random_state=123)

# A new model trained with 10% fewer features
model = LogisticRegression( fit_intercept=True)
model.fit(X_train,y_train)


y_pred = model.predict(X_test)
print(accuracy_score(y_test,y_pred))
# Compare with the accuracy of the full-feature model printed earlier.

0.9875


In [11]:
# Drop 20% of the data features.
# Seed NumPy's global RNG first: drop_columns picks the columns at random, and
# without a fixed seed the printed accuracy changes from run to run.
np.random.seed(123)
X_new = drop_columns(X_tfidf.toarray(), 0.20)

X_train, X_test, y_train, y_test = train_test_split(X_new, labels, test_size=0.2, random_state=123)

# A new model trained with 20% fewer features
model = LogisticRegression( fit_intercept=True)
model.fit(X_train,y_train)


y_pred = model.predict(X_test)
print(accuracy_score(y_test,y_pred))
# Compare with the accuracy of the full-feature model printed earlier.

0.98125


In [12]:
# Drop 30% of the data features.
# Seed NumPy's global RNG first: drop_columns picks the columns at random, and
# without a fixed seed the printed accuracy changes from run to run.
np.random.seed(123)
X_new = drop_columns(X_tfidf.toarray(), 0.30)

X_train, X_test, y_train, y_test = train_test_split(X_new, labels, test_size=0.2, random_state=123)

# A new model trained with 30% fewer features
model = LogisticRegression( fit_intercept=True)
model.fit(X_train,y_train)


y_pred = model.predict(X_test)
print(accuracy_score(y_test,y_pred))
# Compare with the accuracy of the full-feature model printed earlier.

0.98125


In [13]:
# Drop 80% of the data features.
# Seed NumPy's global RNG first: drop_columns picks the columns at random, and
# without a fixed seed the printed accuracy changes from run to run.
np.random.seed(123)
X_new = drop_columns(X_tfidf.toarray(), 0.80)
# Shapes before and after the drop, to confirm how many features remain
print(X_tfidf.shape)
print(X_new.shape)
X_train, X_test, y_train, y_test = train_test_split(X_new, labels, test_size=0.2, random_state=123)

# A new model trained with 80% fewer features
model = LogisticRegression( fit_intercept=True)
model.fit(X_train,y_train)


y_pred = model.predict(X_test)
print(accuracy_score(y_test,y_pred))
# Compare with the accuracy of the full-feature model printed earlier.

(800, 510)
(800, 102)
0.96875


In [20]:
# Drop 92% of the data features.
# Seed NumPy's global RNG first: drop_columns picks the columns at random, and
# without a fixed seed the printed accuracy changes from run to run.
np.random.seed(123)
X_new = drop_columns(X_tfidf.toarray(), 0.92)
# Shapes before and after the drop, to confirm how many features remain
print(X_tfidf.shape)
print(X_new.shape)
X_train, X_test, y_train, y_test = train_test_split(X_new, labels, test_size=0.2, random_state=123)

# A new model trained with 92% fewer features
model = LogisticRegression( fit_intercept=True)
model.fit(X_train,y_train)


y_pred = model.predict(X_test)
print(accuracy_score(y_test,y_pred))
# Compare with the accuracy of the full-feature model printed earlier.

(800, 510)
(800, 41)
0.96875
