In [149]:
import sys
# import re
import bz2

# regex = re.compile("[^a-zA-Z]")

import numpy as np
import pandas as pd


import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.feature_extraction.text import TfidfVectorizer

# Dataset of Australian legal cases and Wikipedia pages

You should download and look at the `Court-Wiki-Dataset.txt` file before you begin. You’ll see that the contents are a sort of pseudo-XML, where each text document
begins with a `<doc id=...>` tag and ends with `</doc>`.

Note that all of the Australian legal cases begin with something like `<doc id="AU1222" ...>`;
that is, the doc id for an Australian legal case always starts with AU. You will be trying to figure out whether a
document is an Australian legal case by looking only at the contents of the document.

In [150]:
# Load the bz2-compressed corpus into a DataFrame with columns 'text' and 'label'.
# The parsing below assumes one pseudo-XML document per line, shaped like
#   <doc id="..." url="..." ...>document text</doc>
DOC_CLOSE_TAG = "</doc>"

mlist = []
count = 0

# Text mode ("rt") yields decoded str lines. Calling str() on the raw bytes lines
# would instead produce the b'...' repr, leaving escape sequences (\n, \xNN, \')
# inside the document text. The context manager guarantees the file is closed.
# NOTE(review): assumes the file is UTF-8 encoded; undecodable bytes are replaced
# rather than raising -- confirm against the dataset.
with bz2.open("./datasets/Court-Wiki-Dataset.txt.bz2", "rt",
              encoding="utf-8", errors="replace") as file:
    for line in file:
        count += 1

        # The doc id is whatever sits between 'id="' and '" url='.
        doc_id = line[line.index('id="') + 4 : line.index('" url=')]

        # The document text starts right after the first '">' (end of the opening
        # tag). Drop the trailing newline and the closing tag when it is present.
        text = line[line.index('">') + 2 :].rstrip()
        if text.endswith(DOC_CLOSE_TAG):
            text = text[: -len(DOC_CLOSE_TAG)]

        # 'label' holds the raw doc id at this stage.
        mlist.append({'text': text, 'label': doc_id})

        # Progress report every 100 documents.
        if count % 100 == 0:
            print("Line number: ", count)

print("Line number: ", count)
data = pd.DataFrame.from_dict(mlist)
data

Line number:  100
Line number:  200
Line number:  300
Line number:  400
Line number:  500
Line number:  600
Line number:  700
Line number:  800
Line number:  800


Unnamed: 0,text,label
0,purported appeal from orders made by federal m...,AU11
1,bankruptcy noticefailure by creditor to attach...,AU28
2,where documents produced to commission of inqu...,AU29
3,leave to appealinterlocutory judgmentwhether s...,AU31
4,"applicant, a married person who had undergone ...",AU38
...,...,...
795,Le LanderonLe Landeron is a municipality in th...,6292963
796,"Saint-Blaise, SwitzerlandSaint-Blaise is a mun...",6293133
797,Tahirih Justice CenterThe Tahirih Justice Cent...,6220600
798,The Ladies of Grace Adieu and Other StoriesThe...,6221345


In [151]:
# Convert the document ids into binary labels:
#   1 -> Australian court case report (doc id starts with "AU")
#   0 -> Wikipedia article
# The raw ids are copied into 'doc_id' the first time this cell runs and the
# labels are always derived from that column, so re-running the cell is safe.
# (Deriving them from 'label' itself would turn every label into 0 on a second
# run, because the already-converted values 0/1 do not contain "AU".)
if 'doc_id' not in data.columns:
    data['doc_id'] = data['label']
data['label'] = data['doc_id'].apply(lambda x: 1 if str(x).startswith('AU') else 0)
data

Unnamed: 0,text,label
0,purported appeal from orders made by federal m...,1
1,bankruptcy noticefailure by creditor to attach...,1
2,where documents produced to commission of inqu...,1
3,leave to appealinterlocutory judgmentwhether s...,1
4,"applicant, a married person who had undergone ...",1
...,...,...
795,Le LanderonLe Landeron is a municipality in th...,0
796,"Saint-Blaise, SwitzerlandSaint-Blaise is a mun...",0
797,Tahirih Justice CenterThe Tahirih Justice Cent...,0
798,The Ladies of Grace Adieu and Other StoriesThe...,0


# Creating Feature Matrix using TF-IDF

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [152]:
# Build the TF-IDF feature matrix: one row per document, one column per term.
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# TfidfVectorizer is already imported in the notebook's import cell.

# Number of feature words kept in the vocabulary (passed as max_features).
MAX_FEATURES = 500

corpus = data['text'].to_numpy()
vectorizer = TfidfVectorizer(lowercase=True, max_features=MAX_FEATURES)

# fit_transform returns a sparse matrix; keep it under its own name so that X
# always refers to the dense array, even if this cell is re-run.
X_sparse = vectorizer.fit_transform(corpus)

# To inspect the selected vocabulary: vectorizer.get_feature_names_out()

print(type(X_sparse))
print(X_sparse.shape)

# Dense ndarray version; this is the X used by the train/test split below.
X = X_sparse.toarray()
print(type(X))


<class 'scipy.sparse._csr.csr_matrix'>
(800, 500)
<class 'numpy.ndarray'>


In [153]:
# Target vector: the 0/1 labels as a NumPy array, in the same row order as the
# documents in `data`.
y = data['label'].values
y.shape

(800,)

In [154]:
# Report how many documents carry the positive label (label == 1).
n_court_cases = int((data['label'] == 1).sum())
print("count of Australian Court Cases", n_court_cases)

count of Australian Court Cases 377


# Document Classification 



## Training and Test Data Split

In [155]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for testing. The fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

# 1. Logistic Regression

In [156]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit a logistic-regression classifier on the training split.
model = LogisticRegression(max_iter=500, fit_intercept=True)
model.fit(X_train, y_train)

# Predict once and derive both metrics from the same predictions; for a
# classifier, `model.score` is the accuracy of `model.predict` on the given data.
y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)

print("Accuracy of Classifier is:  ", score)
print(classification_report(y_test, y_pred))

Accuracy of Classifier is:   0.9875
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        81
           1       0.98      1.00      0.99        79

    accuracy                           0.99       160
   macro avg       0.99      0.99      0.99       160
weighted avg       0.99      0.99      0.99       160



# 2. Support Vector Machine - Linear SVM



In [157]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Support vector classifier with a linear kernel.
# NOTE(review): the recorded output for this cell shows recall of only 0.25 on
# class 1; the small C (strong regularisation) is a likely cause -- consider
# tuning it.
model = SVC(kernel="linear", C=0.025)
model.fit(X_train, y_train)

# Predict once and derive both metrics from the same predictions; for a
# classifier, `model.score` is the accuracy of `model.predict` on the given data.
y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)

print("Accuracy of Classifier is:  ", score)
print(classification_report(y_test, y_pred))

Accuracy of Classifier is:   0.63125
              precision    recall  f1-score   support

           0       0.58      1.00      0.73        81
           1       1.00      0.25      0.40        79

    accuracy                           0.63       160
   macro avg       0.79      0.63      0.57       160
weighted avg       0.79      0.63      0.57       160



# 3. Support Vector Machine -  SVM with Radial basis function kernel

In [164]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Support vector classifier with a radial basis function kernel. "rbf" is SVC's
# default kernel; it is spelled out here so the cell matches its heading.
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
model = SVC(kernel="rbf", gamma=2, C=1)
model.fit(X_train, y_train)

# Predict once and derive both metrics from the same predictions; for a
# classifier, `model.score` is the accuracy of `model.predict` on the given data.
y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)

print("Accuracy of Classifier is:  ", score)
print(classification_report(y_test, y_pred))

Accuracy of Classifier is:   0.99375
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        81
           1       1.00      0.99      0.99        79

    accuracy                           0.99       160
   macro avg       0.99      0.99      0.99       160
weighted avg       0.99      0.99      0.99       160



# 4. Decision Tree Classifier

https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html


In [165]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Decision Tree Classifier, limited to depth 5.
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
# random_state is fixed (same seed as the train/test split) because the tree
# permutes features at each split, so equally good splits can otherwise be
# resolved differently from run to run.
model = DecisionTreeClassifier(max_depth=5, random_state=123)
model.fit(X_train, y_train)

# Predict once and derive both metrics from the same predictions; for a
# classifier, `model.score` is the accuracy of `model.predict` on the given data.
y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)

print("Accuracy of Classifier is:  ", score)
print(classification_report(y_test, y_pred))


Accuracy of Classifier is:   0.99375
              precision    recall  f1-score   support

           0       1.00      0.99      0.99        81
           1       0.99      1.00      0.99        79

    accuracy                           0.99       160
   macro avg       0.99      0.99      0.99       160
weighted avg       0.99      0.99      0.99       160



# 5. Gaussian Naive Bayes

https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html

In [166]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Gaussian Naive Bayes classifier with default settings.
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
model = GaussianNB()
model.fit(X_train, y_train)

# Predict once and derive both metrics from the same predictions; for a
# classifier, `model.score` is the accuracy of `model.predict` on the given data.
y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)

print("Accuracy of Classifier is:  ", score)
print(classification_report(y_test, y_pred))

Accuracy of Classifier is:   1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        81
           1       1.00      1.00      1.00        79

    accuracy                           1.00       160
   macro avg       1.00      1.00      1.00       160
weighted avg       1.00      1.00      1.00       160



# 6. Random Forest Classifier
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [167]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Random Forest Classifier: 10 trees of depth at most 5, each split considering
# a single candidate feature (max_features=1).
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# random_state is fixed (same seed as the train/test split) so that the
# bootstrap samples and per-split feature draws, and hence the reported
# metrics, are reproducible on a fresh run.
model = RandomForestClassifier(
    max_depth=5,
    n_estimators=10,
    max_features=1,
    random_state=123,
)
model.fit(X_train, y_train)

# Predict once and derive both metrics from the same predictions; for a
# classifier, `model.score` is the accuracy of `model.predict` on the given data.
y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)

print("Accuracy of Classifier is:  ", score)
print(classification_report(y_test, y_pred))


Accuracy of Classifier is:   0.98125
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        81
           1       1.00      0.96      0.98        79

    accuracy                           0.98       160
   macro avg       0.98      0.98      0.98       160
weighted avg       0.98      0.98      0.98       160



# 7. AdaBoostClassifier
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html

In [168]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

# AdaBoostClassifier with default settings.
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
# random_state is fixed (same seed as the train/test split) so the seeds handed
# to the boosted base estimators, and hence the reported metrics, are
# reproducible on a fresh run.
model = AdaBoostClassifier(random_state=123)
model.fit(X_train, y_train)

# Predict once and derive both metrics from the same predictions; for a
# classifier, `model.score` is the accuracy of `model.predict` on the given data.
y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)

print("Accuracy of Classifier is:  ", score)
print(classification_report(y_test, y_pred))

Accuracy of Classifier is:   1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        81
           1       1.00      1.00      1.00        79

    accuracy                           1.00       160
   macro avg       1.00      1.00      1.00       160
weighted avg       1.00      1.00      1.00       160

