In [1]:
#import
import nltk , re ,  os ,  sys , pymongo , json , string , numpy as np , pandas as pd , matplotlib.pyplot as plt
############
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from io import StringIO
############
from sklearn.cross_validation import train_test_split #splitting dataset
from sklearn.preprocessing import StandardScaler #feature scaling
from sklearn.linear_model import LogisticRegression #classifer
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV #feature selection
from sklearn.metrics import confusion_matrix #confusion matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
############
from nltk.stem.snowball import SnowballStemmer #for stemming purpose
############
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer #Tfidf
############
#nltk.download('stopwords')



In [2]:
######read input_data path

path = '/home/computer/Desktop/txt_mining_project/final_project_files/Final_Input_Data'
files = [path+'/'+x for x in os.listdir(path) if x.endswith('.html') ] 

In [3]:
#remove all data in the database

mng_client = pymongo.MongoClient('localhost', 27017)
mng_db = mng_client['document'] 
collection_name = 'dataset'
db_cm = mng_db[collection_name]
db_cm.remove()

  import sys


{'n': 671, 'ok': 1}

In [4]:
######read one document and storage preparation function (html to csv)

def doc_prep (url):
    #read file as xml
    html = open(url, encoding='latin-1')
    soup = BeautifulSoup(html,"lxml").get_text()
    #########
    #file format processing
    
    #eliminate spaces in start & end of the file
    soup=soup.lstrip('\n').rstrip('\n')
    #split file into paragraphs
    para= soup.split(sep='\n\n')
    # eliminate newlines(\n) from paragraphs
    para_tmp=[]
    for s in para:
        para_tmp.append(s.replace('\n',' '))
    para=para_tmp
    #remove pages' numbers
    para_tmp = []
    for line in para:
        if len(line.lstrip(' ').rstrip(' ')) > 4:
            para_tmp.append(line)
    para = para_tmp
    #########
    #creating pandas dataframe
    
    #adding \t\t in the beginning of non-header paragraphs(training purpose)
    para_compressed=[' ']
    for line in para:
        if re.search('^.*[\\t]+.*$', line):
            para_compressed.append(line)
        else:
            para_compressed.append('\t\t' + line)
    #concatenate the paragraphs into one string
    data='\n'.join(para_compressed)
    #########
    #Parsing data using pandas dataframe
    TESTDATA = StringIO(data)
    parser=pd.read_csv(TESTDATA,sep='\t',names=['is_header','section_type','section'])
    #######dataframe preprocessing
    #treating missing data
    parser['is_header'] = parser['is_header'].fillna(False)
    parser['section_type'] = parser['section_type'].fillna('None')
    parser.drop(parser[parser['section_type'].map(len) > 35].index, inplace=True)
    parser=parser.dropna()
    #solving type problems(in case)
    parser.is_header=parser.is_header.astype(bool)
    parser.section=parser.section.astype(str)
    parser.section_type=parser.section_type.astype(str)
    return parser


In [5]:
######insert documents in database function

def store_content(csv_file):
    mng_client = pymongo.MongoClient('localhost', 27017)
    mng_db = mng_client['document'] 
    collection_name = 'dataset'
    db_cm = mng_db[collection_name]
    data_json = json.loads(csv_file.to_json(orient='records'))
    #db_cm.remove()
    db_cm.insert(data_json)
    return

In [6]:
######retreive documents from database function

def read_content():
    """ Read from Mongo and Store into DataFrame """
    mng_client = pymongo.MongoClient('localhost', 27017)
    mng_db = mng_client['document'] 
    collection_name = 'dataset'
    db_cm = mng_db[collection_name]
    cursor = db_cm.find()
    df =  pd.DataFrame(list(cursor))
    # Delete the _id
    try:
        del df['_id']
    except:
        print('This is an error message!')
    return df

In [7]:
for url in files:
    parser = doc_prep(url)
    store_content(parser) 
parser = read_content()
NbSections = len(parser)
#parser

  # Remove the CWD from sys.path while we load stuff.


In [8]:
####### part_1_processing: ####### train a classifier to decide whether a section is header or not

In [9]:
#######feature preparation

#######define feature functions
def startWithArabic (Instr):
    """
    this function return true if the given string starts with am Arabic numeral
    """
    return Instr[:1].isdigit()
########################################
def startWithRoman (Instr):
    """
    this function return true if the given string starts with Simple Roman numeral followed by period or ' '
    """
    return re.match('^(?=[MDCLXVI])M*(C[MD]|D?C*)(X[CL]|L?X*)(I[XV]|V?I*)[ |.]',Instr[0:3]) is not None

#######################################
def leadingWhiteSpace(Instr):
    """
    this function return true if the given string starts with WhiteSpace character
    """
    return re.match('^\s',Instr) is not None

########################################
def ellipses(Instr):
    """
    this function return true if the given string contains ellipses
    """
    return re.search(r'(\w+)\.{3,}',Instr) is not None 
#######################################
def ContainsComma (Instr):
    return ',' in Instr
######################################
def common_words_count (intstr) :

    CommonWordsList = stopwords.words('english') ;
    CourtWordList = ['complaint' , 'commission' , 'defendant' , 'appeal' , 'case' , 'action' , 'accused' , 'appellant' , 'crime' , 'answer' , 'brief' , ' claim' , ' collateral' , ' complaint' , ' contract' , ' counsel' , 'count' ,
                    'defendants' , ' evidence' , 'federal question' , 'issue' , 'guilty' , ' judge' , 'conviction' , 'precedent'  , 'procedure' , 'jury' , 'sentence' , 'statute' , 'witness' ,
                    'however' , 'federal' , 'law' , 'claim' , 'party' , 'petition' , 'record' , 'statute' , 'judgment' , 'objection' , 'part' , 'party' , 'parties' , 'witness' , 'witnesses' ,
                    'judge' , 'fact' , 'state' , 'sentence' , 'district' , 'trial']
    CommonWordsList += CourtWordList

    var = intstr
    return sum(1 for word in var.split() if word.lower() in CommonWordsList)
#######################################
def stop_words_count(inputstr):
    """
    this function return the number of stopwords in a String
    """
    return sum(1 for word in inputstr.split() if word.lower() in stopwords.words('english') )
######################################
def num_Ponctuation(Instr):
    """
    this function return the number of ponctuation characters in a String
    """
    nonpuc=[c for c in Instr if c not in string.punctuation]
    nonpuc=''.join(nonpuc)
    if '.' in nonpuc:
        nonpuc = nonpuc + '.' # this line just to ignore the period for one time
    return len(Instr)-len(nonpuc)
#####################################
def special_begin(string) :
    """
    this function looks for special begin
    """
    return re.match('^\s*[0-9IVa-fiv]{1,3}[)|.]\s*.+',string) is not None

In [10]:
#######applying feature functions to dataframe

def feature_application(parser):
    
    #punctuation count
    parser.loc[:,'num_Ponctuation'] = parser['section'].apply(lambda st: num_Ponctuation(st)) 

    #stop_words_count
    parser.loc[:,'common_words_count'] = parser['section'].apply(lambda st: common_words_count(st)) 

    # LeadingAsterisk
    #parser.loc[:,'LeadingAsterisk'] = parser['section'].apply(str.startswith, args='*') 

        # # leading arabic numeral
    ArabicNumeral= parser['section'].str.lstrip(' ').apply(lambda st: startWithArabic(st))
    # # leading Roman numeral
    RomanNumerals= parser['section'].str.lstrip(' ').apply(lambda st: startWithRoman(st))
    # leadingNumeral 
    #parser.loc[:,'LeadingNumeral'] = ArabicNumeral | RomanNumerals
    
    # Special start
    parser.loc[:,'special_begin'] = parser['section'].apply(lambda st: special_begin(st))

    # endsInPeriod
    #parser.loc[:,'endsInPeriod'] = parser['section'].apply(str.endswith, args='.') 

    # leadingWhiteSpace
    #parser['leadingWhiteSpace'] = parser['section'].apply(lambda st: leadingWhiteSpace(st))

    # ellipses
    #parser.loc[:,'ellipses'] = parser['section'].apply(lambda st: ellipses(st))

    #ContainsComma:
    parser.loc[:,'ContainsComma'] = parser['section'].apply(lambda st: ContainsComma(st)) 

    #remove spaces in begening and end of sections to get the real section length
    parser.loc[:,'section'] = parser['section'].map(lambda x: x.lstrip(' ').rstrip(' ')) 
    #section length
    parser.loc[:,'stringLength'] = parser['section'].apply(len)

    # percentCaps (presentation precision = 2)
    parser.loc[:,'percentCaps'] = parser['section'].apply(lambda st: round(sum(1 for c in st if c.isupper())*100/len(st),2))

    #remove remaining page's numbers
    #parser = parser.drop(parser[parser['stringLength']<4].index)
    #reset index of dataframe
    parser = parser.reset_index(drop=True)
    
    return parser

parser = feature_application(parser)

In [11]:
###### machine_learning_processing

def non_shuffling_train_test_split(X, y, test_size=0.25):
    i = int((1 - test_size) * X.shape[0]) + 1
    X_train, X_test = np.split(X, [i])
    y_train, y_test = np.split(y, [i])
    return X_train, X_test, y_train, y_test


#read dependant & independants variables
X1=parser.iloc[:,3:len(parser.columns)].values #[4,5,7,8,9]
Y1=parser.iloc[:,0].values
#splitting data set to training set and test set
X1_train, X1_test, Y1_train, Y1_test= non_shuffling_train_test_split(X1, Y1, test_size=0.25) 
   
#feature scaling
sc_X= StandardScaler()
X1_train=sc_X.fit_transform(X1_train)
X11_test = X1_test
X1_test=sc_X.transform(X1_test)
#feature selection using recursive feature elimination & training classifer
classifier1 = RFECV(SVC(kernel = 'linear',random_state=0),scoring='accuracy')
#classifier1 = RFECV(LogisticRegression(random_state=0),scoring='accuracy')
classifier1.fit(X1_train, Y1_train)
#predict the test set result
Y1_pred=classifier1.predict(X1_test)

#to be used in part 2
tested_data, result_part1 = Y1_test, Y1_pred



In [12]:
####### performance of part 1

#confusion Matrix
cm=confusion_matrix(tested_data, result_part1)
print('confusion_matrix:\n' ,cm)
#accuracy
print('accuracy = ',accuracy_score(tested_data, result_part1))
#recall
print('recall = ',recall_score(tested_data, result_part1))
#precision
print('presicion = ',precision_score(tested_data, result_part1))


confusion_matrix:
 [[116   0]
 [  1  50]]
accuracy =  0.9940119760479041
recall =  0.9803921568627451
presicion =  1.0


In [13]:
#######analysing error cause

diff_true=np.logical_xor(tested_data, result_part1)
#count how many true in diff_true
np.sum(diff_true)
#identifying false results
true_indices = list(np.argwhere(diff_true == True).flatten())
true_indices[:] = [x + len(parser)-len(tested_data) for x in true_indices]
#show false results
parser.loc[true_indices,:]

##-->the cause of this error is no obvious

Unnamed: 0,is_header,section,section_type,num_Ponctuation,common_words_count,special_begin,ContainsComma,stringLength,percentCaps
586,True,"2. Written case Counseling on January 15, 2003",STATEMENT OF THE CASE,2,2,True,True,46,6.52


In [14]:
####### part_2_processing: ####### train a classifier to decide the type of headers
#-->this part will be devided into 2 subparts:
"""
subpart 1 : train the classifier using all the heading section present in the data 
            Goal: analyse performance of classification independantely from part 1
"""
"""
subpart 2 : use the trained classifier with the heading section resulting from part 1 
            Goal: analyse influence of part 1 (error propagation) on the performance of part 2
"""

'\nsubpart 2 : use the trained classifier with the heading section resulting from part 1 \n            Goal: analyse influence of part 1 (error propagation) on the performance of part 2\n'

In [15]:
######subpart 1:

###extracting headers from initial data
Aheader = parser[['section_type','section']]
Aheader = Aheader.drop(Aheader[Aheader['section_type'] == 'None'].index)

###Features extraction:remove ponctuation -->remove stopwords -->generate stemming -->extract features

#remove ponctuation
Aheader.loc[:,'Non_Ponc'] = Aheader['section'].apply(lambda st: re.sub(r'[^a-zA-Z ]', ' ',st))

#remove stopwords
def clean (st):
    """
    this function returns String without stopwords or numerals
    """
    clean_st = [word for word in st.split() if (word.lower() not in stopwords.words('english')) & (len(word)>3)]
    return clean_st 

Aheader.loc[:,'clean_section'] = Aheader['Non_Ponc'].apply(lambda st:clean(st))

#generate stemming 
stemmer = SnowballStemmer('english')
Aheader.loc[:,'stemm'] = Aheader['clean_section'].apply(lambda st: [stemmer.stem(word) for word in st])

#sorting dataframe
Aheader.sort_values(by='section_type').reset_index(drop = True) #-->this is not necessary

#extract features(contains steps)
#gathering stemms for each header type in one list
def aggreg (row):
    """
    this function returns a list of concatenated stemms
    """
    flist = []
    for ls in row['stemm'] :
        flist = flist + ls
    return flist

conca = (Aheader.groupby('section_type').apply(lambda lis:aggreg(lis))).to_frame('vocab') #.reset_index()

#sort elements in the vocab listes
conca.loc[:,'vocab']=conca['vocab'].apply(lambda st:sorted(st))

#eliminate stemms that are not useful for classification(appears just ones)
def  useful_words(ls):
    """
    this function takes a list of strings and return a list with strings used more than ones
    """
    bow_transformer = CountVectorizer().fit(ls)
    csr_matrix = bow_transformer.transform([' '.join(ls)])
    tfidf_transfrom = TfidfTransformer().fit_transform(csr_matrix)
    """eliminate"""
    tmp_list = [] # tmp_list contains the elements to be eliminated
    Mc=tfidf_transfrom.tocoo()
    for i in Mc.col:
        if Mc.data[Mc.col ==i][0] == Mc.data.min() and Mc.data.min()!= Mc.data.max():
            tmp_list.append(bow_transformer.get_feature_names()[i])
    ls = list(set(ls)-set(tmp_list))
    return ls

conca.loc[:,'vocab']=conca['vocab'].apply(lambda st: useful_words(st))

# generate list of features 
flist = []
for i in conca.index.values: # list of categories
    flist = flist + conca.vocab[i]

##generate features/expected-classification-result dataframe
featuresdf = Aheader[['section_type','section']]
# insert features in dataframe
for i in flist:
    featuresdf.loc[:,i] = Aheader['section'].apply(lambda st:i in ' '.join([stemmer.stem(i) for i in st.split(' ')]))  # st.lower()
# insert expected results in dataframe
for i in conca.index.values:
    featuresdf.insert(0, i,  Aheader['section_type'].apply(lambda st:st.lower() == i.lower()))
#reset index of dataframe
featuresdf.reset_index(drop=True)    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,TESTIMONY,STATEMENT OF THE CASE,STATEMENT OF FACTS,STATE,STANDARD OF REVIEW,REMOVAL,PROCEDURE,PRELIMINARY STATEMENT,PREEMPTION,NOTICE,...,action,counsel,assist,fact,case,chen,famili,polici,plan,father
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,True,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
5,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,False,False,True,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [16]:
###prediction

#initialise the result dataframe
class_result= featuresdf[['section_type','section']].reset_index(drop = True)
class_result = class_result.iloc[int(len(featuresdf)*0.66):,:]
class_result.loc[:,'estimated_type']='None'

#make estimations
for i in range(0, len(conca)):
    #splitting data set to training_set and test_set
    X21_train = featuresdf.iloc[:int(len(featuresdf)*0.66),len(conca)+2:].values
    Y21_train = featuresdf.iloc[:int(len(featuresdf)*0.66),i].values
    ##
    X21_test = featuresdf.iloc[int(len(featuresdf)*0.66):,len(conca)+2:].values
    Y21_test = featuresdf.iloc[int(len(featuresdf)*0.66):,i].values
    
    
    #feature scaling
    sc_X= StandardScaler()
    X21_train=sc_X.fit_transform(X21_train)
    X21_test=sc_X.fit_transform(X21_test)
    #feature selection & classifier training
    #classifier21 = RFECV(SVC(kernel = 'linear',random_state=0),scoring='accuracy')
    classifier21 = RFECV(LogisticRegression(random_state=0),scoring='accuracy')
    
    classifier21.fit(X21_train, Y21_train)
    #predict the test set result
    Y_pred21=classifier21.predict(X21_test)
   
    #store perdiction for class i in the result dataframe
    Y_true = class_result.index[Y_pred21==True]
    for tr_ind in Y_true:
        if class_result.loc[tr_ind,'estimated_type']=='None':
            class_result.loc[tr_ind,'estimated_type'] =featuresdf.columns[i]
            
    print('treatment of type\'',featuresdf.columns[i],'\' :done ')

treatment of type' TESTIMONY ' :done 




treatment of type' STATEMENT OF THE CASE ' :done 
treatment of type' STATEMENT OF FACTS ' :done 




treatment of type' STATE ' :done 
treatment of type' STANDARD OF REVIEW ' :done 




treatment of type' REMOVAL ' :done 




treatment of type' PROCEDURE ' :done 
treatment of type' PRELIMINARY STATEMENT ' :done 




treatment of type' PREEMPTION ' :done 
treatment of type' NOTICE ' :done 




treatment of type' MOTION ' :done 
treatment of type' LEGAL FRAMEWORK ' :done 
treatment of type' JURISDICTIONAL STATEMENT ' :done 




treatment of type' INTRODUCTION ' :done 
treatment of type' DISCUSSION ' :done 




treatment of type' CONCLUSION ' :done 
treatment of type' COMPLAINT ' :done 




treatment of type' CLAIM ' :done 
treatment of type' ARGUMENT ' :done 




treatment of type' APPLICATION ' :done 
treatment of type' ANALYSIS ' :done 
treatment of type' AMENDMENT ' :done 




treatment of type' ACT ' :done 




In [17]:
class_result

Unnamed: 0,section_type,section,estimated_type
132,STATEMENT OF FACTS,II. STATEMENT OF FACTS,STATEMENT OF FACTS
133,STATEMENT OF THE CASE,D. Sufficiency of the Evidence in the case,STATEMENT OF THE CASE
134,AMENDMENT,A. First Amendment,AMENDMENT
135,AMENDMENT,B. Fourth Amendment,AMENDMENT
136,ANALYSIS,V. ANALYSIS,ANALYSIS
137,AMENDMENT,C. Fifth Amendment,AMENDMENT
138,STANDARD OF REVIEW,C) STANDARDS OF REVIEW,STANDARD OF REVIEW
139,CONCLUSION,III. Conclusion,CONCLUSION
140,INTRODUCTION,A) INTRODUCTION,INTRODUCTION
141,DISCUSSION,Discussion,DISCUSSION


In [18]:
###performance of subpart 1 (part 2)

#accuracy
print('accuracy = ',accuracy_score(class_result.section_type , class_result.estimated_type))

accuracy =  0.8676470588235294


In [19]:
######subpart 2:

###extracting headers from first part result
true_indices = list(np.argwhere(result_part1 == True).flatten())
true_indices[:] = [x+len(parser)-len(tested_data) for x in true_indices]
Pheaders = parser.iloc[true_indices, 1:3] 
Pheaders = Pheaders.reset_index(drop=True)
#len(Pheaders[Pheaders['section_type']=='None'].index) --> to verify number of wrong result
#Pheaders['section_type'].replace('None', 'wrongly_estimated',inplace=True)

#list of classes
concap = Pheaders.section_type.unique()

##generate features/expected-classification-result dataframe
featdf = Pheaders[['section_type','section']]
# insert features in dataframe
for i in flist:
    featdf.loc[:,i] = Pheaders['section'].apply(lambda st:i in ' '.join([stemmer.stem(i) for i in st.split(' ')]))  # st.lower()
# insert expected results in dataframe
for i in concap:
    featdf.insert(0, i,  Pheaders['section_type'].apply(lambda st:st.lower() == i.lower()))
#reset index of dataframe
featdf.reset_index(drop=True) 
featdf

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,TESTIMONY,STATE,COMPLAINT,REMOVAL,STANDARD OF REVIEW,ARGUMENT,CLAIM,CONCLUSION,INTRODUCTION,STATEMENT OF FACTS,...,action,counsel,assist,fact,case,chen,famili,polici,plan,father
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,True,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False


In [20]:
###prediction

#initialise the result dataframe
class_result2= featdf[['section_type','section']].reset_index(drop = True)
class_result2.loc[:,'estimated_type']='None'
#############
#make estimations
for i in range(0, len(concap)):

    #training of classifier fromo subpart1
    
    X21_train = featuresdf.iloc[:int(len(featuresdf)*0.66),len(conca)+2:].values
    Y21_train = featuresdf.iloc[:int(len(featuresdf)*0.66),featuresdf.columns.get_loc(concap[len(concap)-1-i])].values
    sc_X= StandardScaler()
    X21_train=sc_X.fit_transform(X21_train)
    classifier21 = RFECV(LogisticRegression(random_state=0),scoring='accuracy')
    classifier21.fit(X21_train, Y21_train)
    
    #predict the subpart2
    X22 = featdf.iloc[:,len(concap)+2:].values
    Y22 = featdf.iloc[:,i].values
    #X22 = featdf.iloc[:,i].values
    print('treatment of type\'',featdf.columns[i],'\' :done ')
  
    #feature scaling
    X22 = sc_X.fit_transform(X22)
    #predict the test set result
    Y_pred22=classifier21.predict(X22)

    #store perdiction for class i in the result dataframe
    Y_true2 = class_result2.index[Y_pred22==True]
    for tr_ind in Y_true2:
        if class_result2.loc[tr_ind,'estimated_type']=='None':
            class_result2.loc[tr_ind,'estimated_type'] =featdf.columns[i]
    
    print('training for type\'',featuresdf.columns[featuresdf.columns.get_loc(concap[len(concap)-1-i])],'\' :done ')        




treatment of type' TESTIMONY ' :done 
training for type' TESTIMONY ' :done 
treatment of type' STATE ' :done 
training for type' STATE ' :done 




treatment of type' COMPLAINT ' :done 
training for type' COMPLAINT ' :done 
treatment of type' REMOVAL ' :done 
training for type' REMOVAL ' :done 




treatment of type' STANDARD OF REVIEW ' :done 
training for type' STANDARD OF REVIEW ' :done 
treatment of type' ARGUMENT ' :done 
training for type' ARGUMENT ' :done 
treatment of type' CLAIM ' :done 
training for type' CLAIM ' :done 




treatment of type' CONCLUSION ' :done 
training for type' CONCLUSION ' :done 
treatment of type' INTRODUCTION ' :done 
training for type' INTRODUCTION ' :done 




treatment of type' STATEMENT OF FACTS ' :done 
training for type' STATEMENT OF FACTS ' :done 
treatment of type' APPLICATION ' :done 
training for type' APPLICATION ' :done 




treatment of type' ACT ' :done 
training for type' ACT ' :done 
treatment of type' DISCUSSION ' :done 
training for type' DISCUSSION ' :done 




treatment of type' JURISDICTIONAL STATEMENT ' :done 
training for type' JURISDICTIONAL STATEMENT ' :done 
treatment of type' STATEMENT OF THE CASE ' :done 
training for type' STATEMENT OF THE CASE ' :done 




treatment of type' ANALYSIS ' :done 
training for type' ANALYSIS ' :done 
treatment of type' PROCEDURE ' :done 
training for type' PROCEDURE ' :done 




treatment of type' LEGAL FRAMEWORK ' :done 
training for type' LEGAL FRAMEWORK ' :done 




In [21]:
class_result2

Unnamed: 0,section_type,section,estimated_type
0,LEGAL FRAMEWORK,Legal Framework,LEGAL FRAMEWORK
1,PROCEDURE,B. Procedure,PROCEDURE
2,ANALYSIS,II. ANALYSIS,ANALYSIS
3,STATEMENT OF THE CASE,B. Ineffective Assistance in the case,STATEMENT OF THE CASE
4,JURISDICTIONAL STATEMENT,1. Opening the Door For Prejudicial Hearsay,
5,DISCUSSION,III. DISCUSSION,DISCUSSION
6,ACT,C. Precedent Under Federal Communications Acts,
7,APPLICATION,III. Application,APPLICATION
8,STATEMENT OF FACTS,2. Fact of Failing to Request A Limiting Instr...,STATEMENT OF FACTS
9,STATEMENT OF FACTS,3. Double Jeopardy Fact,STATEMENT OF FACTS


In [22]:
###performance of subpart 1 (part 2)

correctely_labeled = sum(class_result2.section_type == class_result2.estimated_type)

#accuracy
print('accuracy = ',accuracy_score(class_result2.section_type , class_result2.estimated_type))

#correctely labeled
print('correctely_labeled = ',correctely_labeled )

accuracy =  0.82
correctely_labeled =  41


In [23]:
####### part_3_preprocessing: ####### apply the two classifers to a document

In [24]:
#select a html document;output:fname

from tkinter import Tk
from tkinter.filedialog import askopenfilename
Tk().withdraw()
fname = askopenfilename(title="Ouvrir votre document",filetypes=[('html files', '.html'),('all files','.*')])

In [25]:
#document preprocessing

#read the selected document as xml
html = open(fname, encoding='latin-1')
soup = BeautifulSoup(html,"lxml").get_text()
#eliminate spaces in start & end of the file
soup=soup.lstrip('\n').rstrip('\n')
#split file into paragraphs
para= soup.split(sep='\n\n')
# eliminate newlines(\n) from paragraphs
para_tmp=[]
for s in para:
    para_tmp.append(s.replace('\n',' '))
para=para_tmp
#remove pages' numbers
para_tmp = []
for line in para:
    if len(line.lstrip(' ').rstrip(' ')) > 4:
        para_tmp.append(line)
para = para_tmp
##########
#concatenate the paragraphs into one string
data='\t\n\t'.join(para)
#Parsing data using pandas dataframe
TESTDATA = StringIO(data)
parsed_doc=pd.read_csv(TESTDATA,sep='\t\n\t',names=['section'])
#treating missing data
parsed_doc=parsed_doc.dropna()
#solving type problems(in case)
parsed_doc.section=parser.section.astype(str)
parsed_doc



Unnamed: 0,section
0,"Nieves v. Secretary, DOC"
1,"Nos. 10-1315, 10-1397 ..."
2,Factual Background
3,"COOK, Circuit Judge. General Medicine, P.C. (â..."
4,I. INTRODUCTION
5,This matter began as a contract dispute betwee...
6,The district court reopened the case in April ...
7,Counsel for both General and Horizon presented...
8,1 The consent judgment states in pertinent par...
9,"The district court received evidence, heard or..."


In [26]:
#Identifying headers in Document
parsed_doc = feature_application(parsed_doc)
doc_prop = parsed_doc.iloc[:,1:].values
headers_prediction=classifier1.predict(doc_prop)
headers_prediction
#parsed_doc

array([False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False])

['TRUE\tINTRODUCTION\tI. BACKGROUND', 'TRUE\tSTATEMENT OF FACTS\tA. Substantive Facts', 'TRUE\tLEGAL FRAMEWORK\tLegal Framework', 'TRUE\tPROCEDURE\tB. Procedure', 'TRUE\tSTATEMENT OF THE CASE\tB. Ineffective Assistance in the case', 'TRUE\tJURISDICTIONAL STATEMENT\t1. Opening the Door For Prejudicial Hearsay', 'TRUE\tDISCUSSION\tIII. DISCUSSION', 'TRUE\tACT\tC. Precedent Under Federal Communications Acts', 'TRUE\tAPPLICATION\tIII. Application', 'TRUE\tSTATEMENT OF FACTS\t2. Fact of Failing to Request A Limiting Instruction', 'TRUE\tSTATEMENT OF FACTS\t3. Double Jeopardy Fact', 'TRUE\tINTRODUCTION\tBACKGROUND', 'TRUE\tSTATEMENT OF FACTS\tD. Sufficiency of the Evidence Fact', 'TRUE\tACT\tC. Precedent Under Federal Communications Acts.', 'TRUE\tACT\tB. The Pole Attachment Act.', '\nTRUE\tCONCLUSION\tIII. CONCLUSION', '\n                                                14', 'TRUE\tINTRODUCTION\tI. BACKGROUND', 'TRUE\tLEGAL FRAMEWORK\tLegal Framework', "TRUE\tCLAIM\tThompson's Claim", 'TRUE\

Number Of sections : 167
Number Of correctly identified sections : 166
Number Of headers : 51
Number Of Actual Identified headers : 50
Number Of correctly identified headers : 50
Number Of headers identified with regular expression : 54
Precision of our approach : 0.9803921568627451
Precision of Regular expression method : 0.92


In [41]:
########################################################################

TESTIMONY  :  8
STATEMENT OF THE CASE  :  9
STATEMENT OF FACTS  :  22
STATE  :  5
STANDARD OF REVIEW  :  12
REMOVAL  :  4
PROCEDURE  :  7
PRELIMINARY STATEMENT  :  4
PREEMPTION  :  2
NOTICE  :  3
MOTION  :  4
LEGAL FRAMEWORK  :  5
JURISDICTIONAL STATEMENT  :  6
INTRODUCTION  :  19
DISCUSSION  :  12
CONCLUSION  :  17
COMPLAINT  :  8
CLAIM  :  6
ARGUMENT  :  9
APPLICATION  :  8
ANALYSIS  :  10
AMENDMENT  :  11
ACT  :  9


In [None]:
#using regular expression approach
######read one document and storage preparation function (html to csv)

def doc_prep (url):
    #read file as xml
    html = open(url, encoding='latin-1')
    soup = BeautifulSoup(html,"lxml").get_text()
    #########
    #file format processing
    
    #eliminate spaces in start & end of the file
    soup=soup.lstrip('\n').rstrip('\n')
    #split file into paragraphs
    para= soup.split(sep='\n\n')
    # eliminate newlines(\n) from paragraphs
    return para

reg_para = []
for url in files:
    reg_para = reg_para+doc_prep(url)

output = []
for w in reg_para[int(len(reg_para) * 0.75):]:
    if (len(w) > 3 and w[0] != "N" ):
         if (len(w) < 70 ) or (re.search('^.+[1-9IVa-fiv]{1,3}[.|)]\s*[A-Za-z].+$' , w)) and w[0] != 'N' : output.append(w)
ReNbIdentifiedHeaders = len(output)            
print(output)

In [None]:
#comparason with regex approach

NbIdentifiedSections = cm[0,0] + cm[1,1]
NbHeaders = cm[1,0] + cm[1,1]
NbIdentifiedHeaders = cm[0,1] + cm[1,1]
NbCorrectHeaders = cm[1,1]
#ReNbIdentifiedHeaders = int(ReNbIdentifiedHeaders / 4)

print("Number Of sections :" , int(NbSections / 4))
print("Number Of correctly identified sections :" , NbIdentifiedSections )
print("Number Of headers :" , NbHeaders)
print("Number Of Actual Identified headers :" , NbIdentifiedHeaders)
print("Number Of correctly identified headers :" , NbCorrectHeaders)
print("Number Of headers identified with regular expression :" , ReNbIdentifiedHeaders)
print("Precision of our approach :" , NbCorrectHeaders / NbHeaders)
print("Precision of Regular expression method :" , 1 - (ReNbIdentifiedHeaders - NbIdentifiedHeaders) / NbIdentifiedHeaders)


In [None]:
for i in range(0,len(conca)):
    print(featuresdf.columns[i],' : ',sum(featuresdf.iloc[:,i].values))