In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
# Load the raw clause data set; the relative path is resolved against the kernel's working directory.
df = pd.read_csv('../../data/AI_ML_Challenge_Training_Data_Set_1_v1.csv')

In [3]:
# Normalise the column names and discard the 'Clause ID' column, which is not used below.
# errors='ignore' keeps this cell re-runnable: on a second execution 'Clause ID' is already
# gone and a plain drop would raise KeyError.
df = (
    df.rename(columns={'Clause Text':'clause_text',"Classification":'label'})
      .drop(columns=['Clause ID'], errors='ignore')
)
df.head()

Unnamed: 0,clause_text,label
0,18. Governing Law: This Agreement shall be gov...,0
1,"1.8 Modification. We may modify, update, or di...",1
2,Except as otherwise expressly provided in this...,0
3,8.3. The benefit and burdens of this Ag...,1
4,DEFINITIONS,0


In [4]:
# Class balance: number of clauses per label value.
df['label'].value_counts()

0    6407
1    1472
Name: label, dtype: int64

## Preprocess text

In [5]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
# English stop words from NLTK, held as a set; strip_nonsense consults it only when
# remove_stop_words=True.
stop_words = set(stopwords.words('english'))
# A token is accepted only if it is purely alphabetic or is exactly the literal `508`.
# (A character class such as `[a-zA-Z^508]` would also accept carets and any mix of the
# digits 5, 0 and 8, e.g. `500`, `80` or `a5`.)
no_nonsense_re = re.compile(r'^(?:[a-zA-Z]+|508)$')
def strip_nonsense(doc,remove_stop_words=False,port_stem=False):
    """
    Returns the lowercased, whitespace-delimited tokens of a string that are alpha-only
    (or exactly `508`), optionally without stop words and optionally Porter-stemmed.

    A token is tested as a whole, so a word with punctuation or digits attached
    (e.g. `law:` or `(60)`) is dropped rather than trimmed.

    Parameters:
        doc (str): the text of a single document/clause. Any non-string value
            (e.g. None or NaN for a missing cell) is treated as empty text.
        remove_stop_words (bool): if True, tokens found in the module-level
            `stop_words` set are dropped.
        port_stem (bool): if True, each kept token is reduced with NLTK's PorterStemmer,
            and tokens shorter than 3 or longer than 17 characters are dropped.
            The length limits apply only in this mode.

    Returns:
        words (str): the kept tokens in their original order, each followed by a single
            space (so a non-empty result ends with a space); '' if nothing is kept.
    """

    if not isinstance(doc, str):
        return ''

    # Build the stemmer once per call instead of once per word.
    stemmer = PorterStemmer() if port_stem else None

    kept = []
    for token in doc.lower().split():
        if not no_nonsense_re.match(token):
            continue
        if remove_stop_words and token in stop_words:
            continue
        if stemmer is None:
            kept.append(token)
        elif 3 <= len(token) <= 17:
            kept.append(stemmer.stem(token))

    # Every token keeps its trailing space so the output format matches what callers already store.
    return ''.join(word + ' ' for word in kept)

In [6]:
# Example
'''
COMPANY warrants that the SOFTWARE will, for a period of sixty (60) days from the date of your receipt, perform substantially in accordance with SOFTWARE written materials accompanying it. Except as just stated,The parties acknowledge that the Software is provided ""AS IS,"" and may not be functional on every machine or in every environment.  Except as set forth herein,  COMPANY DISCLAIMS ALL WARRANTIES RELATING TO THE SOFTWARE, EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTIES AGAINST INFRINGEMENT OF THIRD PARTY RIGHTS, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE."
Label - 1, Not Acceptable
'''

# not removing stop words

'''
company warrants that the software for a period of sixty days from the date of your perform substantially in accordance with software written materials accompanying except as just parties acknowledge that the software is provided and may not be functional on every machine or in every except as set forth company disclaims all warranties relating to the express or but not limited any warranties against infringement of third party merchantability and fitness for a particular ,1
'''

# removing stop words, without Porter stemming
'''
company warrants software period sixty days date perform substantially accordance software written materials accompanying except parties acknowledge software provided may functional every machine every except set forth company disclaims warranties relating express limited warranties infringement third party merchantability fitness particular
Label - 1, Not Acceptable
'''

# removing stop words, with Porter stemming
'''
compani warrant softwar period sixti day date perform substanti accord softwar written materi accompani except parti acknowledg softwar provid may function everi machin everi except set forth compani disclaim warranti relat express limit warranti infr third parti merchant fit particular 
Label - 1, Not Acceptable
'''

'\ncompani warrant softwar period sixti day date perform substanti accord softwar written materi accompani except parti acknowledg softwar provid may function everi machin everi except set forth compani disclaim warranti relat express limit warranti infr third parti merchant fit particular \nLabel - 1, Not Acceptable\n'

In [7]:
# Check which columns remain in the frame.
df.columns

Index(['clause_text', 'label'], dtype='object')

In [8]:
# Preview of the integer cast only: the result is displayed, not assigned back to df.
df['label'].astype(int)

0       0
1       1
2       0
3       1
4       0
       ..
7874    0
7875    1
7876    0
7877    0
7878    1
Name: label, Length: 7879, dtype: int64

In [9]:
# How many clauses there are for each clause length (in characters).
df['clause_text'].str.len().value_counts()

13      76
9       58
8       54
12      49
11      43
        ..
1210     1
1226     1
1230     1
1278     1
4108     1
Name: clause_text, Length: 1718, dtype: int64

In [10]:
# Preview of the cleaned text using strip_nonsense's default options; nothing is stored in df here.
df['clause_text'].apply(strip_nonsense)

0       governing this agreement shall be governed by ...
1       we may or discontinue the software any of thei...
2       except as otherwise expressly provided in this...
3       the benefit and burdens of this agreement may ...
4                                            definitions 
                              ...                        
7874                  the is hereby granted by company a 
7875    this end user license agreement is a binding a...
7876    the financial cumulative liability of company ...
7877    the customer acknowledges that all intellectua...
7878    company software is not that means that compan...
Name: clause_text, Length: 7879, dtype: object

In [11]:
destination_folder = '../../data'
train_test_ratio = 0.90 # 90/10 split: 90% train (+ valid), 10% test
train_valid_ratio = 0.80 # 80/20 split of the training portion: 80% train, 20% valid
random_seed = 1 # one seed for every stochastic step in this cell, so a re-run reproduces the splits

# preprocess text with strip_nonsense's default options:
# lowercase -> keep only clean tokens (stop words are kept, no Porter stemming)
df['clause_text'] = df['clause_text'].apply(strip_nonsense)
df['label'] = df['label'].astype(int)
# Drop rows whose cleaned text is empty (i.e. fewer than two characters)
df = df.drop(index=df.index[df['clause_text'].str.len() < 2])

# split according to label
df_good = df[df['label'] == 0] # acceptable, good clauses
df_bad_unique = df[df['label'] == 1] # not acceptable, bad clauses (before any oversampling)

# train-test split, done on the original rows of each class
df_good_full_train, df_good_test = train_test_split(df_good, train_size = train_test_ratio, random_state = random_seed)
df_bad_full_train, df_bad_test = train_test_split(df_bad_unique, train_size = train_test_ratio, random_state = random_seed)
# train-valid split
df_good_train, df_good_valid = train_test_split(df_good_full_train, train_size = train_valid_ratio, random_state = random_seed)
df_bad_train, df_bad_valid = train_test_split(df_bad_full_train, train_size = train_valid_ratio, random_state = random_seed)

# Oversample the bad class to deal with imbalance, but only AFTER splitting and separately
# inside each split. Sampling with replacement before the split would put copies of the
# same row into train, valid and test at once, leaking evaluation data into training.
# NOTE: rows that already share identical text in the source data can still end up in different splits.
df_bad_train = df_bad_train.sample(len(df_good_train), replace=True, random_state=random_seed)
df_bad_valid = df_bad_valid.sample(len(df_good_valid), replace=True, random_state=random_seed)
df_bad_test = df_bad_test.sample(len(df_good_test), replace=True, random_state=random_seed)

# All oversampled bad rows together; same row count as the good class.
df_bad = pd.concat([df_bad_train, df_bad_valid, df_bad_test])
assert df_bad.shape[0] == df_good.shape[0]

# concatenate splits of different labels
df_train = pd.concat([df_good_train, df_bad_train], ignore_index=True, sort=False)
df_valid = pd.concat([df_good_valid, df_bad_valid], ignore_index=True, sort=False)
df_test = pd.concat([df_good_test, df_bad_test], ignore_index=True, sort=False)

In [12]:
# Write preprocessed data
df_train.to_csv(destination_folder + '/train.csv', index=False)
df_valid.to_csv(destination_folder + '/valid.csv', index=False)
df_test.to_csv(destination_folder + '/test.csv', index=False)
# not acceptable clauses
df_bad.to_csv(destination_folder + '/bad.csv', index=False)

In [14]:
# Sanity check: row counts of the train and valid frames, then the first test record.
for split_frame in (df_train, df_valid):
    print(len(split_frame))
print(df_test.iloc[0])

8896
2224
clause_text    governing this agreement is governed by the fe...
label                                                          0
Name: 0, dtype: object
