In [1]:
import pandas as pd
import numpy as np
from nltk import word_tokenize, pos_tag, pos_tag_sents
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [2]:
df = pd.read_csv("possible_cleaned_data.csv")

In [3]:
df

Unnamed: 0,content,font_pro,font_size,label
0,SERVICES AGREEMENT,DejaVuSerif-Bold,15,1
1,"THIS SERVICES AGREEMENT (this “Agreement”), eﬀ...",DejaVuSerif,14,0
2,"NUGENE INC. (“NuGene” or “NG”), a Califo...",DejaVuSerif-Bold,15,1
3,; and,DejaVuSerif,14,0
4,"KBHJJ,",DejaVuSerif-Bold,15,1
...,...,...,...,...
202025,levels set,DejaVuSans,20,0
202026,"June 30, 2005 grant date, a 12.5% equity award...",DejaVuSans,20,0
202027,full upon the Executive's termination for any ...,DejaVuSans,20,0
202028,Bonus,DejaVuSans,20,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202030 entries, 0 to 202029
Data columns (total 4 columns):
content      202030 non-null object
font_pro     202030 non-null object
font_size    202030 non-null int64
label        202030 non-null int64
dtypes: int64(2), object(2)
memory usage: 6.2+ MB


In [5]:
df['content']

0                                        SERVICES AGREEMENT
1         THIS SERVICES AGREEMENT (this “Agreement”), eﬀ...
2         NUGENE  INC.  (“NuGene”  or  “NG”),  a  Califo...
3                                                    ;  and
4                                                    KBHJJ,
                                ...                        
202025                                           levels set
202026    June 30, 2005 grant date, a 12.5% equity award...
202027    full upon the Executive's termination for any ...
202028                                                Bonus
202029    -- MICHAEL KARPHEDEN Signed: /s/ Hakan Wretsel...
Name: content, Length: 202030, dtype: object

In [6]:
texts = df['content'].tolist()

In [7]:
tagged_texts = pos_tag_sents(map(word_tokenize, texts))        # parts of speech tagging

In [8]:
df['POS'] = tagged_texts

In [9]:
X = pd.DataFrame(df, columns=['POS', 'font_pro', 'font_size'])

In [10]:
for x in range(len(X)):
    X['POS'][x] = str(X['POS'][x])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [11]:
le = LabelEncoder()
X.POS = le.fit_transform(X.POS)
X.font_pro = le.fit_transform(X.font_pro)
X.font_size = le.fit_transform(X.font_size)

In [12]:
y = pd.DataFrame(df, columns=['label'])

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [14]:
# x_train = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False).fit_transform(x_train)
# x_test = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False).fit_transform(x_test)
# y_train = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False).fit_transform(y_train)
# y_test = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False).fit_transform(y_test)

In [15]:
# x_train = x_train.toarray()
# x_test = x_test.toarray()
# y_train = y_train.toarray()
# y_test = y_test.toarray()

In [16]:
# encoder = LabelEncoder()
# encoder.fit(x_train)

In [17]:
# y_train = encoder.transform(y_train)
# y_test = encoder.transform(y_test)

In [18]:
treeclf = DecisionTreeClassifier(criterion = "gini", splitter = "best", min_samples_split = 2, min_samples_leaf = 3)

In [19]:
treeclf.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [20]:
predictions = treeclf.predict(X_test)

In [21]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     49881
           1       1.00      1.00      1.00     10728

    accuracy                           1.00     60609
   macro avg       1.00      1.00      1.00     60609
weighted avg       1.00      1.00      1.00     60609



In [22]:
accuracy = accuracy_score(predictions, y_test)                             # before over sampling

In [23]:
accuracy

0.9999835008002111

In [24]:
y_train_series = y_train.iloc[:,0]

In [25]:
sm = SMOTE(random_state = 42)

In [26]:
X_train_res, y_train_res = sm.fit_resample(X_train, y_train_series)        # Over Smapling using SMOTE

In [27]:
treeclf.fit(X_train_res, y_train_res)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [28]:
predictions = treeclf.predict(X_test)

In [29]:
accuracy = accuracy_score(predictions, y_test)

In [31]:
accuracy

0.9943737728720157

In [59]:
predictions

array([0, 0, 0, ..., 1, 1, 1])

In [None]:
# Below code is for testing.

In [109]:
df_test = pd.read_csv("possible_cleaned_test_data.csv")

In [110]:
texts = df_test['content'].tolist()

In [111]:
tagged_texts = pos_tag_sents(map(word_tokenize, texts))

In [112]:
df_test['POS'] = tagged_texts

In [113]:
X = pd.DataFrame(df_test, columns=['POS', 'font_pro', 'font_size'])

In [114]:
for x in range(len(X)):
    X['POS'][x] = str(X['POS'][x])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [116]:
# le = LabelEncoder()
X.POS = le.fit_transform(X.POS)
# X.font_pro = le.fit_transform(X.font_pro)
# X.font_size = le.fit_transform(X.font_size)

In [117]:
test_pre = treeclf.predict(X)

In [127]:
list(le.inverse_transform(test_pre))

["[(',', ','), ('with', 'IN'), ('the', 'DT'), ('grade', 'NN'), ('lowered', 'VBN'), ('by', 'IN'), ('one', 'CD'), ('grade', 'NN'), (',', ','), ('with', 'IN'), ('a', 'DT'), ('one-day', 'JJ'), ('window', 'NN'), ('.', '.')]",
 "[(',', ','), ('with', 'IN'), ('the', 'DT'), ('grade', 'NN'), ('lowered', 'VBN'), ('by', 'IN'), ('one', 'CD'), ('grade', 'NN'), (',', ','), ('with', 'IN'), ('a', 'DT'), ('one-day', 'JJ'), ('window', 'NN'), ('.', '.')]",
 "[(',', ','), ('with', 'IN'), ('the', 'DT'), ('grade', 'NN'), ('lowered', 'VBN'), ('by', 'IN'), ('one', 'CD'), ('grade', 'NN'), (',', ','), ('with', 'IN'), ('a', 'DT'), ('one-day', 'JJ'), ('window', 'NN'), ('.', '.')]",
 "[(',', ','), ('with', 'IN'), ('the', 'DT'), ('grade', 'NN'), ('lowered', 'VBN'), ('by', 'IN'), ('one', 'CD'), ('grade', 'NN'), (',', ','), ('with', 'IN'), ('a', 'DT'), ('one-day', 'JJ'), ('window', 'NN'), ('.', '.')]",
 "[(',', ','), ('with', 'IN'), ('the', 'DT'), ('grade', 'NN'), ('lowered', 'VBN'), ('by', 'IN'), ('one', 'CD'), ('gr

In [118]:
test_pre

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,