In [1]:
import os
import textract
import pickle
import re

from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier

corpus = []
path = input("Folder Path: ")
stopWords = set(stopwords.words('english'))
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-zA-Z .#+_]')


def get_text(loc):
    try:
        return textract.process(loc).decode("utf-8")
    except textract.exceptions.ExtensionNotSupported as e:
        print(e)
        return ''

    
def get_clean_text(text):
    """
        text: a string

        return: modified initial string
    """
    text = BeautifulSoup(text, "lxml").text  # HTML decoding
#     text = text.lower() # lowercase text
    text = text.replace('\n', ' ').replace('\r', '')
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text)  # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in stopWords)  # delete stopwors from text
    return text


Folder Path: files


In [2]:
targets = ['questions','solution']
Y = []

for (root, dirs, files) in os.walk(path, topdown=False):
    print(root, dirs, len(files))
    for file in files:
        targetVal = targets.index(root[len(path)+1:])
        try:
#             file_features(file, root)
            text1 = get_text(os.path.join(root, file))
            clean = get_clean_text(text1).lower()
#             print(file)
            corpus.append(clean)
            Y.append(targetVal)
        except:
            pass
# print(corpus)

files/questions [] 2337
files/solution [] 1648
files ['questions', 'solution'] 0


In [3]:
print(corpus)
vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words=stopWords, max_features=70000)


X = vectorizer.fit_transform(corpus).toarray()
print(vectorizer.get_feature_names())
pickle.dump(vectorizer, open('vectorizer.sav', 'wb'))
# print(X)
# print(corpus.vectorizer(ngram_range(1,3)))

print(X.shape)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)





(3977, 70000)


In [4]:
# print(vectorizer.get_stop_words())

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)


# Y_train = X_train["type"]
# X_train = X_train.drop("type", axis=1)
# X_test  = X_test.drop("type", axis=1).copy()

In [6]:
print(X_train.shape, len(Y_train), X_test.shape, len(Y_test))

(2664, 70000) 2664 (1313, 70000) 1313


In [7]:
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [8]:
y_pred = classifier.predict(X_test)

In [9]:
print(confusion_matrix(Y_test,y_pred))
print(classification_report(Y_test,y_pred))
print(accuracy_score(Y_test, y_pred))

[[713  80]
 [ 24 496]]
              precision    recall  f1-score   support

           0       0.97      0.90      0.93       793
           1       0.86      0.95      0.91       520

    accuracy                           0.92      1313
   macro avg       0.91      0.93      0.92      1313
weighted avg       0.93      0.92      0.92      1313

0.9207920792079208


In [10]:
with open('text_classifier.sav', 'wb') as picklefile:
    pickle.dump(classifier,picklefile)

In [11]:
path = input("Folder Path: ")
vectorizer = pickle.load(open('vectorizer.sav', 'rb'))
with open('text_classifier.sav', 'rb') as training_model:
    model = pickle.load(training_model)

Folder Path: test


In [12]:
corpus2 = []
targets = ['questions','solution']
Y2 = []

for (root, dirs, files) in os.walk(path, topdown=False):
    print(root, dirs, len(files))
    for file in files:
        targetVal = targets.index(root[len(path)+1:])
        try:
#             file_features(file, root)
            text1 = get_text(os.path.join(root, file))
#             print(file)
            corpus2.append(text1)
            Y2.append(targetVal)
        except:
            pass

# vectorizer = TfidfVectorizer(ngram_range=(1,1), stop_words=stopWords)
# X_test2 = vectorizer.fit_transform(corpus).toarray()

# y_pred2 = model.predict(X_test2)


test/questions [] 203
test/solution [] 225
test ['questions', 'solution'] 0


In [13]:

# vectorizer = TfidfVectorizer(ngram_range=(1,1), stop_words=stopWords)
# X_test2 = vectorizer.fit_transform(corpus).toarray()

# y_pred2 = model.predict(X_test2)
# print(confusion_matrix(Y2, y_pred2))
# print(classification_report(Y2, y_pred2))
# print(accuracy_score(Y2, y_pred2))

print(corpus2)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [14]:
A = vectorizer.transform(corpus2)

In [15]:
print(A)

  (0, 69383)	0.13896451238180993
  (0, 69237)	0.040521863450038796
  (0, 69139)	0.067926000619915
  (0, 68799)	0.07370497123301019
  (0, 68431)	0.11362632159574876
  (0, 67946)	0.20607433276749804
  (0, 67688)	0.0981449197506782
  (0, 67678)	0.06819313802181294
  (0, 63737)	0.12598436699344964
  (0, 63713)	0.051080847182226455
  (0, 62385)	0.14684099700651806
  (0, 62236)	0.04472277929941137
  (0, 60358)	0.08345522116563886
  (0, 60229)	0.06986760555948243
  (0, 57580)	0.11067753593307103
  (0, 57061)	0.0667623296478093
  (0, 56935)	0.10970525148336226
  (0, 54468)	0.06064154328386125
  (0, 52529)	0.07183746036854775
  (0, 50804)	0.15919904240421895
  (0, 50798)	0.16397070763446348
  (0, 50556)	0.2600211804132292
  (0, 50393)	0.10750842655001915
  (0, 50329)	0.06753125762992099
  (0, 46166)	0.1047979947084686
  :	:
  (300, 1865)	0.006411012860711893
  (300, 1824)	0.019983130802970213
  (300, 1772)	0.014722900630588473
  (300, 1749)	0.019983130802970213
  (300, 1734)	0.02519012296651959

In [16]:
y_pred2 = model.predict(A)

print(y_pred2.shape, len(Y2))


print(confusion_matrix(Y2, y_pred2))
print(classification_report(Y2, y_pred2))
print(accuracy_score(Y2, y_pred2))

(301,) 301
[[ 95   3]
 [ 34 169]]
              precision    recall  f1-score   support

           0       0.74      0.97      0.84        98
           1       0.98      0.83      0.90       203

    accuracy                           0.88       301
   macro avg       0.86      0.90      0.87       301
weighted avg       0.90      0.88      0.88       301

0.8770764119601329


In [17]:
print(y_pred2)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 1 1 0 1 0
 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1
 0 1 1 0 0 1 1 1 1 1 1 0 1 0 1 1 1 1 0 0 1 0 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1
 0 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 0 1
 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 0 1 1 1]
