In [1]:
import os
import re
import spacy
import numpy as np
import pandas as pd
from tqdm import tqdm
from functions import *
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.metrics import f1_score
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

In [2]:
nlp = spacy.load("en")
path = './documents'
filenames = os.listdir(path)

In [3]:
nlp.vocab["cannot"].is_stop = False
nlp.vocab["give"].is_stop = False
nlp.vocab["never"].is_stop = False
nlp.vocab["nothing"].is_stop = False
nlp.vocab["third"].is_stop = False
nlp.vocab["without"].is_stop = False
nlp.vocab["neither"].is_stop = False
nlp.vocab["do"].is_stop = False
nlp.vocab["not"].is_stop = False

In [4]:
data = pd.DataFrame(columns=["id", "text"])

for index in tqdm(range(len(filenames))):
    filename = filenames[index]
    document = open(path+"/"+filename, encoding='utf-8', errors='ignore').read()
    data.loc[index] = [int(re.findall('\d+', filename)[0]), document]

100%|██████████| 434/434 [00:01<00:00, 417.38it/s]


In [5]:
data.drop_duplicates(subset=["text"], inplace=True)
data.reset_index(inplace=True, drop=True)

In [7]:
replacements = {' .':'.', " 's":"", '(s)':'', '(s':'', '   ':' '}
preprocessed_data = pd.DataFrame(columns=["id", "processed_text"])

for index in tqdm(range(len(data))):
    document = nlp(data.loc[index,"text"])
    filtered_sentence = [word for word in document if word.text in {'.', '?'} or word.is_stop is False and word.is_punct == False]
    filtered_sentence = ' '.join(map(str, filtered_sentence))
    filtered_sentence = multi_replace(filtered_sentence, replacements)
    filtered_sentence = str(filtered_sentence).lower()
    preprocessed_data.loc[index] = [data.loc[index,"id"], filtered_sentence]

100%|██████████| 275/275 [04:08<00:00,  1.97it/s]


In [8]:
preprocessed_data.to_csv("preprocessed_data_dropped_limited_features.csv", encoding='utf-8', index = False)

In [9]:
preprocessed_data['word_count'] = preprocessed_data['processed_text'].apply(lambda x: word_count(x))

In [10]:
preprocessed_data['sentence_count'] = preprocessed_data['processed_text'].apply(lambda x: sentence_count(x))

In [11]:
preprocessed_data['avg_sentence_length'] = preprocessed_data['word_count'].astype("float")/preprocessed_data['sentence_count'].astype("float")

In [12]:
preprocessed_data['syllables_count'] = preprocessed_data['processed_text'].apply(lambda x: syllables_count(x))

In [13]:
preprocessed_data['avg_syllables_per_word'] = preprocessed_data['processed_text'].apply(lambda x: avg_syllables_per_word(x))

In [31]:
preprocessed_data['difficult_words'] = preprocessed_data['processed_text'].apply(lambda x: difficult_words(x))

In [32]:
preprocessed_data['poly_syllable_count'] = preprocessed_data['processed_text'].apply(lambda x: poly_syllable_count(x))

In [56]:
preprocessed_data['flesch_reading_ease'] = preprocessed_data['processed_text'].apply(lambda x: flesch_reading_ease(x))

In [57]:
preprocessed_data['gunning_fog'] = preprocessed_data['processed_text'].apply(lambda x: gunning_fog(x))

In [58]:
preprocessed_data['smog_index'] = preprocessed_data['processed_text'].apply(lambda x: smog_index(x))

In [59]:
preprocessed_data['dale_chall_readability_score'] = preprocessed_data['processed_text'].apply(lambda x: dale_chall_readability_score(x))

In [60]:
preprocessed_data['is_email_addresses'] = preprocessed_data['processed_text'].apply(lambda x: is_email_addresses(x))

In [61]:
preprocessed_data['use_cookies'] = preprocessed_data['processed_text'].apply(lambda x: use_cookies(x))

In [62]:
preprocessed_data['is_minor'] = preprocessed_data['processed_text'].apply(lambda x: is_minor(x))

In [132]:
preprocessed_data.to_csv("processed_data_dropped_all_features.csv", encoding='utf-8', index = False)

In [113]:
y = pd.read_csv("training_labels.csv")
preprocessed_data["id"] = preprocessed_data["id"].astype("uint32")
preprocessed_data = pd.merge(preprocessed_data, y, on="id")
y = preprocessed_data.loc[:,"score"]
del preprocessed_data["score"]

In [114]:
columns_list = preprocessed_data.columns

In [115]:
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(preprocessed_data, y)
X_res = pd.DataFrame(X_res, columns = columns_list)
y_res = pd.DataFrame(y_res, columns=["score"])

In [117]:
X_res.to_csv("processed_data_oversampled_all_features.csv", encoding='utf-8', index = False)

In [119]:
features = ['word_count', 'sentence_count', 'syllables_count', 'difficult_words', 'poly_syllable_count', 'flesch_reading_ease',
       'gunning_fog', 'smog_index', 'dale_chall_readability_score', 'is_email_addresses', 'use_cookies', 'is_minor']

In [123]:
minor_synonyms = set(['minor', 'child', 'kid', 'youngster'])
geolocation_synonyms = set(['geo-location', 'geolocation', 'location', 'position'])
vendor_synonyms = set(['vendor', 'vender', 'seller', 'marketer'])

In [126]:
X_train, X_test, y_train, y_test = train_test_split(X_res[features], y_res, test_size=0.33, random_state=42)

In [127]:
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, np.ravel(y_train))
pred = model.predict(X_test)

In [128]:
f1_score(y_test, pred, average='weighted')

0.8114380844901176