In [1]:
import spacy
import os
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
from functions import *
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

In [2]:
nlp = spacy.load("en")
path = 'D:/Preperation for Hiring/Data Security Hackathon at IIT/Mission-Mars/documents'

dataset = pd.DataFrame(columns=["id", "text"])
y = pd.read_csv("training_labels.csv", usecols=["id", "score"])
filenames = os.listdir(path)

In [3]:
replacements = {' .':'.', " 's":"", '(s)':'', '(s':'', '   ':' '}

for index in tqdm(range(len(filenames))):
    filename = filenames[index]
    document = open(path+"/"+filename, encoding='utf-8', errors='ignore').read()
    document = nlp(document)
    filtered_sentence = [word for word in document if word.text in {'.', '?'} or word.is_stop is False and word.is_punct == False]
    filtered_sentence = ' '.join(map(str, filtered_sentence))
    filtered_sentence = multi_replace(filtered_sentence, replacements)
    filtered_sentence = str(filtered_sentence).lower()
    dataset.loc[index] = [int(re.findall('\d+', filename)[0]), filtered_sentence]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 434/434 [10:29<00:00,  1.15it/s]


In [7]:
dataset["id"] = dataset["id"].astype("uint32")
y["id"] = y["id"].astype("uint32")
final_dataset = pd.merge(dataset, y, on='id')

In [8]:
final_dataset = final_dataset.drop_duplicates(subset=["text"])

In [9]:
final_dataset.to_csv("final_dataset_dropped_limited_features.csv", encoding='utf-8', index = False)

In [10]:
final_dataset['word_count'] = final_dataset['text'].apply(lambda x: word_count(x))
final_dataset['sentence_count'] = final_dataset['text'].apply(lambda x: sentence_count(x))
final_dataset['avg_sentence_length'] = final_dataset['word_count'].astype("float")/final_dataset['sentence_count'].astype("float")
final_dataset['syllables_count'] = final_dataset['text'].apply(lambda x: syllables_count(x))
final_dataset['avg_syllables_per_word'] = final_dataset['text'].apply(lambda x: avg_syllables_per_word(x))
final_dataset['difficult_words'] = final_dataset['text'].apply(lambda x: difficult_words(x))
final_dataset['poly_syllable_count'] = final_dataset['text'].apply(lambda x: poly_syllable_count(x))
final_dataset['flesch_reading_ease'] = final_dataset['text'].apply(lambda x: flesch_reading_ease(x))
final_dataset['gunning_fog'] = final_dataset['text'].apply(lambda x: gunning_fog(x))
final_dataset['smog_index'] = final_dataset['text'].apply(lambda x: smog_index(x))
final_dataset['dale_chall_readability_score'] = final_dataset['text'].apply(lambda x: dale_chall_readability_score(x))

In [11]:
final_dataset['is_email_addresses'] = final_dataset['text'].apply(lambda x: is_email_addresses(x))
final_dataset['use_cookies'] = final_dataset['text'].apply(lambda x: use_cookies(x))

In [12]:
final_dataset.to_csv("final_dataset_dropped_all_features.csv", encoding='utf-8', index = False)

In [24]:
columns_list = final_dataset.columns.tolist()
del columns_list[2]

In [26]:
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(final_dataset.drop("score", axis=1), final_dataset["score"])
X_res = pd.DataFrame(X_res, columns = columns_list)
y_res = pd.DataFrame(y_res, columns=["score"])

680
680


In [27]:
final_dataset.to_csv("final_dataset_oversampled_all_features.csv", encoding='utf-8', index = False)

In [29]:
features = ['word_count', 'sentence_count', 'syllables_count', 'difficult_words', 'poly_syllable_count', 'flesch_reading_ease',
       'gunning_fog', 'smog_index', 'dale_chall_readability_score', 'is_email_addresses', 'use_cookies']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X_res[features], y_res, test_size=0.33, random_state=42)
model = SVC(gamma='auto')
model.fit(X_train, np.ravel(y_train))
y_pred = model.predict(X_test)
f1_score(y_test, y_pred, average='weighted')

0.8315705751834784

In [33]:
minor_synonyms = set(['minor', 'child', 'kid', 'youngster'])
geolocation_synonyms = set(['geo-location', 'geolocation', 'location', 'position'])
vendor_synonyms = set(['vendor', 'vender', 'seller', 'marketer'])

In [34]:
# final_dataset['is_minor'] = final_dataset['text'].apply(lambda x: is_minor(x))
# final_dataset['is_how_collect'] = 
# final_dataset['is_geo_location'] = 
# final_dataset['is_vendor'] = 
# final_dataset['is_not_sell'] = 
# final_dataset['is_sell'] = 
# final_dataset['is_share'] = 
# final_dataset['is_not_share'] = 