# Feature Creation Notebook

This notebook contains the code to add numeric features based on the words in each sentence.  Some of the feature creation steps take a long time, so run this notebook once and save the output so it can be loaded (instead of re-created) in the future.

Jump to [here](#load_x) if you have already completed the feature generation steps and want to add more to the feature set.

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from typing import List, Set

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, accuracy_score

In [None]:
# fixed seed, available to any step that needs to be repeatable
RANDOM_SEED = 17
# realpath('.') resolves the current working directory; dirname() then steps up one level,
# so the data/ and models/ paths built below live next to the folder this notebook runs from
project_base = os.path.dirname(os.path.realpath('.'))
print(f'Project base path: {project_base}')

In [None]:
data_path = os.path.join(project_base, 'data', 'WikiLarge_Train.csv')
full_df = pd.read_csv(data_path)
print(f'full_df column names: {list(full_df)}')
print(f'full training data df shape: {full_df.shape}')

In [None]:
# balanced split between the two classes
full_df.label.value_counts()

In [None]:
# load the kaggle evaluation set; the printed labels name this frame explicitly so the
# output cannot be mistaken for the training-data summary printed earlier
kaggle_data_path = os.path.join(project_base, 'data', 'WikiLarge_Test.csv')
kaggle_full_df = pd.read_csv(kaggle_data_path)
print(f'kaggle_full_df column names: {list(kaggle_full_df)}')
print(f'full kaggle data df shape: {kaggle_full_df.shape}')

In [None]:
# remove the `id` column before this frame is stacked under the training data;
# reassigning with errors='ignore' (instead of an in-place drop) keeps the cell safe to re-run
kaggle_full_df = kaggle_full_df.drop(columns='id', errors='ignore')

In [None]:
# put the kaggle evaluation set and the training set in one df for now
# makes features generation a little easier
# will split files back out at end of notebook

# `DataFrame.append` no longer exists in pandas >= 2.0; `pd.concat` is the supported replacement.
# ignore_index=True rebuilds a clean 0..n-1 index in the same step.
# NOTE: re-running this cell stacks the kaggle rows a second time - reload `full_df` first.
full_df = pd.concat([full_df, kaggle_full_df], ignore_index=True)
print(full_df.shape)
assert full_df.shape[0] == len(set(full_df.index))

In [None]:
full_df.head()

In [None]:
# split features and labels - not really necessary in this step
X = pd.DataFrame(full_df.original_text.copy())
y = full_df.label.copy()

In [None]:
X.head()

### Add features that do not need to be `fit_transformed`

Because these features do not depend on a fit and transformation, they will be calculated on the whole data set at once.

In [None]:
# split the sentence into the individual words (plain whitespace split)
# no cleaning/tokenization/lemmatization, so case and attached punctuation are kept as-is
# TODO: try tokenizing or lemmatizing to get more matches with external data sources
X['sentence_word_list'] = X.original_text.str.split()

In [None]:
# number of words in sentence
X['word_count'] = X.sentence_word_list.str.len()

In [None]:
# load the Dale-Chall word list (header-less file, words taken from its first column)
# and keep it as a set for fast membership / intersection checks
dc_path = os.path.join(project_base, 'data', 'dale_chall.txt')
dc_df = pd.read_csv(dc_path, header=None)
dc_set = set(dc_df[0])

In [None]:
# number of distinct Dale-Chall words in each sentence
# (set intersection: a word repeated in the sentence counts once, and matching is exact / case-sensitive)
X['dale_chale_overlap_count'] = X.sentence_word_list.apply(lambda a: len(set(a).intersection(dc_set)))

In [None]:
# read in the age of acquisition data
# weird formatting... the file is read with an explicit Windows-1252 encoding

aoa_path = os.path.join(project_base, 'data', 'AoA_51715_words.csv')
aoa_df = pd.read_csv(aoa_path, encoding='Windows-1252')
# every missing cell becomes 0, so a word with no rating contributes 0 to the max/avg features below
aoa_df.fillna(0, inplace=True)
aoa_df.head()

In [None]:
# set the word as the index, will make for faster lookups
aoa_df_w_index = aoa_df.set_index(['Word'])

In [None]:
# check df shape
aoa_df_w_index.shape

In [None]:
# create a set of the words, will be used for filtering in next cell
aoa_word_set = set(aoa_df.Word.values)

In [None]:
# only keep words that appear in both the sentence and the age of acquisition df
X['aoa_overlap'] = X.sentence_word_list.apply(lambda a: set(a).intersection(aoa_word_set))

In [None]:
# helper functions
# TODO: extract and add as external .py file

# max age of acquisition --> how hard is the hardest word in the sentence
def get_sentence_max_value(word_set: Set[str], ref_df: pd.DataFrame, search_value: str):
    """Return the largest `search_value` entry among the words in `word_set`.

    Each word is looked up as a row label of `ref_df`, reading the column named
    `search_value`. The result never drops below 0: an empty `word_set`, or one
    whose values are all <= 0, yields 0.
    """
    # seed the candidates with the 0 floor, then add one looked-up value per word
    candidates = [0]
    candidates.extend(ref_df.loc[token, search_value] for token in word_set)
    return max(candidates)

# average age of acquisition --> how hard is the average word in the sentence
def get_sentence_avg_value(word_set: Set[str], ref_df: pd.DataFrame, search_value: str):
    """Return the mean `search_value` entry over the words in `word_set`.

    Each word is looked up as a row label of `ref_df`, reading the column named
    `search_value`. An empty `word_set` yields 0 instead of dividing by zero.
    """
    n_words = len(word_set)
    if n_words == 0:
        # nothing matched the reference table for this sentence
        return 0
    running_sum = sum(ref_df.loc[token, search_value] for token in word_set)
    return running_sum / n_words

# quick test of functions
# needs `aoa_df_w_index` from the earlier cell; lists are passed here, the helpers only iterate / take len()
# the second check uses exact float equality: the expected value adds the numbers in the same order as the list
assert get_sentence_max_value(['a','aardvark','abacus'], aoa_df_w_index, 'Freq_pm') == 20415.27
assert get_sentence_avg_value(['a','aardvark','abacus'], aoa_df_w_index, 'Freq_pm') == ((20415.27 + 0.41 + 0.24) / 3)

In [None]:
%%time
# this takes a while, like 9-10 minutes... but you only have to do it once

# for each AoA metric in `search_strings`, record the largest value among every sentence's matched words

search_strings = ['Nletters', 'Nphon', 'Nsyll', 'AoA_Kup', 'Perc_known', 'AoA_Kup_lem']
aoa_overlap_list = X['aoa_overlap'].to_list()
metrics_dict = {}

for search_string in search_strings:
    # one value per sentence, in the same order as the rows of X
    current_vals_list = []
    for idx, overlap_set in enumerate(aoa_overlap_list):
        if not idx % 100000:
            # coarse progress marker
            print(f'Finished word {idx}, search string {search_string}')
        current_vals_list.append(
            get_sentence_max_value(word_set=overlap_set, ref_df=aoa_df_w_index, search_value=search_string)
        )

    metrics_dict[search_string + '_max'] = current_vals_list
    

In [None]:
# turn the dict into a df (one column per `<metric>_max` key)
aoa_max_metrics_df = pd.DataFrame.from_dict(metrics_dict)
# should have the same number of rows as X (one row per sentence)
print(f'Aoa Max Metrics DF shape: {aoa_max_metrics_df.shape}')
aoa_max_metrics_df.head()

In [None]:
X.shape

In [None]:
len(set(X.index))

In [None]:
# add the df to the existing X df
# X = X.merge(aoa_max_metrics_df, left_index=True, right_index=True)
X = pd.concat([X, aoa_max_metrics_df], axis=1)
print(X.shape)

In [None]:
X.head()

In [None]:
%%time
# same as above, just done in a new cell for convenience/clarity
# this also takes a while, like 6-7 minutes...

# for each AoA metric in `search_strings`, record the mean value over every sentence's matched words

search_strings = ['Nletters', 'Nphon', 'Nsyll', 'AoA_Kup', 'Perc_known', 'AoA_Kup_lem']
aoa_overlap_list = X['aoa_overlap'].to_list()
metrics_dict = {}

for search_string in search_strings:
    # one value per sentence, in the same order as the rows of X
    current_vals_list = []
    for idx, overlap_set in enumerate(aoa_overlap_list):
        if not idx % 100000:
            # coarse progress marker
            print(f'Finished word {idx}, search string {search_string}')
        current_vals_list.append(
            get_sentence_avg_value(word_set=overlap_set, ref_df=aoa_df_w_index, search_value=search_string)
        )

    metrics_dict[search_string + '_avg'] = current_vals_list

In [None]:
# turn the dict of per-sentence averages into a df (one column per `<metric>_avg` key)
aoa_avg_metrics_df = pd.DataFrame.from_dict(metrics_dict)
# should have the same number of rows as X (one row per sentence)
# label fixed: this is the avg frame, the old message called it "Max"
print(f'Aoa Avg Metrics DF shape: {aoa_avg_metrics_df.shape}')
aoa_avg_metrics_df.head()

In [None]:
X = pd.concat([X, aoa_avg_metrics_df], axis=1)
print(X.shape)

In [None]:
# the overlap sets were only needed to build the AoA max/avg columns;
# reassigning with errors='ignore' (instead of an in-place drop) keeps the cell safe to re-run
X = X.drop(columns=['aoa_overlap'], errors='ignore')

In [None]:
# read in the concreteness file
concreteness_path = os.path.join(project_base, 'data', 'Concreteness_ratings_Brysbaert_et_al_BRM.txt')
concreteness_df = pd.read_csv(concreteness_path, sep='\t')
concreteness_df.fillna(0, inplace=True)
concreteness_df.head()

In [None]:
conc_df_w_index = concreteness_df.set_index(['Word'])

In [None]:
conc_word_set = set(concreteness_df.Word.values)

In [None]:
X['conc_overlap'] = X.sentence_word_list.apply(lambda a: set(a).intersection(conc_word_set))

In [None]:
%%time
# pretty fast, only takes 5-6 minutes

# for each concreteness metric in `search_strings`, record the largest value among every sentence's matched words

search_strings = ['Bigram', 'Conc.M', 'Conc.SD', 'Unknown', 'Total', 'Percent_known', 'SUBTLEX']
conc_overlap_list = X['conc_overlap'].to_list()
metrics_dict = {}

for search_string in search_strings:
    # one value per sentence, in the same order as the rows of X
    current_vals_list = []
    for idx, overlap_set in enumerate(conc_overlap_list):
        if not idx % 100000:
            # coarse progress marker
            print(f'Finished word {idx}, search string {search_string}')
        current_vals_list.append(
            get_sentence_max_value(word_set=overlap_set, ref_df=conc_df_w_index, search_value=search_string)
        )

    metrics_dict[search_string + '_max'] = current_vals_list

In [None]:
conc_max_metrics_df = pd.DataFrame.from_dict(metrics_dict)
# should have same number of rows as words in X
print(f'Conc Max Metrics DF shape: {conc_max_metrics_df.shape}')
conc_max_metrics_df.head()

In [None]:

X = pd.concat([X, conc_max_metrics_df], axis=1)
print(X.shape)

In [None]:
%%time
# also pretty fast, only takes 1-2 minutes

# finds the avg value in the sentence of each metric listed in `search_strings`
# NOTE: this loop has to call `get_sentence_avg_value`. Calling the max helper here
# would make every `<metric>_avg` column an exact copy of its `<metric>_max` column.

search_strings = ['Bigram', 'Conc.M', 'Conc.SD', 'Unknown', 'Total', 'Percent_known', 'SUBTLEX']
metrics_dict = {}
conc_overlap_list = X['conc_overlap'].to_list()

for search_string in search_strings:
    # one value per sentence, in the same order as the rows of X
    current_vals_list = []
    for idx, overlap_set in enumerate(conc_overlap_list):
        if idx % 100000 == 0:
            print(f'Finished word {idx}, search string {search_string}')
        # avg vals
        current_val = get_sentence_avg_value(word_set=overlap_set, ref_df=conc_df_w_index, search_value=search_string)
        current_vals_list.append(current_val)

    current_dict_key = search_string + '_avg'
    metrics_dict[current_dict_key] = current_vals_list

In [None]:
conc_avg_metrics_df = pd.DataFrame.from_dict(metrics_dict)
# should have same number of rows as words in X
print(f'Conc Avg Metrics DF shape: {conc_avg_metrics_df.shape}')
conc_avg_metrics_df.head()

In [None]:
X = pd.concat([X, conc_avg_metrics_df], axis=1)
print(X.shape)

In [None]:
# remove the columns used for feature generation, but are not actually features
# `original_text` is kept on purpose: the word-vector section further down tokenizes it
# reassigning with errors='ignore' (instead of an in-place drop) keeps the cell safe to re-run
X = X.drop(columns=['sentence_word_list', 'conc_overlap'], errors='ignore')
X.head()

In [None]:
kaggle_X = X.iloc[-kaggle_full_df.shape[0]:,:].copy()
kaggle_y = y.iloc[-kaggle_full_df.shape[0]:].copy()
print(kaggle_X.shape)
print(kaggle_y.shape)
assert kaggle_X.shape[0] == kaggle_full_df.shape[0]

In [None]:
X = X.iloc[:-kaggle_full_df.shape[0],:].copy()
print(X.shape)
y = y.iloc[:-kaggle_full_df.shape[0]].copy()
print(y.shape)

In [None]:
# save the df to disk, next time we will just read it so we dont have to wait for all the processing
X.to_csv(os.path.join(project_base, 'data', 'cleaned_data', 'X_updated.csv'), index=False)
y.to_csv(os.path.join(project_base, 'data', 'cleaned_data', 'y_updated.csv'), index=False)
kaggle_X.to_csv(os.path.join(project_base, 'data', 'cleaned_data', 'kaggle_X_updated.csv'), index=False)
kaggle_y.to_csv(os.path.join(project_base, 'data', 'cleaned_data', 'kaggle_y_updated.csv'), index=False)

### Skip To Here If Feature Creation Complete

<a id='load_x'></a>

In [None]:
# read the dfs in
# note: `read_csv` returns DataFrames, so `y` and `kaggle_y` come back as single-column
# frames here (they were Series when they were written out above)

X = pd.read_csv(os.path.join(project_base, 'data', 'cleaned_data','X_updated.csv'))
y = pd.read_csv(os.path.join(project_base, 'data', 'cleaned_data','y_updated.csv'))
kaggle_X = pd.read_csv(os.path.join(project_base, 'data', 'cleaned_data','kaggle_X_updated.csv'))
kaggle_y = pd.read_csv(os.path.join(project_base, 'data', 'cleaned_data','kaggle_y_updated.csv'))

In [None]:
print(kaggle_X.shape)

### Add Word Vector Features

In [None]:
import nltk
from nltk.corpus import stopwords
import re
import gensim
import numpy as np

In [None]:
gensim.__version__

In [None]:
# Tokenize the training sentences followed by the kaggle sentences, in that order.
# The cells below slice the resulting word-vector matrix at len(X): rows before that index
# are joined to X and the rows after it to kaggle_X, so both sets must be present here.
# (The name `tokenized_train_items` is kept because the following cell references it.)

# Build the stop-word lookup under its own name: rebinding `stopwords` would shadow the nltk
# corpus object and make this cell fail on a second run. A set gives O(1) membership tests.
english_stopwords = set(stopwords.words('english'))

all_sentences = pd.concat([X['original_text'], kaggle_X['original_text']], ignore_index=True)

# raw-string pattern avoids the invalid-escape warning that '\w' triggers in a normal literal
tokenized_train_items = [
    [token for token in re.findall(r'\w+', sentence) if token.lower() not in english_stopwords]
    for sentence in tqdm(all_sentences)
]

In [None]:
model_wp = gensim.models.KeyedVectors.load(os.path.join(project_base, 'models','wikipedia.100.word-vecs.kv'))

In [None]:
def generate_dense_features(tokenized_texts, word_vectors):
    """Average the word vectors of every tokenized document into one dense row.

    Parameters
    ----------
    tokenized_texts : iterable of iterables of str
        One token sequence per document.
    word_vectors : object
        Either a bare keyed-vectors object (indexable by word, raising KeyError for
        unknown words, with a `vector_size` attribute) or a model that exposes such
        an object on its `.wv` attribute.

    Returns
    -------
    np.ndarray
        Array of shape (number of documents, vector_size). A document with no known
        word gets an all-zero row.
    """
    # A KeyedVectors object loaded directly has no `.wv` alias in gensim >= 4, while a full
    # model keeps its vectors on `.wv`; accept both instead of assuming one of them.
    vectors = getattr(word_vectors, 'wv', word_vectors)
    # take the dimensionality from the argument, not from a notebook-level global
    vector_size = vectors.vector_size

    results = []
    for doc in tokenized_texts:
        current_vec = []
        for word in doc:
            try:
                current_vec.append(vectors[word])
            except KeyError:
                # out-of-vocabulary word: leave it out of the average
                continue
        if len(current_vec) == 0:
            results.append(np.zeros(vector_size))
        else:
            results.append(np.array(current_vec).mean(axis=0))

    if not results:
        # keep the result 2-D even when there are no documents
        return np.empty((0, vector_size))
    return np.array(results)

In [None]:
X_train_wp = generate_dense_features(tokenized_train_items, model_wp)

In [None]:
np.save(os.path.join(project_base, 'data', 'cleaned_data','average_word_vecs'), X_train_wp)

### Skip to Here if WV Features Already Created

In [None]:
avg_wvs = np.load(os.path.join(project_base, 'data', 'cleaned_data','average_word_vecs.npy'))

Create data with full 100-dimensional WV features

In [None]:
# one column per word-vector dimension, taken from the array instead of hard-coding 100
wv_cols = [f'wv_col_{i}' for i in range(avg_wvs.shape[1])]
# rows [0, len(X)) are joined to X here and the remaining rows to kaggle_X in the next cell,
# so the matrix must hold exactly one row per sentence of both frames
assert avg_wvs.shape[0] == len(X) + len(kaggle_X), 'avg_wvs rows do not match len(X) + len(kaggle_X)'
X_full_wvs = pd.concat([X, pd.DataFrame(avg_wvs[:len(X), :], columns = wv_cols)], axis=1)
print(f'Full Df with word vectors shape: {X_full_wvs.shape}')

In [None]:
X_full_kaggle_wvs = pd.concat([kaggle_X, pd.DataFrame(avg_wvs[len(X):, :], columns = wv_cols)], axis=1)
print(f'Full Df with word vectors shape: {X_full_kaggle_wvs.shape}')

Reduced Vectors via PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA()
pca.fit(avg_wvs)

In [None]:
for i in range(1, 101):
    print(f'{i} Components explain {pca.explained_variance_ratio_[:i].sum()*100:.2f}% variance')

In [None]:
# number of principal components kept for the reduced word-vector features
pca_components = 30
# a fixed random_state makes the fit repeatable when scikit-learn selects a randomized SVD solver
# NOTE(review): the PCA is fit on every row of avg_wvs, which the cells above treat as training
# rows followed by kaggle rows - confirm that fitting on the evaluation rows is intended
pca = PCA(n_components=pca_components, random_state=RANDOM_SEED)
pca.fit(avg_wvs)

In [None]:
wv_pca_cols = [f'wv_pca_cols{i}' for i in range(pca_components)]
X_pca_wvs = pd.concat([X, pd.DataFrame(pca.transform(avg_wvs[:len(X), :]), columns = wv_pca_cols)], axis=1)
print(f'Full Df with word vectors shape: {X_pca_wvs.shape}')

In [None]:
# project the kaggle rows (everything after the first len(X) rows) onto the fitted components
X_pca_kaggle_wvs = pd.concat([kaggle_X, pd.DataFrame(pca.transform(avg_wvs[len(X):, :]), columns = wv_pca_cols)], axis=1)
# report the frame built in this cell (the old line printed the full 100-dim kaggle frame instead)
print(f'Full Df with word vectors shape: {X_pca_kaggle_wvs.shape}')

In [None]:
# save the data
X_pca_wvs.to_csv(os.path.join(project_base, 'data', 'cleaned_data', 'X_pca_updated.csv'), index=False)
X_pca_kaggle_wvs.to_csv(os.path.join(project_base, 'data', 'cleaned_data', 'kaggle_X_pca_updated.csv'), index=False)