# LFTK Feature Extraction 

By: Jimuel Celeste, Jr. 

Objective: To demonstrate how to extract linguistic features from a single text transcript using LFTK on top of a spaCy pipeline.

In [1]:
import lftk 
import luigi
import pandas as pd
import spacy 



In [2]:
# Load spaCy's small English pipeline; the Doc it produces is what gets handed
# to lftk.Extractor further down.
nlp = spacy.load("en_core_web_sm")

# NOTE(review): absolute, machine-specific path — point this at your own copy
# of the transcript before running on another machine.
sample_file = '/Users/jimuelcelestejr/Downloads/Dataset/Thesis - Text Transcripts/taukdial-169-3.txt'

# Read with an explicit encoding so decoding does not depend on the platform's
# locale default.
with open(sample_file, "r", encoding="utf-8") as f:
    text = f.read()

# Parse after the file has been closed; the handle is not needed by the pipeline.
doc = nlp(text)
doc

 trouble okay but this one tells a story and i'd like you to look at everything that's happening and tell me a story about what you see with a beginning middle and an end a little girl sees a cat in a tree um our father our element

In [5]:
# Search criteria passed to LFTK: wildcard domain and family, language set to
# 'general', and results returned as a flat list of feature keys.
feature_query = {
    'domain': '*',
    'family': '*',
    'language': 'general',
    'return_format': 'list_key',
}

# Keys of every feature matching the criteria; reused by the extraction step.
features_list = lftk.search_features(**feature_query)
features_list

['t_word',
 't_stopword',
 't_punct',
 't_uword',
 't_sent',
 't_char',
 'a_word_ps',
 'a_char_ps',
 'a_char_pw',
 't_n_ent',
 'a_n_ent_pw',
 'a_n_ent_ps',
 'simp_adj_var',
 'simp_adp_var',
 'simp_adv_var',
 'simp_aux_var',
 'simp_cconj_var',
 'simp_det_var',
 'simp_intj_var',
 'simp_noun_var',
 'simp_num_var',
 'simp_part_var',
 'simp_pron_var',
 'simp_propn_var',
 'simp_punct_var',
 'simp_sconj_var',
 'simp_sym_var',
 'simp_verb_var',
 'simp_space_var',
 'root_adj_var',
 'root_adp_var',
 'root_adv_var',
 'root_aux_var',
 'root_cconj_var',
 'root_det_var',
 'root_intj_var',
 'root_noun_var',
 'root_num_var',
 'root_part_var',
 'root_pron_var',
 'root_propn_var',
 'root_punct_var',
 'root_sconj_var',
 'root_sym_var',
 'root_verb_var',
 'root_space_var',
 'corr_adj_var',
 'corr_adp_var',
 'corr_adv_var',
 'corr_aux_var',
 'corr_cconj_var',
 'corr_det_var',
 'corr_intj_var',
 'corr_noun_var',
 'corr_num_var',
 'corr_part_var',
 'corr_pron_var',
 'corr_propn_var',
 'corr_punct_var',
 'cor

In [6]:
# Bind an LFTK extractor to the parsed transcript.
extractor = lftk.Extractor(docs=doc)

# Compute every feature key collected in features_list; the result is a dict
# mapping feature key -> value.
features = extractor.extract(
    features=features_list,
)

In [7]:
# Display the extracted features (dict of feature key -> computed value).
features

{'t_word': 51,
 't_stopword': 30,
 't_punct': 0,
 't_uword': 38,
 't_sent': 4,
 't_char': 184,
 'a_word_ps': 12.75,
 'a_char_ps': 46.0,
 'a_char_pw': 3.608,
 't_n_ent': 0,
 'a_n_ent_pw': 0.0,
 'a_n_ent_ps': 0.0,
 'simp_adj_var': 1.0,
 'simp_adp_var': 1.0,
 'simp_adv_var': 0,
 'simp_aux_var': 1.0,
 'simp_cconj_var': 0.5,
 'simp_det_var': 0.375,
 'simp_intj_var': 1.0,
 'simp_noun_var': 0.917,
 'simp_num_var': 0,
 'simp_part_var': 1.0,
 'simp_pron_var': 0.667,
 'simp_propn_var': 0,
 'simp_punct_var': 0,
 'simp_sconj_var': 0,
 'simp_sym_var': 0,
 'simp_verb_var': 0.714,
 'simp_space_var': 1.0,
 'root_adj_var': 1.0,
 'root_adp_var': 2.0,
 'root_adv_var': 0,
 'root_aux_var': 1.414,
 'root_cconj_var': 1.0,
 'root_det_var': 1.061,
 'root_intj_var': 1.414,
 'root_noun_var': 3.175,
 'root_num_var': 0,
 'root_part_var': 1.0,
 'root_pron_var': 2.0,
 'root_propn_var': 0,
 'root_punct_var': 0,
 'root_sconj_var': 0,
 'root_sym_var': 0,
 'root_verb_var': 1.89,
 'root_space_var': 1.0,
 'corr_adj_var': 