<a href="https://colab.research.google.com/github/jon-chun/sentimenttime/blob/main/sentimenttime_part0_text_features_alpha.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **DTW Clustering with dtaidistance**

* https://github.com/wannesm/dtaidistance

# **Setup**

In [None]:
# !git clone https://github.com/alan-turing-institute/sktime.git

In [None]:
# %cd sktime

In [None]:
# !pip install --editable .

In [None]:
# Missing Transformers

!pip install sktime[all_extras]

In [None]:
!pip install tsfresh

In [None]:
!pip install dtaidistance[all]

In [None]:
%matplotlib inline

In [None]:
import random
import array

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
pd.set_option('display.width',1000)
pd.set_option('max_colwidth', 1000) 

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
# English stop-word set from NLTK with the negations "not" and "no" removed
# from it, so those two words are never treated as stop words downstream.
stopwords_st = set(stopwords.words('english')) - {'not', 'no'}

In [None]:
from IPython.display import Image

In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [None]:
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (30,10)

# **Read Data**

In [None]:
# Connect to Google gDrive

# Flag to indicate first run through code
flag_first_run = True

# Colab-only module: `drive` performs the mount below; `files` is imported but
# not used in this cell.
from google.colab import drive, files
# Mount Drive at /gdrive and make the Drive root the working directory.
drive.mount('/gdrive')
%cd /gdrive/MyDrive/

In [None]:
gdrive_subdir = "./research/2021/sa_book_code/books_sa/cdickens_greatexpectations" #@param {type:"string"}


In [None]:
flag_first_run = True

CORPUS_FULL = 'Great Expectations by Charles Dickens (1861)'
CORPUS_SUBDIR = gdrive_subdir
corpus_filename = CORPUS_SUBDIR

# Change to working subdirectory
if flag_first_run == True:
  full_path_str = gdrive_subdir
  flag_first_run = False
else:
  full_path_str = f'/gdrive/MyDrive{gdrive_subdir[1:]}'

%cd $full_path_str

In [None]:
!ls -altr *.csv

In [None]:
corpus_sents_df = pd.read_csv('corpus_text_sents_raw_cdickens_greatexpectations.csv')

In [None]:
corpus_sents_df.rename(columns={'Unnamed: 0':'sent_no'}, inplace=True)

In [None]:
corpus_unified_df = pd.read_csv('sum_sentiments_all31_sents_cdickens_cdickens_greatexpectations.csv')
corpus_unified_df.drop(columns=['Unnamed: 0'],inplace=True)

In [None]:
corpus_unified_df.head(2)
corpus_unified_df.info()

# **Descriptive Statistics: Linguistic**

* https://github.com/Perevalov/LinguaF
* https://github.com/LSYS/LexicalRichness

In [None]:
!pip install pip install linguaf

### **General Counts**

In [None]:
!pip install lexicalrichness

In [None]:
corpus_sents_df.head()
corpus_sents_df.info()

In [None]:
# Sentence length in characters, one value per row of `sent_raw`.
corpus_sents_df['len_char'] = corpus_sents_df['sent_raw'].map(len)

In [None]:
# Plain-text rendering of describe() for the character-length column.
sum_stat_str = str(corpus_sents_df['len_char'].describe())
print(sum_stat_str)
print('\n')

# Compact version for the plot annotation: drop the trailing
# "Name: ..., dtype: ..." line and cut every remaining line at its first '.',
# which truncates (does not round) the decimals.
stat_lines = sum_stat_str.split('\n')[:-1]
truncated_lines = [stat_line.split('.')[0] for stat_line in stat_lines]
stat_len_char_str = '\n'.join(truncated_lines)
print(stat_len_char_str)

In [None]:
# Sentence length (characters): KDE on the left axis, histogram of counts on a
# twin right axis, annotated with the summary statistics computed above.
fig, ax1 = plt.subplots()
sns.kdeplot(data=corpus_sents_df, x="len_char", ax=ax1)
# The x axis is sentence length, so the zoom window is derived from `len_char`
# itself (it was previously taken from `sent_no`, the row index, which has no
# relation to the plotted quantity). The 99th percentile trims the long right
# tail of very long sentences.
ax1.set_xlim((0, corpus_sents_df["len_char"].quantile(0.99)))
ax2 = ax1.twinx()
sns.histplot(data=corpus_sents_df, x="len_char", discrete=True, ax=ax2)
ax2.set_title(f'{CORPUS_FULL}\nSentence Length (chars) Histogram')
ax2.text(.9, 0.8,f'Descriptive Statistics:\n-----------------------\n {stat_len_char_str}', fontsize=12, ha='center', va='center', transform = ax2.transAxes);

In [None]:
# Sentence length in whitespace-separated tokens.
corpus_sents_df['len_word'] = corpus_sents_df['sent_raw'].map(lambda sent: len(sent.split()))

# Plain-text rendering of describe() for the word-length column.
sum_stat_str = str(corpus_sents_df['len_word'].describe())
print(sum_stat_str)
print('\n')

# Compact version for the plot annotation: drop the trailing
# "Name: ..., dtype: ..." line and cut every remaining line at its first '.'.
stat_lines = sum_stat_str.split('\n')[:-1]
truncated_lines = [stat_line.split('.')[0] for stat_line in stat_lines]
stat_len_word_str = '\n'.join(truncated_lines)
print(stat_len_word_str)

In [None]:
# Sentence length (words): KDE on the left axis, histogram of counts on a twin
# right axis, annotated with the word-length summary statistics.
# This cell previously plotted `len_char` under a "(words)" title; it now plots
# the `len_word` column that the title and the statistics box describe.
fig, ax1 = plt.subplots()
sns.kdeplot(data=corpus_sents_df, x="len_word", ax=ax1)
# Zoom window derived from the plotted column (not from `sent_no`, the row
# index); the 99th percentile trims the tail of very long sentences.
ax1.set_xlim((0, corpus_sents_df["len_word"].quantile(0.99)))
ax2 = ax1.twinx()
sns.histplot(data=corpus_sents_df, x="len_word", discrete=True, ax=ax2)
ax2.set_title(f'{CORPUS_FULL}\nSentence Length (words) Histogram')
ax2.text(.9, 0.8,f'Descriptive Statistics:\n-----------------------\n {stat_len_word_str}', fontsize=12, ha='center', va='center', transform = ax2.transAxes);

In [None]:
corpus_sents_df.head()

### **Library: LexicalRichness**

* https://github.com/LSYS/LexicalRichness (20210815 26s)

In [None]:
from lexicalrichness import LexicalRichness

In [None]:
corpus_raw_str = ' '.join(corpus_sents_df['sent_raw'])
corpus_raw_str[:500]

In [None]:
# Lexical-richness metrics for the whole corpus, computed on the single string
# built by joining every raw sentence.
lex = LexicalRichness(corpus_raw_str)

# Return word count.
corpus_word_ct = lex.words
print(f'corpus_word_ct: {corpus_word_ct}')

# Return (unique) word count.
# (Variable name kept as-is, including the "_cd" suffix, for later cells.)
corpus_unique_word_cd = lex.terms
print(f'corpus_unique_word_cd: {corpus_unique_word_cd}')

# Return type-token ratio (TTR) of text.
corpus_ttr = lex.ttr
print(f'corpus_ttr: {corpus_ttr}')

# Return root type-token ratio (RTTR) of text.
corpus_rttr = lex.rttr
print(f'corpus_rttr: {corpus_rttr}')

# Return corrected type-token ratio (CTTR) of text.
# Stored under its own name: it used to be assigned to `corpus_rttr`, which
# silently replaced the RTTR value and printed it with the wrong label.
corpus_cttr = lex.cttr
print(f'corpus_cttr: {corpus_cttr}')

# Return mean segmental type-token ratio (MSTTR).
corpus_msttr = lex.msttr(segment_window=25)
print(f'corpus_msttr: {corpus_msttr}')

# Return moving average type-token ratio (MATTR).
corpus_mattr = lex.mattr(window_size=25)
print(f'corpus_mattr: {corpus_mattr}')

# Return Measure of Textual Lexical Diversity (MTLD).
corpus_mtld = lex.mtld(threshold=0.72)
print(f'corpus_mtld: {corpus_mtld}')

# Return hypergeometric distribution diversity (HD-D) measure.
corpus_hdd = lex.hdd(draws=42)
print(f'corpus_hdd: {corpus_hdd}')


### **Library: LinguaF**

* https://github.com/Perevalov/LinguaF (20210614 2s)

In [None]:
# NOTE(review): the alias `ds` is rebound further down the notebook
# (`ds = dtw.distance_matrix_fast(series)`), so this cell must be re-run before
# any later use of the LinguaF module through `ds`.
from linguaf import descriptive_statistics as ds

# LinguaF receives one string per row of `sent_raw`.
documents = list(corpus_sents_df['sent_raw'])

# --- Raw counts over all documents ---
corpus_char_ct = ds.char_count(documents)
print(f'corpus_char_ct: {corpus_char_ct}')

corpus_letter_ct = ds.letter_count(documents)
print(f'corpus_letter_ct: {corpus_letter_ct}')

corpus_punct_ct = ds.punctuation_count(documents)
print(f'corpus_punct_ct: {corpus_punct_ct}')

corpus_digit_ct = ds.digit_count(documents)
print(f'corpus_digit_ct: {corpus_digit_ct}')

corpus_syllable_ct = ds.syllable_count(documents)
print(f'corpus_syllable_ct: {corpus_syllable_ct}')

# Sentence count is the number of rows, not a LinguaF call.
corpus_sent_ct = len(documents)
print(f'corpus_sent_ct: {corpus_sent_ct}')

# --- Averages ---
corpus_avg_syllable_word = ds.avg_syllable_per_word(documents)
print(f'corpus_avg_syllable_word: {corpus_avg_syllable_word}')

corpus_avg_word_len = ds.avg_word_length(documents)
print(f'corpus_avg_word_len: {corpus_avg_word_len}')

corpus_avg_sent_len = ds.avg_sentence_length(documents)
print(f'corpus_avg_sent_len: {corpus_avg_sent_len}')

corpus_avg_word_sent = ds.avg_words_per_sentence(documents)
print(f'corpus_avg_word_sent: {corpus_avg_word_sent}')

# NOTE(review): same call as `corpus_avg_syllable_word` above, so this
# recomputes that value under a second name.
corpus_avg_syllable_ct = ds.avg_syllable_per_word(documents)
print(f'corpus_avg_syllable_ct: {corpus_avg_syllable_ct}')

"""
    Number of characters char_count
    Number of letters letter_count
    Number of punctuation characters punctuation_count
    Number of digits digit_count
    Number of syllables syllable_count
    Number of sentences sentence_count
    Number of n-syllable words number_of_n_syllable_words
    Average syllables per word avg_syllable_per_word
    Average word length avg_word_length
    Average sentence length avg_sentence_length
    Average words per sentence avg_words_per_sentence

Additional methods:

    Get lexical items (nouns, adjectives, verbs, adverbs) get_lexical_items
    Get n-grams get_ngrams
    Get sentences get_sentences
    Get words get_words
    Tokenize tokenize
    Remove punctuation remove_punctuation
    Remove digits remove_digits
""";


In [None]:
%%time

# NOTE: 7987s (>>1hr)
# Per the timing note above this call ran for over two hours; consider
# skipping this cell on a quick pass through the notebook.

from linguaf import syntactical_complexity as sc

# Mean dependency distance over all sentence strings in `documents`.
corpus_mdd_complexity = sc.mean_dependency_distance(documents)
print(f'corpus_mdd_complexity: {corpus_mdd_complexity}')

In [None]:
%time

# NOTE: 23s

from linguaf import lexical_diversity as ld

corpus_lexical_density = ld.lexical_density(documents)
print(f'corpus_lexical_density: {corpus_lexical_density}')

corpus_type_token_ratio = ld.type_token_ratio(documents)
print(f'corpus_type_token_ratio: {corpus_type_token_ratio}')

corpus_log_type_token_ratio = ld.log_type_token_ratio(documents)
print(f'corpus_log_type_token_ratio: {corpus_log_type_token_ratio}')

corpus_summer_index = ld.summer_index(documents)
print(f'corpus_summer_index: {corpus_summer_index}')

corpus_root_type_token_ratio = ld.root_type_token_ratio(documents)
print(f'corpus_root_type_token_ratio: {corpus_root_type_token_ratio}')

"""


    Lexical Density (LD) lexical_density
    Type Token Ratio (TTR) type_token_ratio
    Herdan's Constant or Log Type Token Ratio (LogTTR) log_type_token_ratio
    Summer's Index summer_index
    Root Type Token Ratio (RootTTR) root_type_token_ratio


""";

In [None]:
from linguaf import readability as r

# NOTE: 74s
# LinguaF readability scores over all sentence strings in `documents`.
# NOTE(review): `corpus_flesch_reading_ease` and
# `corpus_automated_readability_index` are assigned again in the textstat cell
# further down, which replaces the values computed here.

corpus_flesch_reading_ease = r.flesch_reading_ease(documents)
print(f'corpus_flesch_reading_ease: {corpus_flesch_reading_ease}')

# Despite the short name, this holds the Flesch-Kincaid *grade*.
corpus_read_flesch = r.flesch_kincaid_grade(documents)
print(f'corpus_read_flesch: {corpus_read_flesch}')

corpus_automated_readability_index = r.automated_readability_index(documents)
print(f'corpus_automated_readability_index: {corpus_automated_readability_index}')

corpus_automated_readability_index_simple = r.automated_readability_index_simple(documents)
print(f'corpus_automated_readability_index_simple: {corpus_automated_readability_index_simple}')

corpus_coleman_readability = r.coleman_readability(documents)
print(f'corpus_coleman_readability: {corpus_coleman_readability}')

corpus_easy_listening = r.easy_listening(documents)
print(f'corpus_easy_listening: {corpus_easy_listening}')

"""


    Flesch Reading Ease (FRE) flesch_reading_ease
    Flesch-Kincaid Grade (FKG) flesch_kincaid_grade
    Automated Readability Index (ARI) automated_readability_index
    Simple Automated Readability Index (sARI) automated_readability_index_simple
    Coleman's Readability Score coleman_readability
    Easy Listening Score easy_listening

""";

### **Library: textstat**

* https://github.com/shivam5992/textstat (20210816 689s) 

In [None]:
!pip install textstat

In [None]:
import textstat

In [None]:
test_data = (
    "Playing games has always been thought to be important to "
    "the development of well-balanced and creative children; "
    "however, what part, if any, they should play in the lives "
    "of adults has never been researched that deeply. I believe "
    "that playing games is every bit as important for adults "
    "as for children. Not only is taking time out to play games "
    "with our children and other adults valuable to building "
    "interpersonal relationships but is also a wonderful way "
    "to release built up tension."
)

type(test_data)

In [None]:
test_data[:75]

In [None]:
corpus_raw_str[:500]

In [None]:
%%time

# NOTE:
# Runs the textstat readability metrics on the full corpus string.
# NOTE(review): `corpus_flesch_reading_ease` and
# `corpus_automated_readability_index` were already assigned in the LinguaF
# readability cell; running this cell replaces those values.

# Rebinds `test_data` from the sample paragraph defined above to the corpus.
test_data = corpus_raw_str

corpus_flesch_reading_ease = textstat.flesch_reading_ease(test_data)
print(f'corpus_flesch_reading_ease: {corpus_flesch_reading_ease}')

corpus_flesch_kincaid_grade = textstat.flesch_kincaid_grade(test_data)
print(f'corpus_flesch_kincaid_grade: {corpus_flesch_kincaid_grade}')

corpus_smog_index = textstat.smog_index(test_data)
print(f'corpus_smog_index: {corpus_smog_index}')

corpus_coleman_liau_index = textstat.coleman_liau_index(test_data)
print(f'corpus_coleman_liau_index: {corpus_coleman_liau_index}')

corpus_automated_readability_index = textstat.automated_readability_index(test_data)
print(f'corpus_automated_readability_index: {corpus_automated_readability_index}')

corpus_dale_chall_readability_score = textstat.dale_chall_readability_score(test_data)
print(f'corpus_dale_chall_readability_score: {corpus_dale_chall_readability_score}')

corpus_difficult_words = textstat.difficult_words(test_data)
print(f'corpus_difficult_words: {corpus_difficult_words}')

corpus_linsear_write_formula = textstat.linsear_write_formula(test_data)
print(f'corpus_linsear_write_formula: {corpus_linsear_write_formula}')

corpus_gunning_fog = textstat.gunning_fog(test_data)
print(f'corpus_gunning_fog: {corpus_gunning_fog}')

corpus_text_standard = textstat.text_standard(test_data)
print(f'corpus_text_standard: {corpus_text_standard}')

# NOTE(review): the textstat README appears to list the remaining metrics as
# language-specific (Spanish: fernandez_huerta, szigriszt_pazos,
# gutierrez_polini, crawford; Italian: gulpease_index; Arabic: osman) —
# confirm whether they are meaningful for this English corpus.
corpus_fernandez_huerta = textstat.fernandez_huerta(test_data)
print(f'corpus_fernandez_huerta: {corpus_fernandez_huerta}')

corpus_szigriszt_pazos = textstat.szigriszt_pazos(test_data)
print(f'corpus_szigriszt_pazos: {corpus_szigriszt_pazos}')

corpus_gutierrez_polini = textstat.gutierrez_polini(test_data)
print(f'corpus_gutierrez_polini: {corpus_gutierrez_polini}')

corpus_crawford = textstat.crawford(test_data)
print(f'corpus_crawford: {corpus_crawford}')

corpus_gulpease_index = textstat.gulpease_index(test_data)
print(f'corpus_gulpease_index: {corpus_gulpease_index}')

corpus_osmane = textstat.osman(test_data)
print(f'corpus_osmane: {corpus_osmane}')

### **Library: NLTK**

### **Library: PyLex**

* https://github.com/techcentaur/PyLex (20180606 58s)

# **Feature Extraction: Time Series**

## **Simple Summary Statistics**

In [None]:
corpus_unified_df.info()

In [None]:

sns.set(rc = {'figure.figsize':(20,10)})
# plt.figure(figsize = (30,10))

In [None]:
# sns.set(rc={'figure.figsize':(30,10)})
p = sns.displot(corpus_unified_df, x='baseline_sentimentr_stdscaler_roll10', kind='kde');

In [None]:
from scipy.stats import norm

# Density histogram of the smoothed SentimentR series with a fitted normal
# curve overlaid. This replaces the deprecated `sns.distplot(..., fit=norm,
# kde=False)` call with the axes-level `histplot` plus an explicit fit.
sentiment_ser = corpus_unified_df["baseline_sentimentr_stdscaler_roll10"].dropna()

fig, ax = plt.subplots()
sns.histplot(x=sentiment_ser, stat="density", ax=ax)

# Maximum-likelihood normal fit, drawn across the histogram's x range.
fit_mean, fit_std = norm.fit(sentiment_ser)
x_grid = np.linspace(*ax.get_xlim(), 200)
ax.plot(x_grid, norm.pdf(x_grid, fit_mean, fit_std), color="k");

In [None]:
# Distribution of the smoothed SentimentR series: KDE on the left axis and a
# histogram of counts on a twin right axis, annotated with summary statistics.
sentiment_col = "baseline_sentimentr_stdscaler_roll10"

# Statistics of the column actually plotted (the annotation previously reused
# the sentence word-length statistics).
stat_sentiment_str = corpus_unified_df[sentiment_col].describe().round(2).to_string()

fig, ax1 = plt.subplots()
# Axes-level kdeplot draws on ax1. The figure-level `displot` used before
# returns a FacetGrid, which has neither `set_xlim` nor `twinx`, so the cell
# raised AttributeError.
sns.kdeplot(data=corpus_unified_df, x=sentiment_col, ax=ax1)
# No x-limits from `sent_no`: the x axis holds standardized sentiment values,
# so a window starting at the first sentence number would hide negative values.
ax2 = ax1.twinx()
sns.histplot(data=corpus_unified_df, x=sentiment_col, discrete=False, ax=ax2)
ax2.set_title(f'{CORPUS_FULL}\nSentence Sentiment ({sentiment_col}) Histogram')
# Position the box in ax2's own axes coordinates (it used to reference `ax`,
# a leftover from an earlier cell).
ax2.text(.9, 0.9, f'Descriptive Statistics:\n-----------------------\n{stat_sentiment_str}', fontsize=12, ha='center', va='center', transform=ax2.transAxes);

In [None]:
# KDE (left axis) and 100-bin histogram (twin right axis) of the smoothed
# SentimentR series.
fig, ax1 = plt.subplots()
# `displot` is figure-level and does not draw on a supplied `ax`; the
# axes-level `kdeplot` is its equivalent for kind='kde'.
sns.kdeplot(data=corpus_unified_df, x='baseline_sentimentr_stdscaler_roll10', ax=ax1)
# No x-limits from `sent_no`: the x axis holds standardized sentiment values,
# so a window starting at the first sentence number would hide negative values.
ax2 = ax1.twinx()
sns.histplot(data=corpus_unified_df, x="baseline_sentimentr_stdscaler_roll10", bins=100, ax=ax2)
ax2.set_title(f'{CORPUS_FULL}\nSentence Sentiment (baseline_sentimentr_stdscaler_roll10) Histogram');

In [None]:
# Number of sentences for the title. It is defined here because the cell that
# used to set `sent_ct` comes later in the notebook, so a fresh top-to-bottom
# run raised NameError at this point.
sent_ct = corpus_unified_df.shape[0]

# Overlay the two smoothed baseline sentiment series on the same axes.
corpus_unified_df['baseline_sentimentr_stdscaler_roll10'].plot()
corpus_unified_df['baseline_syuzhet_stdscaler_roll10'].plot()
plt.legend(loc='best')
plt.title(f'Great Expectations by Charles Dickens\nDiachronic Sentiment over {sent_ct} Sentences using Standard Scaler + SMA 10%');

In [None]:
# Independent copy of every column whose name contains 'roll10'.
ts_stdscaler_roll_df = corpus_unified_df.filter(like='roll10').copy()
# Row count, i.e. one row per sentence; used in the plot title.
sent_ct = ts_stdscaler_roll_df.shape[0]
# print(f'Time Series Count: {sent_ct}')
# Plot only the columns whose names START with one of the three prefixes.
# NOTE(review): the columns used elsewhere in this notebook start with
# 'baseline_', which this anchored regex does not match — confirm that the
# intended columns are selected.
ts_stdscaler_roll_df.filter(regex='^(sentimentr|syuzhet|transformer)',axis=1).plot()
plt.legend(loc='best')
plt.title(f'Great Expectations by Charles Dickens\nDiachronic Sentiment over {sent_ct} Sentences using Standard Scaler + SMA 10%');

## **Library: TSfresh**

* https://github.com/blue-yonder/tsfresh (20210709 509s) 

In [None]:
# !pip install transformers

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sktime.datasets import load_basic_motions
from sktime.datasets import load_arrow_head
from sktime.transformers.series_as_features.summarize import TSFreshFeatureExtractor
from sktime.forecasting.base import ForecastingHorizon
from sklearn.ensemble import RandomForestRegressor
from sktime.forecasting.compose import ReducedTimeSeriesRegressionForecaster
from sklearn.pipeline import make_pipeline
from sktime.datasets import load_airline
from sktime.forecasting.model_selection import temporal_train_test_split

In [None]:
X, y = load_arrow_head(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [None]:
from tsfresh.examples.robot_execution_failures import download_robot_execution_failures, \
    load_robot_execution_failures
download_robot_execution_failures()
timeseries, y = load_robot_execution_failures()

timeseries.shape

In [None]:
print(timeseries.head())

In [None]:
import matplotlib.pyplot as plt
timeseries[timeseries['id'] == 3].plot(subplots=True, sharex=True, figsize=(10,10))
plt.show();

In [None]:
timeseries[timeseries['id'] == 20].plot(subplots=True, sharex=True, figsize=(10,10))
plt.show();

In [None]:
%%time

# NOTE: 47s

from tsfresh import extract_features
extracted_features = extract_features(timeseries, column_id="id", column_sort="time")

In [None]:
# Extracted features
print(f'Original Feature Count: {timeseries.shape}')

print(f'Extracted Feature Count: {extracted_features.shape}')

In [None]:
%%time

# NOTE: 30s

from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

impute(extracted_features)
features_filtered = select_features(extracted_features, y)

In [None]:
# Relevant features
print(f'Original Feature Count: {timeseries.shape}')

print(f'Relevant Feature Count: {features_filtered.shape}')

In [None]:
from tsfresh.feature_extraction import feature_calculators

In [None]:
%whos DataFrame

In [None]:
corpus_unified_df.fillna(0, inplace=True)

In [None]:
# Series used for the single-feature tsfresh demos below.
ts_model_ser = corpus_unified_df['baseline_sentimentr_stdscaler_roll10']

ts_abs_eng = feature_calculators.abs_energy(ts_model_ser)
print(f'ts_abs_eng: {ts_abs_eng}')

ts_sum_of_changes = feature_calculators.absolute_sum_of_changes(ts_model_ser)
print(f'ts_sum_of_changes: {ts_sum_of_changes}')

# NOTE(review): `acf` and `adfuller` do not look like tsfresh feature
# calculators; they appear to be statsmodels functions that are merely
# reachable through this module's namespace — confirm, or use tsfresh's
# `autocorrelation` / `augmented_dickey_fuller` calculators instead.
ts_acf = feature_calculators.acf(ts_model_ser)
print(f'ts_acf: {ts_acf}')

ts_adfuller = feature_calculators.adfuller(ts_model_ser)
print(f'ts_adfuller: {ts_adfuller}')

# ts_approximate_entropy = feature_calculators.approximate_entropy(ts_model_ser)
# print(f'ts_approximate_entropy: {ts_approximate_entropy}')

# ts_agg_autocorrelation = feature_calculators.agg_autocorrelation(ts_model_ser)
# print(f'ts_agg_autocorrelation: {ts_agg_autocorrelation}')

# ts_agg_linear_trend = feature_calculators.agg_linear_trend(ts_model_ser)
# print(f'agg_linear_trend: {agg_linear_trend}')

# ts_augmented_dickey_fuller = feature_calculators.augmented_dickey_fuller(ts_model_ser)
# print(f'ts_augmented_dickey_fuller: {ts_augmented_dickey_fuller}')



## **Library: sktime**

* https://github.com/alan-turing-institute/sktime (20210818 4.4k)

Our aim is to make the time series analysis ecosystem more interoperable and usable as a whole. sktime provides a unified interface for distinct but related time series learning tasks. It features dedicated time series algorithms and tools for composite model building including pipelining, ensembling, tuning and reduction that enables users to apply an algorithm for one task to another.

sktime also provides interfaces to related libraries, for example scikit-learn, statsmodels, tsfresh, PyOD and fbprophet, among others.

For deep learning, see our companion package: sktime-dl.

### **Library: sktime-dl**

* https://github.com/sktime/sktime-dl (20210812 453s) 

### **Library: dl-4-tsc**

* https://github.com/hfawaz/dl-4-tsc (20200406 872s)

This is the companion repository for our paper titled "Deep learning for time series classification: a review" published in Data Mining and Knowledge Discovery, also available on ArXiv.

## **Library: Darts**

* https://github.com/unit8co/darts (20210818 2.3k)

darts is a Python library for easy manipulation and forecasting of time series. It contains a variety of models, from classics such as ARIMA to deep neural networks. The models can all be used in the same way, using fit() and predict() functions, similar to scikit-learn. The library also makes it easy to backtest models, and combine the predictions of several models and external regressors. Darts supports both univariate and multivariate time series and models. The neural networks can be trained on multiple time series, and some of the models offer probabilistic forecasts.

Currently, the library contains the following features:

Forecasting Models: A large collection of forecasting models; from statistical models (such as ARIMA) to deep learning models (such as N-BEATS). See table of models below.

Data processing: Tools to easily apply (and revert) common transformations on time series data (scaling, boxcox, …)

Metrics: A variety of metrics for evaluating time series' goodness of fit; from R2-scores to Mean Absolute Scaled Error.

Backtesting: Utilities for simulating historical forecasts, using moving time windows.

Regression Models: Possibility to predict a time series from lagged versions of itself and of some external covariate series, using arbitrary regression models (e.g. scikit-learn models).

Multiple series training: All neural networks, as well as RegressionModels (incl. LinearRegressionModel and RandomForest) support being trained on multiple series.

Past and Future Covariates support: Some models support past-observed and/or future-known covariate time series as inputs for producing forecasts.

Multivariate Support: Tools to create, manipulate and forecast multivariate time series.

Probabilistic Support: TimeSeries objects can (optionally) represent stochastic time series; this can for instance be used to get confidence intervals.

Filtering Models: Darts offers three filtering models: KalmanFilter, GaussianProcessFilter, and MovingAverage, which allow to filter time series, and in some cases obtain probabilistic inferences of the underlying states/values.

## **Library: Cesium**

* http://cesium-ml.org/docs/feature_table.html
* http://cesium-ml.org/docs/auto_examples/plot_EEG_Example.html#sphx-glr-auto-examples-plot-eeg-example-py

In [None]:
from cesium import datasets, featurize

# Load the EEG example data set used by the cesium tutorial linked above.
# `eeg` was referenced below without ever being defined, so the cell raised
# NameError. The call downloads the data on first use.
eeg = datasets.fetch_andrzejak()

# Built-in cesium features to compute for every EEG time series.
features_to_use = ["amplitude",
                   "percent_beyond_1_std",
                   "maximum",
                   "max_slope",
                   "median",
                   "median_absolute_deviation",
                   "percent_close_to_median",
                   "minimum",
                   "skew",
                   "std",
                   "weighted_average"]

# One row of features per time series; no measurement errors are supplied.
fset_cesium = featurize.featurize_time_series(times=eeg["times"],
                                              values=eeg["measurements"],
                                              errors=None,
                                              features_to_use=features_to_use)

print(fset_cesium.head())

## **Library: FeatureTools**

* https://featuretools.alteryx.com/en/stable/ (Extract Deep Features)
* https://www.featuretools.com/demos/ (Forecast examples)


In [None]:
import featuretools as ft

In [None]:
data = ft.demo.load_mock_customer()

# **Forecasting**

## **Library: TSLearn**

* https://github.com/tslearn-team/tslearn (20210818 1.8k)

tslearn is a Python package that provides machine learning tools for the analysis of time series. This package builds on (and hence depends on) scikit-learn, numpy and scipy libraries.

## **Library: Informer2020**

* https://github.com/zhouhaoyi/Informer2020 (20210812 1.6k)

Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting (AAAI'21 Best Paper). This is the origin Pytorch implementation of Informer in the following paper: Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting. Special thanks to Jieqi Peng@cookieminions for building this repo.

# **Classification**

## **Library: tsai**

* https://github.com/timeseriesAI/tsai

# **AutoML Time Series**

## **Library: MS NNI**

* https://github.com/microsoft/nni

# **END**

## **Compute Distance Matrix between 2 Series**

In [None]:
from dtaidistance import dtw
from dtaidistance import dtw_visualisation as dtwvis
import numpy as np
# Two short float series of equal length used as a toy example.
s1 = np.array([0., 0, 1, 2, 1, 0, 1, 0, 0, 2, 1, 0, 0])
s2 = np.array([0., 1, 2, 3, 1, 0, 0, 0, 2, 1, 0, 0, 0])
# Compute the warping path between the two series and save the plot of it to
# warp.png, which the next cell displays.
path = dtw.warping_path(s1, s2)
dtwvis.plot_warping(s1, s2, path, filename="warp.png")

In [None]:
Image(filename='warp.png') 

In [None]:
res = [random.randrange(1, 50, 1) for i in range(7)]

In [None]:
# Option #1: pandas

s1 = [0, 0, 1, 2, 1, 0, 1, 0, 0]
s2 = [0, 1, 2, 0, 0, 0, 0, 0, 0]
distance = dtw.distance(s1, s2)
print(distance)

In [None]:
print(dtw.distance.__doc__)

In [None]:
%%timeit

# 100 datapoints: 159ms
# 300 datapoints: 1.5s
# 500 datapoints: 4.3s
# 1k datapoints: 17.7s
# 5k datapoints: ?(1.55s)
# 10k datapoints: >15m

dist_ls = []

for i in range(10):
  r1 = [random.randrange(1, 50, 1) for i in range(10000)]
  r2 = [random.randrange(1, 50, 1) for i in range(10000)]
  dist_fl = dtw.distance(r1, r2)
  dist_ls.append(dist_fl)

print(f'Mean: {sum(dist_ls)/len(dist_ls)}')

In [None]:
%%timeit

# 100 datapoints: 159ms
# 300 datapoints: 1.5s
# 500 datapoints: 4.3s
# 1k datapoints: 17.7s
# 5k datapoints: ?(1.55s)
# 10k datapoints: 

dist_ls = []

for i in range(10):
  r1 = [random.randrange(1, 50, 1) for i in range(5000)]
  r2 = [random.randrange(1, 50, 1) for i in range(5000)]
  dist_fl = dtw.distance(r1, r2)
  dist_ls.append(dist_fl)

print(f'Mean: {sum(dist_ls)/len(dist_ls)}')

In [None]:
# Option #2 (30-3000x faster) c implementation requires array w/doubles (and optionally max_dist pruning)

s1 = array.array('d',[0, 0, 1, 2, 1, 0, 1, 0, 0])
s2 = array.array('d',[0, 1, 2, 0, 0, 0, 0, 0, 0])
d = dtw.distance_fast(s1, s2, use_pruning=True)
print(d)

In [None]:
%%timeit

# 100 datapoints: 3ms
# 300 datapoints: 12.3ms
# 500 datapoints: 26ms
# 1k datapoints: 81.3ms
# 5k datapoints: 1.55s
# 10k datapoints: 6s

dist_fast_ls = []

for i in range(10):
  r1 = array.array('d',[random.randrange(1, 50, 1) for i in range(1000)])
  r2 = array.array('d',[random.randrange(1, 50, 1) for i in range(1000)])
  dist_fl = dtw.distance_fast(r1, r2)
  dist_fast_ls.append(dist_fl)

print(f'Mean: {sum(dist_fast_ls)/len(dist_fast_ls)}')

In [None]:
# Option #3: Numpy array with doubles or floats

s1 = np.array([0, 0, 1, 2, 1, 0, 1, 0, 0], dtype=np.double)
s2 = np.array([0.0, 1, 2, 0, 0, 0, 0, 0, 0])
d = dtw.distance_fast(s1, s2, use_pruning=True)

In [None]:
%%timeit

# 100 datapoints: 3ms
# 300 datapoints: 12.3ms
# 500 datapoints: 26ms
# 1k datapoints: 82ms
# 5k datapoints: 1.55s
# 10k datapoints: 6s

dist_c_ls = []

for i in range(10):
  r1 = np.array([random.randrange(1, 50, 1) for i in range(1000)], dtype=np.double)
  r2 = np.array([random.randrange(1, 50, 1) for i in range(1000)], dtype=np.double)
  dist_fl = dtw.distance_fast(r1, r2)
  dist_c_ls.append(dist_fl)

print(f'Mean: {sum(dist_c_ls)/len(dist_c_ls)}')

## **Visualize Warping Paths**

In [None]:
s1 = [0, 0, 1, 2, 1, 0, 1, 0, 0]
s2 = [0, 1, 2, 0, 0, 0, 0, 0, 0]
distance, paths = dtw.warping_paths(s1, s2)
print(distance)
print(paths)

In [None]:
# Two sine waves on the same grid, the second shifted by 1 on the x axis.
x = np.arange(0, 20, .5)
s1 = np.sin(x)
s2 = np.sin(x - 1)
# Fixed seed so the perturbed points are the same on every run.
random.seed(1)
# Each point of s2 is perturbed when a uniform draw falls below 0.05; the
# perturbation is a second uniform draw mapped to [-0.25, 0.25).
for idx in range(len(s2)):
    if random.random() < 0.05:
        s2[idx] += (random.random() - 0.5) / 2
# Warping-paths matrix with window=25 and psi=2, then the best path through it.
d, paths = dtw.warping_paths(s1, s2, window=25, psi=2)
best_path = dtw.best_path(paths)
dtwvis.plot_warpingpaths(s1, s2, paths, best_path)

## **Compute Distance Matrix between Set of Series**

In [None]:
from dtaidistance import dtw
import numpy as np
series = [
    np.array([0, 0, 1, 2, 1, 0, 1, 0, 0], dtype=np.double),
    np.array([0.0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0]),
    np.array([0.0, 0, 1, 2, 1, 0, 0, 0])]
ds = dtw.distance_matrix_fast(series)

In [None]:
# Equal-length series as the rows of a plain 2-D float array. `np.array`
# replaces `np.matrix`, which NumPy no longer recommends and whose rows stay
# two-dimensional when indexed.
series = np.array([
    [0.0, 0, 1, 2, 1, 0, 1, 0, 0],
    [0.0, 1, 2, 0, 0, 0, 0, 0, 0],
    [0.0, 0, 1, 2, 1, 0, 0, 0, 0]], dtype=np.double)

# Pairwise DTW distances between the rows.
ds = dtw.distance_matrix_fast(series)

In [None]:
ds

In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
[random.randrange(1, 50, 1) for i in range(10)]
print('\n')
[random.randrange(1, 50, 1) for i in range(10)]
print('\n')
[random.randrange(1, 50, 1) for i in range(10)]
print('\n')
[random.randrange(1, 50, 1) for i in range(10)]

In [None]:
# Ten random series of 1000 points each, drawn as integers in [1, 49] and
# stored as doubles, used to exercise the distance-matrix computation.
series = [
    np.array([random.randrange(1, 50, 1) for i in range(1000)], dtype=np.double)
    for _ in range(10)]
ds = dtw.distance_matrix_fast(series)

In [None]:
ds

In [None]:
# Can be distributed and parallelized

In [None]:
ts_stdscaler_roll_df.info()

In [None]:
ts_sentiments_np = ts_stdscaler_roll_df.to_numpy()
ts_sentiments_np.shape

In [None]:
# dtaidistance treats each ROW of a 2-D array as one series. `to_numpy()` on
# the data frame yields (n_sentences, n_models), so transpose to get one row
# per sentiment model, as a C-contiguous array of doubles for the C backend.
# NOTE(review): if the rolling columns still contain NaN values, fill or trim
# them first — confirm against the data.
series = np.array(ts_sentiments_np.T, dtype=np.double, order='C')
ds = dtw.distance_matrix_fast(series)

In [None]:
from dtaidistance import dtw
import numpy as np

# One series per smoothed sentiment model, in data-frame column order.
# The cell previously referenced `ts_sentiment_df['']` — a frame that is never
# defined, with an empty column name — padded with nine random series, so it
# could not run; it now uses the real columns of `ts_stdscaler_roll_df`.
# NOTE(review): the clustering cells label series by position, so the column
# order and count must match their label list — confirm.
series = [np.array(ts_stdscaler_roll_df[col_name], dtype=np.double)
          for col_name in ts_stdscaler_roll_df.columns]
ds = dtw.distance_matrix_fast(series)

In [None]:
ds

# **Stylometry**

## **Library: StyloR**

* https://github.com/computationalstylistics/stylo (20210804 106s)
* https://github.com/JoannaBy/DHSI2021-Stylometry (2021 Tutorial)

Python:

* https://github.com/worldwise001/stylometry

## **Web: Rolling Stylometry**

* https://github.com/stylo-explorer/rolling-stylometry-explorer (20210808 6s) 

## **Library: ScatterText**

* https://github.com/JasonKessler/scattertext (20210707 1.6k)

## **Library: PyStyl**

* https://github.com/mikekestemont/pystyl (20180426 52s)
* https://github.com/mikekestemont/pystyl/blob/master/A%20Walk%20Through%20PyStyl.ipynb 

## **Library: TextFeatureSelection**

* https://github.com/StatguyUser/TextFeatureSelection (20210812 21s)

Python library for feature selection for text features. It has filter method and genetic algorithm for improving text classification models. Helps improve your machine learning models 

## **Library: Stylometry**

* https://github.com/worldwise001/stylometry (20150409 13s) 
* https://github.com/jpotts18/stylometry (20191215 103s) 

## **Library: PASTEL (Persona: Gender, Age, Country, Politics, Education, Ethnicity, TOW)**

* https://github.com/dykang/PASTEL (20200316 22s)

## **Jupyter: Authorship Chi-Squared Test**

* https://github.com/travisrussell/pale_fire_analysis/blob/master/code/pale_fire_chi_squared.ipynb

## **Library: Translate Author Style 4 Anonymity PyTorch**

* https://github.com/rakshithShetty/A4NT-author-masking

## **Library: Tweet Bot Detector**

* https://github.com/omerjaved11/Author_Profiling_clef19 (20190817 3s)

## **Kaggle Competitions**

Spooky Author Identification
* https://www.kaggle.com/c/spooky-author-identification/code
* https://www.kaggle.com/christopher22/stylometry-identify-authors-by-sentence-structure

CommonLit Readability:
* https://www.kaggle.com/c/commonlitreadabilityprize/code


# **NLP Classification**

## **Library: TextFeatureSelection**

* https://github.com/StatguyUser/TextFeatureSelection

TextFeatureSelection is a Python library that helps improve text classification models through feature selection. It provides three methods: TextFeatureSelection, TextFeatureSelectionGA (Genetic Algorithm), and TextFeatureSelectionEnsemble.

# **Visualize Hierarchical Clustering**

In [None]:
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (20,10)

In [None]:
from dtaidistance import clustering
# All three models below are fitted on `series`, i.e. whatever the most
# recently executed cell bound that name to. `cluster_idx` is reassigned by
# each fit, so only the last result (model3) remains afterwards.
# Custom Hierarchical clustering
model1 = clustering.Hierarchical(dtw.distance_matrix_fast, {})
cluster_idx = model1.fit(series)
# Augment Hierarchical object to keep track of the full tree
model2 = clustering.HierarchicalTree(model1)
cluster_idx = model2.fit(series)
# SciPy linkage clustering
model3 = clustering.LinkageTree(dtw.distance_matrix_fast, {})
cluster_idx = model3.fit(series)

In [None]:
model3.plot("myplot.png")
Image(filename='myplot.png') 

In [None]:
# Display names looked up by series index in the dendrogram below.
# NOTE(review): assumes the series fitted by model3 contain exactly these ten
# models in this order — confirm against the column order of the data.
ts_labels = ['SentimentR',
             'SyuzhetR',
             'TextBlob',
             'Flair',
             'Stanza',
             'Logistic Regression',
             'LSTM',
             'CNN',
             'RoBERTa 15 Large',
             'T5']


# Two panels with a 1:4 width ratio, handed to the model's plot call via `axes`.
fig, ax = plt.subplots(nrows=1, ncols=2, gridspec_kw={'width_ratios': [1, 4]}, figsize=(30, 10))
# show_ts_label = lambda idx: "ts-" + str(idx)
# Map a series index to its display name; raises IndexError for idx >= 10.
show_ts_label = lambda idx: ts_labels[idx]
# Draw into the two axes and also save the figure as hierarchy.png.
model3.plot("hierarchy.png", axes=ax, show_ts_label=show_ts_label,
           show_tr_label=True, ts_label_margin=-100,
           ts_left_margin=5, ts_sample_length=1)

In [None]:
Image(filename='hierarchy.png') 

In [None]:
from dtaidistance import clustering

# Custom Hierarchical clustering
# model1 = clustering.Hierarchical(dtw.distance_matrix_fast, {})
# cluster_idx = model1.fit(series)

# Augment Hierarchical object to keep track of the full tree
# model2 = clustering.HierarchicalTree(model1)
# cluster_idx = model2.fit(series)



# SciPy linkage clustering
model3 = clustering.LinkageTree(dtw.distance_matrix_fast, {})
cluster_idx = model3.fit(series)

In [None]:
model3.plot("myplot.png")