<a href="https://colab.research.google.com/github/jon-chun/sentimenttime/blob/main/ts_dtw_clustering_dtaidistance_20210818.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **DTW Clustering with dtaidistance**

* https://github.com/wannesm/dtaidistance

## **Setup**

In [None]:
!pip install dtaidistance[all]

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import random
import array

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [None]:
from dtaidistance import dtw
from dtaidistance import dtw_visualisation as dtwvis

In [None]:
from IPython.display import Image

In [None]:
import matplotlib.pyplot as plt

# Notebook-wide default figure size, given as (width, height).
plt.rcParams.update({"figure.figsize": (30, 10)})

In [None]:
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

## **Configure Matplotlib Style for Print (LaTeX) or Online Web Quality**

* http://aeturrell.com/2018/01/31/publication-quality-plots-in-python/
* https://github.com/jbmouret/matplotlib_for_papers
* https://ipython-books.github.io/61-using-matplotlib-styles/
* https://jakevdp.github.io/PythonDataScienceHandbook/04.11-settings-and-stylesheets.html

Tech:
* https://github.com/jbmouret/matplotlib_for_papers#median-vs-mean
* https://github.com/garrettj403/SciencePlots (req latex)
* https://timodenk.com/blog/exporting-matplotlib-plots-to-latex/ (export to *.pgf, req latex)
* https://matplotlib.org/stable/tutorials/introductory/customizing.html 

Guidelines:
* https://www.overleaf.com/learn/latex/Inserting_Images
* https://news.ycombinator.com/item?id=19425637 
* https://towardsdatascience.com/two-simple-steps-to-create-colorblind-friendly-data-visualizations-2ed781a167ec 

Basics:
* http://aeturrell.com/2018/01/31/publication-quality-plots-in-python/
* http://www.jesshamrick.com/2016/04/13/reproducible-plots/

In [None]:
# `import matplotlib.pyplot as plt` binds only `plt`; the top-level package
# has to be imported explicitly before `matplotlib.get_configdir()` resolves.
import matplotlib

# Directory where matplotlib keeps per-user configuration.
cfgdir = matplotlib.get_configdir()
cfgdir

In [None]:
from pathlib import Path

# User style-sheet folder inside the matplotlib config directory (created if absent).
p = Path(cfgdir)
stylelib = p.joinpath('stylelib')
stylelib.mkdir(exist_ok=True)

# Target file for the custom style sheet written in the next cell.
path = stylelib.joinpath('mycustomstyle.mplstyle')

In [None]:
# Write the custom matplotlib style sheet to `path` (the 'mycustomstyle.mplstyle'
# file inside the user stylelib directory built in the previous cell).
# Settings adapted from:
# http://aeturrell.com/2018/01/31/publication-quality-plots-in-python/
#
# The sheet below sets 'figure.dpi: 300'; change that line to 'figure.dpi: 125'
# for online web viewing.
# Colours are written as bare hex digits without a leading '#'.

path.write_text('''
xtick.color: 323034
ytick.color: 323034
text.color: 323034
lines.markeredgecolor: black
patch.facecolor        : bc80bd
patch.force_edgecolor  : True
patch.linewidth: 0.8
scatter.edgecolors: black
grid.color: b1afb5
axes.titlesize: 16
legend.title_fontsize: 12
xtick.labelsize: 12
ytick.labelsize: 12
axes.labelsize: 12
font.size: 10
axes.prop_cycle : (cycler('color', ['bc80bd' ,'fb8072', 'b3de69','fdb462','fccde5','8dd3c7','ffed6f','bebada','80b1d3', 'ccebc5', 'd9d9d9']))
mathtext.fontset: stix
font.family: STIXGeneral
lines.linewidth: 2
legend.frameon: True
legend.framealpha: 0.8
legend.fontsize: 10
legend.edgecolor: 0.9
legend.borderpad: 0.2
legend.columnspacing: 1.5
legend.labelspacing:  0.4
text.usetex: False
axes.titlelocation: left
axes.formatter.use_mathtext: True
axes.autolimit_mode: round_numbers
axes.labelpad: 3
axes.formatter.limits: -4, 4
axes.labelcolor: black
axes.edgecolor: black
axes.linewidth: 0.6
axes.spines.right : False
axes.spines.top : False
axes.grid: False
figure.titlesize: 18
figure.dpi: 300
''')

In [None]:
# Re-scan the style directories so the freshly written 'mycustomstyle' sheet
# can be referenced by name. `plt.style` is used because the `mpl` alias is
# never imported in this notebook.
plt.style.reload_library()

In [None]:
def doplot():
    """Draw six phase-shifted, amplitude-scaled sine curves to preview the active style.

    Returns the matplotlib Axes that was drawn on.
    """
    fig, ax = plt.subplots(figsize=(5, 5))
    x = np.linspace(0, 14, 100)
    for i in range(1, 7):
        ax.plot(x, np.sin(x + i * 0.5) * (7 - i))
    return ax


# Apply 'ggplot' and then the custom sheet on top, for this block only.
# `plt.style` is used because the `mpl` alias is never imported in this notebook.
with plt.style.context(['ggplot', 'mycustomstyle']):
    doplot()

In [None]:
# Apply matplotlib style settings for LaTeX/print quality from a style file.
# (For online web viewing use a lower dpi, e.g. dpi=120.)
#
# NOTE(review): 'plot_style.txt' is resolved relative to the current working
# directory and is not created anywhere in this notebook -- confirm it exists
# before running this cell.

import matplotlib.pyplot as plt
plt.style.use('plot_style.txt')


# **Get Data**

In [None]:
# Connect to Google Drive (Colab only) and change into the Drive root.

# Flag to indicate first run through code
# (assigned again at the top of the cell that changes into the corpus directory).
flag_first_run = True

# `files` is imported here but not used in the cells shown in this notebook.
from google.colab import drive, files
drive.mount('/gdrive')
%cd /gdrive/MyDrive/

In [None]:
# Corpus directory, written relative to the Drive root ('./...'); editable via the Colab form field.
gdrive_subdir = "./research/2021/sa_book_code/books_sa/cdickens_greatexpectations" #@param {type:"string"}


In [None]:
flag_first_run = True

CORPUS_SUBDIR = gdrive_subdir
corpus_filename = CORPUS_SUBDIR

# Change to working subdirectory
if flag_first_run == True:
  full_path_str = gdrive_subdir
  flag_first_run = False
else:
  full_path_str = f'/gdrive/MyDrive{gdrive_subdir[1:]}'

%cd $full_path_str

In [None]:
# List the CSV files in the current (corpus) directory, oldest first.
!ls -altr *.csv

In [None]:
# Load the per-sentence sentiment table; the file name is relative to the current working directory.
corpus_unified_df = pd.read_csv('sum_sentiments_all31_sents_cdickens_cdickens_greatexpectations.csv')

In [None]:
# Summarise columns, dtypes and non-null counts of the loaded table.
corpus_unified_df.info()

In [None]:
# Number of rows (one per sentence) in the corpus table; reused in plot titles.
sent_ct = len(corpus_unified_df)

# Overlay the SentimentR and Syuzhet baseline series on the same axes.
corpus_unified_df['baseline_sentimentr_stdscaler_roll10'].plot()
corpus_unified_df['baseline_syuzhet_stdscaler_roll10'].plot()
plt.legend(loc='best')

plot_title = f'Great Expectations by Charles Dickens\nDiachronic Sentiment over {sent_ct} Sentences using Standard Scaler + SMA 10%'
plt.title(plot_title);

In [None]:
# Keep only the columns whose name contains 'roll10' and replace NaNs with 0
# (new frame; `corpus_unified_df` is left untouched).
ts_sentiments_df = corpus_unified_df.filter(like='roll10').fillna(0)

# NOTE(review): the regex is anchored at the start of the column name, while the
# columns referenced elsewhere in this notebook start with 'baseline_' --
# confirm that it selects the intended series.
ts_sentiments_df.filter(regex='^(sentimentr|syuzhet|transformer)', axis=1).plot()
plt.legend(loc='best')

# `sent_ct` comes from the previous plotting cell.
plot_title = f'Great Expectations by Charles Dickens\nDiachronic Sentiment over {sent_ct} Sentences using Standard Scaler + SMA 10%'
plt.title(plot_title);

## **Compute Distance Matrix between 2 Series**

In [None]:
from dtaidistance import dtw
from dtaidistance import dtw_visualisation as dtwvis
import numpy as np

# Two short toy series to illustrate the DTW alignment.
s1 = np.array([0., 0, 1, 2, 1, 0, 1, 0, 0, 2, 1, 0, 0])
s2 = np.array([0., 1, 2, 3, 1, 0, 0, 0, 2, 1, 0, 0, 0])

# Stored as `warp_path` so that the notebook-level `path` (the Path of the
# .mplstyle file used by the style-writing cell) is not overwritten.
warp_path = dtw.warping_path(s1, s2)

# Save the alignment figure to disk; the next cell displays the file.
dtwvis.plot_warping(s1, s2, warp_path, filename="warp.png")

In [None]:
# Display the warping figure saved by the previous cell.
Image(filename='warp.png')

In [None]:
# Seven random integers drawn from 1..49.
res = [random.randrange(1, 50) for _ in range(7)]

In [None]:
# Option #1: plain Python lists with dtw.distance

s1 = [0, 0, 1, 2, 1, 0, 1, 0, 0]
s2 = [0, 1, 2, 0, 0, 0, 0, 0, 0]
distance = dtw.distance(s1, s2)
print(distance)

In [None]:
# Show the docstring of dtw.distance for reference.
print(dtw.distance.__doc__)

In [None]:
%%timeit

# 100 datapoints: 159ms
# 300 datapoints: 1.5s
# 500 datapoints: 4.3s
# 1k datapoints: 17.7s
# 5k datapoints: ?(1.55s)
# 10k datapoints: >15m

dist_ls = []

for i in range(10):
  r1 = [random.randrange(1, 50, 1) for i in range(10000)]
  r2 = [random.randrange(1, 50, 1) for i in range(10000)]
  dist_fl = dtw.distance(r1, r2)
  dist_ls.append(dist_fl)

print(f'Mean: {sum(dist_ls)/len(dist_ls)}')

In [None]:
%%timeit

# 100 datapoints: 159ms
# 300 datapoints: 1.5s
# 500 datapoints: 4.3s
# 1k datapoints: 17.7s
# 5k datapoints: ?(1.55s)
# 10k datapoints: 

dist_ls = []

for i in range(10):
  r1 = [random.randrange(1, 50, 1) for i in range(5000)]
  r2 = [random.randrange(1, 50, 1) for i in range(5000)]
  dist_fl = dtw.distance(r1, r2)
  dist_ls.append(dist_fl)

print(f'Mean: {sum(dist_ls)/len(dist_ls)}')

In [None]:
# Option #2 (30-3000x faster): the C implementation needs arrays of doubles
# (pruning is optional and enabled here).

s1, s2 = (
    array.array('d', values)
    for values in ([0, 0, 1, 2, 1, 0, 1, 0, 0],
                   [0, 1, 2, 0, 0, 0, 0, 0, 0])
)
d = dtw.distance_fast(s1, s2, use_pruning=True)
print(d)

In [None]:
%%timeit

# 100 datapoints: 3ms
# 300 datapoints: 12.3ms
# 500 datapoints: 26ms
# 1k datapoints: 81.3ms
# 5k datapoints: 1.55s
# 10k datapoints: 6s

dist_fast_ls = []

for i in range(10):
  r1 = array.array('d',[random.randrange(1, 50, 1) for i in range(1000)])
  r2 = array.array('d',[random.randrange(1, 50, 1) for i in range(1000)])
  dist_fl = dtw.distance_fast(r1, r2)
  dist_fast_ls.append(dist_fl)

print(f'Mean: {sum(dist_fast_ls)/len(dist_fast_ls)}')

In [None]:
# Option #3: NumPy arrays of doubles passed to the C implementation.

s1 = np.array([0, 0, 1, 2, 1, 0, 1, 0, 0], dtype=np.float64)
s2 = np.array([0, 1, 2, 0, 0, 0, 0, 0, 0], dtype=np.float64)
d = dtw.distance_fast(s1, s2, use_pruning=True)

In [None]:
%%timeit

# 100 datapoints: 3ms
# 300 datapoints: 12.3ms
# 500 datapoints: 26ms
# 1k datapoints: 82ms
# 5k datapoints: 1.55s
# 10k datapoints: 6s

dist_c_ls = []

for i in range(10):
  r1 = np.array([random.randrange(1, 50, 1) for i in range(1000)], dtype=np.double)
  r2 = np.array([random.randrange(1, 50, 1) for i in range(1000)], dtype=np.double)
  dist_fl = dtw.distance_fast(r1, r2)
  dist_c_ls.append(dist_fl)

print(f'Mean: {sum(dist_c_ls)/len(dist_c_ls)}')

## **Visualize Warping Paths**

In [None]:
# Distance plus the full matrix of warping paths for two toy series.
s1 = [0, 0, 1, 2, 1, 0, 1, 0, 0]
s2 = [0, 1, 2, 0, 0, 0, 0, 0, 0]
distance, paths = dtw.warping_paths(s1, s2)
for item in (distance, paths):
    print(item)

In [None]:
# Fixed seed so the perturbed positions are reproducible.
random.seed(1)

# A sine wave and a shifted copy of it.
x = np.arange(0, 20, .5)
s1, s2 = np.sin(x), np.sin(x - 1)

# Perturb roughly 5% of the shifted samples by a small random offset.
for idx in range(s2.size):
    if random.random() >= 0.05:
        continue
    s2[idx] += (random.random() - 0.5) / 2

d, paths = dtw.warping_paths(s1, s2, window=25, psi=2)
best_path = dtw.best_path(paths)
dtwvis.plot_warpingpaths(s1, s2, paths, best_path)

## **Compute Distance Matrix between Set of Series**

In [None]:
# Series of different lengths, passed as a list of double arrays.
raw_series = (
    [0, 0, 1, 2, 1, 0, 1, 0, 0],
    [0.0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0],
    [0.0, 0, 1, 2, 1, 0, 0, 0],
)
series = [np.array(values, dtype=np.double) for values in raw_series]
ds = dtw.distance_matrix_fast(series)

In [None]:
# Equal-length series stacked as rows of a 2-D array. A regular ndarray is
# used instead of np.matrix, which NumPy no longer recommends; the notebook
# already passes a 2-D ndarray to distance_matrix_fast for the sentiment data.
series = np.array([
    [0.0, 0, 1, 2, 1, 0, 1, 0, 0],
    [0.0, 1, 2, 0, 0, 0, 0, 0, 0],
    [0.0, 0, 1, 2, 1, 0, 0, 0, 0]])

ds = dtw.distance_matrix_fast(series)

In [None]:
# Show the distance matrix computed in the previous cell.
ds

In [None]:
# Preview four independent random samples (10 integers each, drawn from 1..49).
# The bare list expressions are all displayed because ast_node_interactivity
# was set to "all" in the setup section.
[random.randrange(1, 50, 1) for i in range(10)]
print('\n')
[random.randrange(1, 50, 1) for i in range(10)]
print('\n')
[random.randrange(1, 50, 1) for i in range(10)]
print('\n')
[random.randrange(1, 50, 1) for i in range(10)]

In [None]:
# Ten random series of 1000 doubles each (values 1..49), then their pairwise DTW distances.
series = [
    np.array([random.randrange(1, 50) for _ in range(1000)], dtype=np.double)
    for _ in range(10)
]
ds = dtw.distance_matrix_fast(series)

In [None]:
# Show the distance matrix for the ten random series.
ds

In [None]:
# Can be distributed and parallelized

### **Sentence Sentiment Time Series**

In [None]:
# Per-column check for remaining NaNs (the frame was filled with 0 when it was built).
ts_sentiments_df.isna().any()

In [None]:
# Summarise columns, dtypes and non-null counts of the roll10 frame.
ts_sentiments_df.info()

In [None]:
# Transpose so that each row holds one DataFrame column (one series per row).
ts_sentiments_np = np.transpose(ts_sentiments_df.to_numpy())
ts_sentiments_np.shape

In [None]:
%%time

# NOTE: 2m
# Pairwise DTW distance matrix over the rows of ts_sentiments_np.

ds = dtw.distance_matrix_fast(ts_sentiments_np)

In [None]:
# Three baseline series, each copied into its own array of doubles.
baseline_cols = (
    'baseline_syuzhet_stdscaler_roll10',
    'baseline_sentimentr_stdscaler_roll10',
    'baseline_bing_stdscaler_roll10',
)
series = [np.array(ts_sentiments_df[col], dtype=np.double) for col in baseline_cols]
ds = dtw.distance_matrix_fast(series)

In [None]:
# Inspect the type returned by distance_matrix_fast.
type(ds)

In [None]:
# Inspect the shape of the distance matrix for the three baseline series.
ds.shape

# **Visualize Hierarchical Clustering**

In [None]:
import matplotlib.pyplot as plt

# Re-apply the wide default figure size before the clustering plots.
plt.rcParams.update({"figure.figsize": (30, 10)})

In [None]:
%%time

# NOTE: 2m

from dtaidistance import clustering

# Alternative 1 (not used): custom hierarchical clustering
# model1 = clustering.Hierarchical(dtw.distance_matrix_fast, {})
# cluster_idx = model1.fit(series)

# Alternative 2 (not used): augment the Hierarchical object to keep track of the full tree
# model2 = clustering.HierarchicalTree(model1)
# cluster_idx = model2.fit(series)


# Used here: SciPy linkage clustering, fit on every row of ts_sentiments_np
# (pass `series` instead to cluster only the three baseline series).
model3 = clustering.LinkageTree(dtw.distance_matrix_fast, {})
cluster_idx = model3.fit(ts_sentiments_np) # (series)

In [None]:
# Save the fitted tree plot to 'myplot.png', then display the saved file.
model3.plot("myplot.png")
Image(filename='myplot.png')

In [None]:
# Display names for the clustered series, looked up by series index.
# NOTE(review): the order must match the row order of the data `model3` was
# fit on (the columns of ts_sentiments_df) -- confirm against the column list.
ts_labels = ['SentimentR',
             'SyuzhetR',
             'TextBlob',
             'Flair',
             'Stanza',
             'Logistic Regression',
             'LSTM',
             'CNN',
             'RoBERTa 15 Large',
             'T5']


def show_ts_label(idx):
    """Return the display label for series number `idx`.

    Falls back to the generic "ts-<idx>" label when `idx` has no entry in
    `ts_labels`, so the plot still renders when the model was fit on more
    series than there are hand-written names.
    """
    try:
        return ts_labels[idx]
    except IndexError:
        return "ts-" + str(idx)


# Narrow left panel for the tree, wide right panel for the series themselves.
fig, ax = plt.subplots(nrows=1, ncols=2, gridspec_kw={'width_ratios': [1, 4]}, figsize=(30, 10))
model3.plot("hierarchy.png", axes=ax, show_ts_label=show_ts_label,
           show_tr_label=True, ts_label_margin=-100,
           ts_left_margin=5, ts_sample_length=1)

In [None]:
# Display the labelled hierarchy figure saved by the previous cell.
Image(filename='hierarchy.png')

In [None]:
from dtaidistance import clustering

# Alternative 1 (not used): custom hierarchical clustering
# model1 = clustering.Hierarchical(dtw.distance_matrix_fast, {})
# cluster_idx = model1.fit(series)

# Alternative 2 (not used): augment the Hierarchical object to keep track of the full tree
# model2 = clustering.HierarchicalTree(model1)
# cluster_idx = model2.fit(series)



# Used here: SciPy linkage clustering, this time fit on `series` (the three
# baseline arrays). This rebinds `model3` and `cluster_idx` from the earlier fit.
model3 = clustering.LinkageTree(dtw.distance_matrix_fast, {})
cluster_idx = model3.fit(series)

In [None]:
# Save the tree plot for the baseline-only model, overwriting the earlier 'myplot.png'.
model3.plot("myplot.png")