In [16]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.datasets import load_iris, load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error

import librosa as lr

from glob import glob

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Time Series and Machine Learning Primer

In [None]:
df = pd.read_csv('apple.csv')
df.head()

In [None]:
df.shape

In [None]:
df.plot('Date', 'Close')

In [None]:
data = load_iris()
iris = pd.DataFrame(data.data, columns=data.feature_names)
iris['label'] = data.target

x_train, x_test, y_train, y_test = train_test_split(iris[['petal length (cm)', 'petal width (cm)']], iris[['label']])

sns.scatterplot(x='petal length (cm)', y='petal width (cm)',
                hue=y_train['label'], data=x_train, palette='Set1')

In [None]:
model = LinearSVC().fit(x_train, y_train)

y_pred = model.predict(x_test)
print('Accuracy: ', accuracy_score(y_pred, y_test))

sns.scatterplot(x='petal length (cm)', y='petal width (cm)',
                hue=y_pred, data=x_test, palette='Set1')
plt.title('Predicted Labels')

In [None]:
sns.scatterplot(x='petal length (cm)', y='petal width (cm)',
                hue=y_test['label'], data=x_test, palette='Set1')
plt.title('True Labels')

In [None]:
data = load_boston()
boston = pd.DataFrame(data.data, columns=data.feature_names)
boston.head()

In [None]:
x = boston[['AGE']]
y = boston['RM']

x_train, x_test, y_train, y_test = train_test_split(x, y)

sns.scatterplot(x=x_train['AGE'], y=y_train)

In [None]:
model = linear_model.LinearRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print(mean_absolute_error(y_pred, y_test))

In [None]:
sns.scatterplot(x=x_test['AGE'], y=y_test, color='green')
sns.scatterplot(x=x_test['AGE'], y=y_pred, color='red')

In [None]:
sns.scatterplot(x=y_test, y=y_pred)

In [None]:
files = glob('../input/heartbeat-sounds/set_a/*.wav')
len(files)

In [None]:
audio, sfreq = lr.load(files[0])
print(sfreq)
print(len(audio))

In [None]:
# Create time array
indices = np.arange(0, len(audio))
time = indices / sfreq

In [None]:
# time stamp of (N-1)th datapoint
final_time = (len(audio) - 1) / sfreq
time = np.linspace(0, final_time, sfreq) # should be 1 or ? instead of 'sfreq'

In [None]:
audio, sfreq = lr.load(files[0])
time = np.arange(0, len(audio)) / sfreq

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(time, audio)
ax.set(xlabel='Time (s)', ylabel='Sound Amplitude')

plt.show()

In [None]:
df = pd.read_csv('../input/nyse/prices.csv')
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')
df.head()

In [None]:
data = df.pivot(columns='symbol', values='open')
data = data.loc[:, ['AAPL', 'FB', 'NFLX', 'V', 'XOM']]
data.head()

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))
for column in data.columns:
    data[column].plot(ax=ax, label=column)
ax.legend()
plt.show()

## Time Series as Inputs to a Model 

In [3]:
data_dir = '../input/heartbeat-sounds/'

In [4]:
def plot_audio_path(path):
    name = path.split('/')[-1]
#     path = data_dir + path
#     print(path)
    audio, sfreq = lr.load(path)
    time = np.arange(audio.shape[-1]) / sfreq
    plt.figure(figsize=(12, 4))
    plt.plot(time, audio)
    plt.title(name)

In [5]:
def plot_audio(audio):
    time = np.arange(audio.shape[-1]) / sfreq
    plt.figure(figsize=(12, 4))
    plt.plot(time, audio)

### Set A

In [None]:
df = pd.read_csv('../input/heartbeat-sounds/set_a.csv')
df.head()

In [None]:
df.label.unique()

In [None]:
print(len(df.query("label == 'artifact'")))
print(len(df.query("label == 'extrahls'")))
print(len(df.query("label == 'murmur'")))
print(len(df.query("label == 'normal'")))

In [None]:
audio, sfreq = lr.load(data_dir + df.fname[0])
time = np.arange(0, len(audio)) / sfreq
print(len(audio), sfreq)
print(type(audio))
print(audio.shape)

In [None]:
data = pd.DataFrame()

In [None]:
files = glob(data_dir + 'set_a/*.wav')

In [None]:
data = pd.DataFrame()

means = []
maxs = []
stds = []

for i, name in enumerate(df.fname):
    path = data_dir + name
    if path not in files:
        print(i, ' not found')
        continue
    audio, sfreq = lr.load(data_dir + name)
    means.append(np.mean(audio))
    maxs.append(np.max(audio))
    stds.append(np.std(audio))

data['mean'] = means
data['max'] = maxs
data['std'] = stds
data.head()

In [None]:
data['label'] = df['label'].copy()
data.head()

In [None]:
data.shape

In [None]:
X = data.drop('label', axis=1)
y = data['label']

x_train, x_test, y_train, y_test = train_test_split(X, y)

In [None]:
model = LinearSVC()
model.fit(x_train, y_train)

In [None]:
y_pred = model.predict(x_test)
print(accuracy_score(y_pred, y_test))

### Set B

In [6]:
files = glob(data_dir + 'set_b/*.wav')
len(files)

656

In [7]:
labels = [file.split('/')[-1].split('_')[0] for file in files]

In [8]:
np.unique(labels)

array(['Bunlabelledtest', 'extrastole', 'murmur', 'normal'], dtype='<U15')

In [9]:
data = pd.DataFrame()

means = []
maxs = []
stds = []

for path in files:
    audio, sfreq = lr.load(path)
    means.append(np.mean(audio))
    maxs.append(np.max(audio))
    stds.append(np.std(audio))

data['mean'] = means
data['max'] = maxs
data['std'] = stds
data.head()

Unnamed: 0,mean,max,std
0,0.000134,1.003413,0.134785
1,0.000364,1.057866,0.217987
2,0.000275,0.901249,0.102249
3,0.000585,0.341814,0.027648
4,6.8e-05,0.971062,0.084254


In [10]:
data['label'] = labels
data.head()

Unnamed: 0,mean,max,std,label
0,0.000134,1.003413,0.134785,normal
1,0.000364,1.057866,0.217987,Bunlabelledtest
2,0.000275,0.901249,0.102249,extrastole
3,0.000585,0.341814,0.027648,murmur
4,6.8e-05,0.971062,0.084254,Bunlabelledtest


In [11]:
data.shape

(656, 4)

In [12]:
X = data.drop('label', axis=1)
y = data['label']

x_train, x_test, y_train, y_test = train_test_split(X, y)

In [20]:
model = LinearSVC()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
print(accuracy_score(y_pred, y_test))

0.49390243902439024


In [21]:
model = RandomForestClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
print(accuracy_score(y_pred, y_test))

0.45121951219512196


