# Visualizing Time Series

In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 

import warnings

warnings.filterwarnings('ignore')
sns.set()

In [None]:
# Read the discoveries data, convert the date column to datetimes and
# use it as the index of the DataFrame
discoveries = (
    pd.read_csv('../data/discoveries.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)

# Plot the time series in blue and label both axes in your plot
ax = discoveries.plot(color='blue')
ax.set(xlabel='Date', ylabel='Number of great discoveries')

# Show plot
plt.show()


In [None]:
# pyplot is (re-)imported so the cell can be run on its own
import matplotlib.pyplot as plt

# Activate the fivethirtyeight style sheet
plt.style.use('fivethirtyeight')

# Draw the discoveries series with that style and give the figure a title
ax1 = discoveries.plot()
ax1.set(title='FiveThirtyEight Style')
plt.show()

In [None]:
# pyplot is (re-)imported so the cell can be run on its own
import matplotlib.pyplot as plt

# Activate the ggplot style sheet
plt.style.use('ggplot')

# Draw the discoveries series with that style and give the figure a title
ax2 = discoveries.plot()
ax2.set(title='ggplot Style')
plt.show()

In [None]:
# Line chart of the discoveries DataFrame with explicit colour, figure size,
# line width and tick-label font size
ax = discoveries.plot(
    color='blue',
    figsize=(8, 3),
    linewidth=2,
    fontsize=6,
)

# Title of the plot, drawn slightly larger than the tick labels
ax.set_title(
    'Number of great inventions and scientific discoveries from 1860 to 1959',
    fontsize=8,
)

plt.show()

# Slicing time

In [None]:
# Display the date-indexed discoveries DataFrame as the cell output
discoveries

In [None]:
# Select the rows whose date falls between the years 1900 and 1920
discoveries.loc['1900':'1920']

In [None]:
# Select the rows whose date falls between January 1900 and January 1905
discoveries.loc['1900-01':'1905-01']

In [None]:
# Plot only the January 1900 to January 1905 slice of the series
discoveries.loc['1900-01':'1905-01'].plot(color='blue', fontsize=12)

# Markers 

In [None]:
# Plot the series, then mark one date and one value with dashed reference lines
ax = discoveries.plot(color='blue', fontsize=12)

# Vertical marker at 1 January 1902, horizontal marker at 5 discoveries
ax.axvline(x=pd.Timestamp('1902-01-01'), color='red', linestyle='--')
ax.axhline(y=5, color='green', linestyle='--')

ax.set_xlabel('year')
ax.set_ylabel('number of discoveries')

plt.show()

In [None]:
# Regions

In [None]:
# Plot the series, then shade one date range and one value range
ax = discoveries.plot(color='blue', fontsize=12)

# Vertical band from 1901 to 1903, horizontal band between 4 and 5 discoveries
ax.axvspan(pd.Timestamp('1901-01-01'), pd.Timestamp('1903-01-01'), color='red', alpha=0.2)
ax.axhspan(4, 5, color='green', linestyle='--', alpha=0.2)

ax.set_xlabel('year')
ax.set_ylabel('number of discoveries')

plt.show()

# Cleaning time series

In [None]:
# Read the CO2 levels, parsing the datestamp column and using it as the index
co2 = pd.read_csv(
    '../data/ch2_co2_levels.csv',
    index_col='datestamp',
    parse_dates=['datestamp'],
)
co2

In [None]:
# Number of missing values in each column
co2.isna().sum()

In [None]:
# Number of non-missing values in each column
co2.notna().sum()

We can fill missing values with the mean of the time series, with the previous value (forward fill), or with the next value (backward fill).

In [None]:
# Three ways to fill the gaps. Each call returns a new DataFrame and leaves
# co2 itself unchanged; only the last expression is shown as the cell output.
# ffill()/bfill() are used instead of the deprecated fillna(method=...) form.
co2.ffill()                   # carry the previous observation forward
co2.bfill()                   # carry the next observation backward
co2.fillna(value=co2.mean())  # replace gaps with the column mean

# Plot aggregates of your data
## Moving averages 

Used for:
- smoothing out short-term fluctuations
- removing outliers
- highlighting long-term trends or cycles



In [None]:
# Mean over a sliding window of 52 consecutive observations
co2_mean = co2.rolling(52).mean()

In [None]:
# Plot the rolling mean and set both axis labels and the title in one call
ax = co2_mean.plot()
ax.set(
    xlabel='Date',
    ylabel='The value of my Y axis',
    title='52 weeks rolling mean of my time series',
)

plt.show()

# Aggregating by month, week...


In [None]:
# Month component of every timestamp in the co2 index
index_month = co2.index.month
index_month

In [None]:
# Average the observations that share the same calendar month, then plot them
co2_by_month = co2.groupby(by=index_month).mean()
co2_by_month.plot()
plt.show()

In [None]:
# Bind a second name to the same DataFrame object (an alias, not a copy)
co2_levels = co2

In [None]:
# Compute the 52 weeks rolling mean and rolling standard deviation of co2_levels
ma = co2_levels.rolling(window=52).mean()
mstd = co2_levels.rolling(window=52).std()

# Add upper and lower bound columns: the rolling mean plus/minus two rolling
# standard deviations
ma['upper'] = ma['co2'] + (2 * mstd['co2'])
ma['lower'] = ma['co2'] - (2 * mstd['co2'])

# Plot the content of the ma DataFrame (mean, upper and lower bound)
ax = ma.plot(linewidth=0.8, fontsize=6)

# Specify labels and title, and show the plot. The location is spelled "Maui",
# matching the labels of the other CO2 figures in this notebook.
ax.set_xlabel('Date', fontsize=10)
ax.set_ylabel('CO2 levels in Maui Hawaii', fontsize=10)
ax.set_title('Rolling mean and variance of CO2 levels\nin Maui Hawaii from 1958 to 2001', fontsize=10)
plt.show()

## Histograms 

## Boxplots 

## Kernel Density Plots

In [None]:
# Generate a boxplot
ax = co2_levels.boxplot()

# Set the labels and display the plot. No legend is requested here: the
# boxplot creates no labelled artists, so a legend would have nothing to list.
ax.set_xlabel('CO2', fontsize=10)
ax.set_ylabel('Boxplot CO2 levels in Maui Hawaii', fontsize=10)
plt.show()

In [None]:
# Histogram of the CO2 values using 50 bins
ax = co2_levels.plot.hist(bins=50, fontsize=6)

# Axis labels, legend and display
ax.set_xlabel('CO2', fontsize=10)
ax.set_ylabel('Histogram of CO2 levels in Maui Hawaii', fontsize=10)
plt.legend(fontsize=10)
plt.show()

In [None]:
# Kernel density plot of the CO2 values
ax = co2_levels.plot.density(linewidth=4, fontsize=6)

# Annotate the x-axis and y-axis
ax.set_xlabel('CO2', fontsize=10)
ax.set_ylabel('Density plot of CO2 levels in Maui Hawaii', fontsize=10)

plt.show()


# Autocorrelation and Partial Autocorrelation
**Autocorrelation** is measured as the correlation between a time series and a delayed copy of itself

For example, an autocorrelation of order 3 returns the correlation between a time series and its own values lagged by 3 time points.

It is used to find repetitive patterns or periodic signals in a time series.

Statsmodels has an implementation for autocorrelations


In [None]:
# Import required libraries
import matplotlib.pyplot as plt
from statsmodels.graphics import tsaplots

plt.style.use('fivethirtyeight')

# Reload the CO2 data with the datestamp column parsed into a DatetimeIndex
# (as in the other cells that read this file) and drop rows with missing values
co2_levels = pd.read_csv(
    '../data/ch2_co2_levels.csv',
    parse_dates=['datestamp'],
    index_col='datestamp',
).dropna()

# Display the autocorrelation plot of your time series, up to 24 lags
fig = tsaplots.plot_acf(co2_levels['co2'], lags=24)

# Show plot
plt.show()

If autocorrelation values are close to 0, then values between consecutive observations are not correlated with one another. Conversely, autocorrelation values close to 1 or -1 indicate that there are strong positive or negative correlations between consecutive observations, respectively.

To help you assess how trustworthy these autocorrelation values are, the plot_acf() function also draws confidence intervals (represented as blue shaded regions). If an autocorrelation value goes beyond the confidence interval region, you can assume that the observed autocorrelation value is statistically significant.

The partial autocorrelation function (PACF) measures the correlation coefficient between a time series and lagged versions of itself. However, it extends this idea by also removing the effect of the intermediate time points. For example, a partial autocorrelation function of order 3 returns the correlation between our time series (t_1, t_2, t_3, …) and its own values lagged by 3 time points (t_4, t_5, t_6, …), but only after removing all effects attributable to lags 1 and 2.



In [None]:
# Libraries needed by this cell
import matplotlib.pyplot as plt
from statsmodels.graphics import tsaplots

plt.style.use('fivethirtyeight')

# Partial autocorrelation plot of the co2 column, up to 24 lags
fig = tsaplots.plot_pacf(co2_levels['co2'], lags=24)
plt.show()

# Seasonality, trend and noise in time series data

Most time series can be decomposed into 3 major components:
- **Seasonality**: a periodic pattern
- **Trend**: a consistent upward or downward slope
- **Noise**: unexplained variance/volatility

## Time series decomposition 

Automated method to extract the components. Statsmodels has an implementation


In [None]:
import statsmodels.api as sm
from pylab import rcParams

# Default figure size (width, height) used by the decomposition plot below
rcParams['figure.figsize'] = (11, 9)

# Reload the CO2 data with a parsed date index and drop rows with missing values
co2_levels = pd.read_csv(
    '../data/ch2_co2_levels.csv',
    index_col='datestamp',
    parse_dates=['datestamp'],
).dropna()

# Decompose the co2 column using a period of 52 observations and plot the result
decomposition = sm.tsa.seasonal_decompose(co2_levels['co2'], period=52)
fig = decomposition.plot()

plt.show()

In [None]:
# List the attribute names available on the decomposition result
print(dir(decomposition))

In [None]:
# Plot the seasonal component of the decomposition
decomposition.seasonal.plot()

In [None]:
# Plot the trend component of the decomposition
decomposition.trend.plot()

# Review

In [None]:
# Read the airline passengers data, parsing the Month column and using it as the index
airline = pd.read_csv(
    '../data/ch3_airline_passengers.csv',
    index_col='Month',
    parse_dates=['Month'],
)
airline

In [None]:
# Plot the time series in your DataFrame
ax = airline.plot(color='blue', fontsize=12)

# Add a red vertical line at the date 1955-12-01. A Timestamp is passed
# (rather than a bare string) so the position is given as a date, the same
# way the discoveries marker plot does it.
ax.axvline(pd.Timestamp('1955-12-01'), color='red', linestyle='--')

# Specify the labels in your plot
ax.set_xlabel('Date', fontsize=12)
ax.set_title('Number of Monthly Airline Passengers', fontsize=12)
plt.show()

In [None]:
# Count the missing values per column and print the counts
missing_per_column = airline.isna().sum()
print(missing_per_column)

# Compute and print summary statistics of the airline DataFrame
summary_stats = airline.describe()
print(summary_stats)

In [None]:
# Boxplot of the airline values
ax = airline.boxplot()

# Two-line title for the plot
ax.set_title(
    'Boxplot of Monthly Airline\nPassengers Count',
    fontsize=20,
)
plt.show()

In [None]:
# Month component of every date in the airline index
index_month = airline.index.month

# Mean number of passengers for each month of the year
mean_airline_by_month = airline.groupby(by=index_month).mean()

# Plot the monthly means with a larger legend font
mean_airline_by_month.plot()
plt.legend(fontsize=20)
plt.show()

In [None]:
# Import statsmodels.api as sm
import statsmodels.api as sm

# Perform time series decomposition. The series is labelled as monthly, so the
# seasonal period is passed explicitly (12 observations per cycle) instead of
# relying on the frequency being inferred from the index.
decomposition = sm.tsa.seasonal_decompose(airline, period=12)

# Extract the trend and seasonal components
trend = decomposition.trend
seasonal = decomposition.seasonal

In [None]:
# Plot the trend component extracted from the airline decomposition
trend.plot()

# Working with more than one time series



In [None]:
# Read the meat production data, parsing the date column and using it as the index
meat = pd.read_csv(
    '../data/ch4_meat.csv',
    index_col='date',
    parse_dates=['date'],
)

# First five rows of the meat DataFrame
print(meat.head())

# Summary statistics of the DataFrame
print(meat.describe())

In [None]:
# Line plot of every column of meat on shared axes
ax = meat.plot(figsize=(12, 8), fontsize=14)
ax.set(xlabel='Date', ylabel='Production Volume (in tons)')

plt.show()

In [None]:
# Area chart of the meat DataFrame. The frame is referenced by its actual
# name, meat; no variable called df is defined anywhere in this notebook.
ax = meat.plot.area(figsize=(12, 8), fontsize=14)

ax.set_xlabel('Date')
ax.set_ylabel('Production Volume (in tons)')

plt.show()

## Plotting multiple time series

Matplotlib's default values may not be optimal for displaying many time series. Sometimes several series are assigned the same color.

In [None]:
# Same line plot, drawing the series with colours from the Dark2 colormap
ax = meat.plot(
    figsize=(12, 8),
    fontsize=14,
    colormap='Dark2',
)
ax.set(xlabel='Date', ylabel='Production Volume (in tons)')

plt.show()

## Enhancing plots with information

In [None]:
# Line plot of all series using the Dark2 colormap
ax = meat.plot(figsize=(12, 8), fontsize=14, colormap='Dark2')

# Rounded summary statistics, shown as a table above the plot
df_summary = np.round(meat.describe())
ax.table(
    cellText=df_summary.values,
    colWidths=[0.3 for _ in meat.columns],  # one fixed width per column
    rowLabels=df_summary.index,
    colLabels=df_summary.columns,
    loc='top',
)

plt.show()

## Dealing with different scales

In [None]:
# Plot the veal column on its own
meat['veal'].plot()

In [None]:
# One subplot per column on a 6x2 grid, sharing the x-axis but not the y-axis
meat.plot(
    subplots=True,
    layout=(6, 2),
    sharex=True,
    sharey=False,
    linewidth=0.5,
)
plt.show()

# Find relationships between two variables

In the field of Statistics, the correlation coefficient is a measure used to determine the strength or lack of relationship between two variables:
- **Pearson's coefficient** can be used to compute the correlation coefficient between variables for which the relationship is thought to be linear
- **Kendall Tau** or **Spearman rank** can be used to compute the correlation coefficient between variables for which the relationship is thought to be non-linear (but still monotonic)

In [None]:
from scipy.stats import pearsonr, spearmanr, kendalltau

# Two short example sequences of equal length
x = [1, 2, 4, 7]
y = [1, 3, 4, 8]

# Print the result of each correlation measure in turn:
# Pearson, then Spearman, then Kendall
for corr_fn in (pearsonr, spearmanr, kendalltau):
    print(corr_fn(x, y))

In [None]:
# Pearson correlation matrix of the beef, veal and turkey columns
corr_p = meat.loc[:, ['beef', 'veal', 'turkey']].corr(method='pearson')
print(corr_p)

In [None]:
# Spearman correlation matrix of the beef, veal and turkey columns
corr_s = meat.loc[:, ['beef', 'veal', 'turkey']].corr(method='spearman')
print(corr_s)

In [None]:
# Kendall correlation matrix of the beef, veal and turkey columns
corr_t = meat.loc[:, ['beef', 'veal', 'turkey']].corr(method='kendall')
print(corr_t)

In [None]:
# Heatmap of the Pearson correlation matrix, with each cell annotated by its value
sns.heatmap(corr_p, annot=True)

In [None]:
# Clustered heatmap of the Pearson correlation matrix
sns.clustermap(corr_p)