In [None]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [None]:
# Directory (relative path) that the CSV-loading cells below prepend to each file name.
DATA_DIRECTORY='../data/analysis'

In [None]:
# Apply the 'fivethirtyeight' matplotlib style sheet to the figures drawn below.
plt.style.use('fivethirtyeight')

# 1. Line Plots

## 1.1. Create time series line plots 

In [None]:
# Load the discoveries series: 'date' is parsed as datetime and used as the index.
discoveries_path = '{}/discoveries.csv'.format(DATA_DIRECTORY)
df = pd.read_csv(discoveries_path, index_col='date', parse_dates=['date'])

# Draw every column against the date index on a 10x10 inch figure.
df.plot(figsize=(10, 10))
plt.show()

In [None]:
# Draw the line plot of df again on a 10x10 inch figure.
df.plot(figsize=(10,10))
plt.show()

In [None]:
# Display the names of the style sheets matplotlib can apply with plt.style.use.
plt.style.available

In [None]:
# Plot df with blue lines; the object returned by df.plot is kept in ax.
ax = df.plot(color='blue')

In [None]:
# Same plot with red lines; ax is rebound to the new plot.
ax = df.plot(color='red')

In [None]:
# Blue line plot with both axis labels and a title set in one call.
ax = df.plot(figsize=(10, 10), color='blue')
ax.set(
    xlabel='Date',
    ylabel='Number of great discoveries',
    title='Number of great inventions and scientific discoveries from 1860 to 1959',
)
plt.show()

## 1.2. Customize the time series plot

In [None]:
# Keep only the rows labelled '1860' through '1870' on the date index, then plot them.
df_subset = df.loc['1860':'1870']
ax = df_subset.plot(fontsize=14, color='blue')
plt.show()

In [None]:
# Adding markers: one vertical line at a date and one horizontal line at a value
ax = df.plot(color='blue', figsize=(12,10))
ax.set_xlabel('Date')
ax.set_ylabel('Number of great discoveries')
# Dashed red vertical marker at 1920-01-01 on the date axis
ax.axvline('1920-01-01', color='red', linestyle='--')
# Dashed green horizontal marker at y = 4
ax.axhline(4, color='green', linestyle='--')
# Render explicitly, like the other plotting cells, instead of echoing the Line2D repr
plt.show()

In [None]:
# Highlighting regions of interest with translucent bands
ax = df.plot(color='blue', figsize=(15,10))
ax.set_xlabel('Date')
ax.set_ylabel('Number of great discoveries')
# Red vertical band between 1890-01-01 and 1910-01-01
ax.axvspan('1890-01-01', '1910-01-01', color='red', alpha=0.3)
# Green horizontal band between y = 6 and y = 8 (bounds given as ymin, ymax)
ax.axhspan(6, 8, color='green', alpha=0.3)
# Render explicitly, like the other plotting cells, instead of echoing the patch repr
plt.show()

# 2. Summary Statistics and Diagnostics

## 2.1. Clean the time series data

In [None]:
# Load the CO2 series: 'datestamp' is parsed as datetime and used as the index.
co2_path = '{}/co2_levels.csv'.format(DATA_DIRECTORY)
co2_levels = pd.read_csv(co2_path, index_col='datestamp', parse_dates=['datestamp'])

# Preview the first rows.
co2_levels.head()

In [None]:
# Count the missing values in each column of co2_levels
co2_levels.isnull().sum()

In [None]:
# Replace each missing value with the next valid observation (backward fill).
# DataFrame.bfill() is used instead of fillna(method='bfill'), whose `method`
# argument is deprecated in recent pandas releases; the result is the same.
co2_levels = co2_levels.bfill()

## 2.2. Plot aggregates of the data

In [None]:
# Moving average: mean over a sliding window of 52 consecutive observations
co2_levels_mean = co2_levels.rolling(window=52).mean()

ax = co2_levels_mean.plot(figsize=(12, 10))
ax.set(
    xlabel="Date",
    ylabel="The values of the Y axis",
    title="52 weeks rolling mean of the time series",
)
plt.show()

In [None]:
# Average the series per calendar month (1-12), pooling all years together
index_month = co2_levels.index.month
co2_levels_by_month = co2_levels.groupby(index_month).mean()

# Plot the twelve monthly means and title the axes that the plot created
co2_levels_by_month.plot(figsize=(12, 10)).set_title(
    'Monthly aggregation of the co2 level time series'
)
plt.show()

In [None]:
# Average the series per calendar year
index_year = co2_levels.index.year
co2_levels_by_year = co2_levels.groupby(index_year).mean()

# Plot the yearly means and title the axes that the plot created
co2_levels_by_year.plot(figsize=(12, 10)).set_title(
    'Yearly aggregation of the co2 level time series'
)
plt.show()

## 2.3. Summarize the values in the dataset

In [None]:
# Summarizing the data with boxplots: one box per column of co2_levels
ax1 = co2_levels.boxplot(figsize=(12,10))
ax1.set_ylabel('Co2 levels')
ax1.set_title('Boxplot for the co2 levels data')
# Render explicitly so the cell does not echo the repr of the title Text object
plt.show()

In [None]:
# Histogram of the co2 values split into 100 bins
ax2 = co2_levels.plot.hist(bins=100, figsize=(15, 10))
ax2.set(
    xlabel='Co2 levels value',
    ylabel='Frequency of values in the co2 levels data',
    title='Histogram of the co2 levels data 100 bins',
)
plt.show()

In [None]:
# Summarizing the data with density plots (kernel density estimate of the values)
ax3 = co2_levels.plot(kind='density', linewidth=2, figsize=(15,10))
ax3.set_xlabel('co2 levels data values')
ax3.set_ylabel('Density values of the co2 levels data')
ax3.set_title('Density plot of the co2 levels data')
# Render explicitly so the cell does not echo the repr of the title Text object
plt.show()

# 3. Seasonality, Trend, and Noise


## 3.1. Autocorrelation and Partial Autocorrelation

In [None]:
# Plotting autocorrelations of the 'co2' column for lags up to 40
# NOTE(review): pyplot is already imported as plt near the top of the notebook; this import is redundant.
import matplotlib.pyplot as plt
from statsmodels.graphics import tsaplots
fig = tsaplots.plot_acf(co2_levels['co2'], lags=40)
plt.show()

In [None]:
# Plotting partial autocorrelations of the 'co2' column for lags up to 40
# NOTE(review): both imports repeat ones made in earlier cells.
import matplotlib.pyplot as plt
from statsmodels.graphics import tsaplots
fig = tsaplots.plot_pacf(co2_levels['co2'], lags=40)
plt.show()

## 3.2. Seasonality, trend, and noise in time series data

In [None]:
# Imports for this section, followed by a quick line plot of the co2_levels DataFrame.
import statsmodels.api as sm
import matplotlib.pyplot as plt
from pylab import rcParams
co2_levels.plot()

In [None]:
# Time series decomposition of the 'co2' column
# Enlarge the default figure size through matplotlib's global rcParams
# (reached via plt). The setting stays in effect for figures created afterwards.
plt.rcParams['figure.figsize'] = (11, 15)

decomposition = sm.tsa.seasonal_decompose(co2_levels['co2'])

# Draw the figure produced by the decomposition result
fig = decomposition.plot()
plt.show()

In [None]:
# Trend component of the decomposition, drawn on a wide, short figure
decomp_trend = decomposition.trend
ax = decomp_trend.plot(figsize=(14, 2))
ax.set(xlabel='Date', ylabel='CO2 - trend', title='CO2 - trend')
plt.show()

In [None]:
# Seasonal component of the decomposition, drawn on a wide, short figure
decomp_seasonal = decomposition.seasonal
ax = decomp_seasonal.plot(figsize=(14, 2))
ax.set(xlabel='Date', ylabel='CO2 - seasonality', title='CO2 - seasonality')
plt.show()

In [None]:
# Residual (noise) component of the decomposition, drawn on a wide, short figure
decomp_resid = decomposition.resid
ax = decomp_resid.plot(figsize=(14, 2))
ax.set(xlabel='Date', ylabel='CO2 - noise', title='CO2 - noise')
plt.show()

## 3.3. Analyzing airline data

In [None]:
# Load the airline data: 'Month' is parsed as datetime and used as the index
airline_path = '{}/airline_passengers.csv'.format(DATA_DIRECTORY)
airline = pd.read_csv(airline_path, index_col='Month', parse_dates=['Month'])

# Line plot of the passenger series
ax = airline.plot(figsize=(12, 10), fontsize=12, color='blue')

# Dashed red vertical marker at 1955-12-01
ax.axvline('1955-12-01', linestyle='--', color='red')

# Axis labels at font size 12, title at the default size
label_size = 12
ax.set_xlabel('Date', fontsize=label_size)
ax.set_ylabel('Number of Monthly Airline Passengers', fontsize=label_size)
ax.set_title('Number of Monthly Airline Passengers')
plt.show()

In [None]:
# Display the number of missing values in each column of airline
airline.isnull().sum()

In [None]:
# Display summary statistics of the airline DataFrame
airline.describe()

In [None]:
# Display a boxplot of the airline values (one box per column)
ax = airline.boxplot()

# Specify the title of the plot; the '\n' breaks it over two lines
ax.set_title('Boxplot of Monthly Airline\nPassengers Count', fontsize=20)
plt.show()

In [None]:
# Get the calendar month (1-12) of each date in the index of airline
index_month = airline.index.month

# Compute the mean number of passengers for each month of the year
mean_airline_by_month = airline.groupby(index_month).mean()

# Plot the mean number of passengers for each month of the year
mean_airline_by_month.plot()
# Enlarge the legend text on the current axes
plt.legend(fontsize=20)
plt.show()

In [None]:
import statsmodels.api as sm

# Perform time series decomposition
# NOTE(review): the whole DataFrame is passed here, not a single column — confirm airline has exactly one column.
decomposition = sm.tsa.seasonal_decompose(airline)

# Extract the trend and seasonal components
trend = decomposition.trend
seasonal = decomposition.seasonal

# Combine both components into one DataFrame with columns 'Trend' and 'Seasonal'
airline_decomposed = pd.DataFrame({'Trend':trend, 'Seasonal':seasonal})

In [None]:
# Display the first 5 rows of airline_decomposed
airline_decomposed.head(5)

In [None]:
# Plot the values of the airline_decomposed DataFrame
ax = airline_decomposed.plot(figsize=(12, 6), fontsize=15)

# Specify the axis label and legend in the same cell that creates the figure.
# When figures are rendered at the end of the cell that created them (e.g. the
# inline backend), changes made from a later cell would not appear on the
# plot, and plt.legend() there would target a new, empty figure.
ax.set_xlabel('Date', fontsize=15)
ax.legend(fontsize=15)
plt.show()

# 4. Visualizing multiple time series

## 4.1. Working with more than one time series

In [None]:
# Load the meat data: 'date' is parsed as datetime and used as the index
meat = pd.read_csv('{}/meat.csv'.format(DATA_DIRECTORY), parse_dates=['date'], index_col='date')

In [None]:
# Display the first 5 rows of meat
meat.head(5)

In [None]:
# Plotting multiple time series: every column of meat is drawn as its own line
ax = meat.plot(figsize=(15, 10), fontsize=14)
plt.show()

In [None]:
# Area chart of the meat columns
ax = meat.plot.area(figsize=(15, 10), fontsize=14)

## 4.2. Plot multiple time series

In [None]:
# Line plot of every meat column, coloured from the 'Dark2' colormap
ax = meat.plot(figsize=(14, 7), colormap='Dark2')
ax.set(xlabel='Date', ylabel='Production Volume (in tons)')
plt.show()

In [None]:
# Line plot of every meat column, annotated with a table of summary statistics
ax = meat.plot(figsize=(5, 8), colormap='Dark2')
df_summary = meat.describe()

# Attach the table at the top of the axes. Cells hold the summary values, row
# labels come from the summary index, column labels from the summary columns,
# and every column gets the same width of 0.5.
ax.table(
    cellText=df_summary.values,
    colWidths=len(meat.columns) * [0.5],
    rowLabels=df_summary.index,
    colLabels=df_summary.columns,
    loc='top',
)
plt.show()

In [None]:
# Facet plots: one subplot per column of meat, arranged on a 2x4 grid,
# with independent x and y axes for every subplot.
# NOTE(review): a 2x4 layout has room for at most 8 columns — confirm against the data.
meat.plot(subplots=True,
        linewidth=0.5,
        layout=(2, 4),
        figsize=(16, 12),
        sharex=False,
        sharey=False)
plt.show()

## 4.3. Find relationships between multiple time series


In [None]:
# Computing Correlation Matrices
# Import the correlation functions from the public scipy.stats namespace.
# scipy.stats.stats is a private, deprecated module path and should not be imported from.
# NOTE(review): the cells visible in this notebook compute correlations with
# DataFrame.corr and never call these three functions.
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import kendalltau

Pearson correlation matrix

In [None]:
# Pairwise Pearson correlations between the beef, veal and turkey columns
corr_p = meat[['beef', 'veal','turkey']].corr(method='pearson')
corr_p

Spearman correlation matrix

In [None]:
# Pairwise Spearman (rank) correlations between the beef, veal and turkey columns
corr_s = meat[['beef', 'veal','turkey']].corr(method='spearman')
corr_s

In [None]:
import seaborn as sns
# Pearson correlation matrix over all columns of meat, drawn as a heatmap
corr_mat = meat.corr(method='pearson')
sns.heatmap(corr_mat)

In [None]:
# Clustermap of the same correlation matrix (corr_mat)
sns.clustermap(corr_mat)

# 5. Case Study: Unemployment Rate

## 5.1. Summary stats

In [None]:
# Read in the jobs file; no dates are parsed and no index is set at this point
jobs = pd.read_csv('{}/employment.csv'.format(DATA_DIRECTORY))

In [None]:
# Display the first five rows of the DataFrame
jobs.head(5)

In [None]:
# Check the dtype of each column in the DataFrame
jobs.dtypes

In [None]:
# Convert the datestamp column to datetime values and make it the index,
# in one chained expression
jobs = (
    jobs
    .assign(datestamp=pd.to_datetime(jobs['datestamp']))
    .set_index('datestamp')
)

In [None]:
# Count the missing values in each column of jobs
jobs.isnull().sum()

In [None]:
# Generate a horizontal boxplot (vert=False) with small tick labels, one box per column
jobs.boxplot(fontsize=6, vert=False)
plt.show()

In [None]:
# Generate numerical summaries of each column
jobs.describe()

In [None]:
# Facet plots of the jobs dataset: one subplot per column on a 4x4 grid,
# sharing the x axis but not the y axis.
# NOTE(review): a 4x4 layout has room for at most 16 columns — confirm against the data.
jobs.plot(subplots=True, layout=(4, 4),
          figsize=(30, 16),
          sharex=True,
          sharey=False)
plt.show()

In [None]:
# Annotating events in the jobs dataset
ax = jobs.plot(figsize=(20, 14), colormap='Dark2')
# Dashed black vertical markers at 2008-01-01 and 2009-01-01
ax.axvline('2008-01-01', color='black', linestyle='--')
ax.axvline('2009-01-01', color='black', linestyle='--')
# Render explicitly, like the other plotting cells, instead of echoing the Line2D repr
plt.show()

In [None]:
# Monthly averages in the jobs dataset: mean of every column per calendar month (1-12)
index_month = jobs.index.month
jobs_by_month = jobs.groupby(index_month).mean()
jobs_by_month

In [None]:
# Plot the monthly means of every column
ax = jobs_by_month.plot(figsize=(12, 5), colormap='Dark2')
# Anchor the legend's centre-left point at (1.0, 0.5) in axes coordinates,
# which places it just outside the right edge of the plot
ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left')
# Render explicitly so the cell does not echo the Legend repr
plt.show()

## 5.2. Seasonality, trend, and noise in the time series

In [None]:
# Decomposing multiple time series with Python dictionaries
# Import the statsmodels library
import statsmodels.api as sm
# Initialize the dictionary that will map each series name to its decomposition
dict_ts = {}
# Extract the names of the time series (the column labels of jobs)
ts_names = jobs.columns
ts_names

In [None]:
# Decompose every column of jobs and store the result under the column name
for ts in ts_names:
    dict_ts[ts] = sm.tsa.seasonal_decompose(jobs[ts])

In [None]:
# Initialize one empty dictionary per decomposition component (trend, seasonal, residual)
dict_trend = {}
dict_seasonal = {}
dict_resid = {}

In [None]:
# Extract the trend, seasonal and residual components of every decomposed
# series into the matching dictionary, keyed by series name
dict_trend.update({name: dict_ts[name].trend for name in ts_names})
dict_seasonal.update({name: dict_ts[name].seasonal for name in ts_names})
dict_resid.update({name: dict_ts[name].resid for name in ts_names})

# Convert each dictionary to a DataFrame
resid_df = pd.DataFrame.from_dict(dict_resid)
seasonal_df = pd.DataFrame.from_dict(dict_seasonal)
trend_df = pd.DataFrame.from_dict(dict_trend)

In [None]:
# Seasonal component of every jobs series on one set of axes
seasonal_df.plot()
plt.title('Seasonal component of the jobs data set')
plt.xlabel('Years')
# Render explicitly so the cell does not echo the repr of the label Text object
plt.show()

In [None]:
# Trend component of every jobs series on one set of axes
trend_df.plot()
plt.title('Trend component of the jobs data set')
plt.xlabel('Years')
# Render explicitly so the cell does not echo the repr of the label Text object
plt.show()

In [None]:
# Residual (noise) component of every jobs series on one set of axes
resid_df.plot()
plt.title('Noise component of the jobs data set')
plt.xlabel('Years')
# Render explicitly so the cell does not echo the repr of the label Text object
plt.show()

## 5.3. Compute correlations between time series

### Plotting a clustermap of the jobs correlation matrix

In [None]:
# Get the Spearman correlation matrix of the trend_df DataFrame
trend_corr = trend_df.corr(method='spearman')

In [None]:
# Customize the clustermap of trend_corr: annotated cells with thin separating lines
fig = sns.clustermap(trend_corr, annot=True, linewidth=0.4,figsize=(15,10))

# Rotate the heatmap tick labels in the same cell that creates the figure.
# When figures are rendered at the end of the cell that created them (e.g. the
# inline backend), a rotation applied from a later cell would not be shown.
plt.setp(fig.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
plt.setp(fig.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
plt.show()