# Exploring Time Series

In [None]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

import warnings
# Silence every warning for the rest of the session: no category or module filter is given.
# NOTE(review): this also hides deprecation notices raised by the libraries imported above - confirm that is intended.
warnings.filterwarnings('ignore')


## Loading Data

In [None]:
# `usecols` restricts the load to the columns this section works with.
divorce = pd.read_csv(
    '../data/divorces.csv',
    usecols=['Divorce_date', 'Marriage_duration', 'Date_of_marriage'],
)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
divorce.head()

## Importing DateTime data 

Datetime columns have to be explicitly declared to pandas; otherwise they are loaded as plain strings.

In [None]:
# Check the dtype pandas assigned to each column; no date parsing has been requested yet.
divorce.dtypes

In [None]:
# `parse_dates` converts the listed columns to datetime while the file is being read.
divorce = pd.read_csv(
    '../data/divorces.csv',
    usecols=['Divorce_date', 'Marriage_duration', 'Date_of_marriage'],
    parse_dates=['Divorce_date'],
)

In [None]:
# Preview only the first rows of the reloaded frame instead of rendering all of it.
divorce.head()

In [None]:
# Re-check the dtypes after the reload that passed parse_dates=['Divorce_date'].
divorce.dtypes

In [None]:
# A column that is already loaded in memory can be converted afterwards with pd.to_datetime.
divorce['Date_of_marriage'] = pd.to_datetime(divorce['Date_of_marriage'])

In [None]:
# Check the dtypes again now that Date_of_marriage has been passed through pd.to_datetime.
divorce.dtypes

In [None]:
# pd.to_datetime can also assemble a single new datetime column from separate year, month and day columns.
# For this to work, the columns have to be named "year", "month" and "day".

# The other way around: pull one component (here the month) out of a datetime column through the .dt accessor.

divorce['Date_of_marriage_month'] = divorce['Date_of_marriage'].dt.month

In [None]:
# Display the month column derived from Date_of_marriage.
divorce['Date_of_marriage_month']

## Visualizing patterns over time

In the line plots, the solid blue line shows the mean of the `y` variable at each `x` value, and the lighter shaded band around it shows the 95% confidence interval for that mean.

In [None]:
# Marriage duration aggregated per month of marriage.
sns.lineplot(
    data=divorce,
    x='Date_of_marriage_month',
    y='Marriage_duration',
)

## Correlation

Correlation describes the direction and strength of the relationship between two variables

Correlations can help us use variables to predict future outcomes.


In [None]:
# Reload with the additional columns used in the correlation analysis,
# parsing both date columns while reading.
divorce = pd.read_csv(
    '../data/divorces.csv',
    usecols=[
        'Divorce_date',
        'Marriage_duration',
        'Date_of_marriage',
        'Num_Children',
        'Monthly_income_partner_man_peso',
        'Monthly_income_partner_woman_peso',
    ],
    parse_dates=['Divorce_date', 'Date_of_marriage'],
)

# Keep the year of marriage as a column of its own.
divorce['Date_of_marriage_year'] = divorce['Date_of_marriage'].dt.year

The `corr()` method calculates the Pearson correlation coefficient, which only captures linear relationships.

In [None]:
# Pairwise correlation between the selected columns.
corr_columns = [
    'Monthly_income_partner_man_peso',
    'Monthly_income_partner_woman_peso',
    'Date_of_marriage_year',
    'Marriage_duration',
    'Num_Children',
]
divorce[corr_columns].corr()

It turns out to be very handy to represent the correlation matrix with a heatmap:

In [None]:
# Build the correlation matrix first, then draw it with each coefficient annotated in its cell.
corr_matrix = divorce[
    [
        'Monthly_income_partner_man_peso',
        'Monthly_income_partner_woman_peso',
        'Date_of_marriage_year',
        'Marriage_duration',
        'Num_Children',
    ]
].corr()
sns.heatmap(corr_matrix, annot=True)

Scatter plots are a very nice complement to our correlation matrix, since there we can see if the relationships between our variables are actually linear or not.

In [None]:
# Income of one partner against the income of the other.
sns.scatterplot(
    data=divorce,
    x='Monthly_income_partner_man_peso',
    y='Monthly_income_partner_woman_peso',
)

In [None]:
# The pairplot goes beyond a single scatterplot and draws one for each pair of variables in the dataframe
sns.pairplot(data=divorce)

In [None]:
# The `vars` parameter declutters the plot by restricting it to the listed columns.
sns.pairplot(
    data=divorce,
    vars=[
        'Divorce_date',
        'Monthly_income_partner_man_peso',
        'Monthly_income_partner_woman_peso',
        'Num_Children',
    ],
)

## Factor relationships and distributions



In [None]:
# Reload including the two education-level columns, parsing both date columns while reading.
divorce = pd.read_csv(
    '../data/divorces.csv',
    usecols=[
        'Divorce_date',
        'Marriage_duration',
        'Date_of_marriage',
        'Num_Children',
        'Monthly_income_partner_man_peso',
        'Monthly_income_partner_woman_peso',
        'Level_of_education_partner_man',
        'Level_of_education_partner_woman',
    ],
    parse_dates=['Divorce_date', 'Date_of_marriage'],
)

# Keep the year of marriage as a column of its own.
divorce['Date_of_marriage_year'] = divorce['Date_of_marriage'].dt.year

In [None]:
# Load every column of the file, with no usecols and no date parsing.
# NOTE(review): this rebinds `divorce`, so the frame built in the previous cell (parsed dates and
# Date_of_marriage_year) is discarded - confirm whether both loads are really needed.
divorce = pd.read_csv('../data/divorces.csv')

In [None]:
# List the column labels of the frame currently bound to `divorce`.
divorce.columns

In [None]:
# Frequency of each distinct value in Level_of_education_partner_man.
divorce['Level_of_education_partner_man'].value_counts()

In [None]:
# Histogram of Marriage_duration, colored by the value of Level_of_education_partner_man.
sns.histplot(
    data=divorce,
    x='Marriage_duration',
    hue='Level_of_education_partner_man',
)

In [None]:
# KDE plots offer a clearer way to display this kind of information, but keep in mind that the
# curves are smoothed. It's very important to parametrize that smoothing properly.
sns.kdeplot(
    data=divorce,
    x='Marriage_duration',
    hue='Level_of_education_partner_man',
)

In [None]:
# cut=0 truncates each curve at the data limits, so no negative marriage duration is drawn.
sns.kdeplot(
    data=divorce,
    x='Marriage_duration',
    hue='Level_of_education_partner_man',
    cut=0,
)

In [None]:
# Same plot with cumulative=True: each curve is the estimated cumulative distribution rather than the density.
sns.kdeplot(
    data=divorce,
    x='Marriage_duration',
    hue='Level_of_education_partner_man',
    cut=0,
    cumulative=True,
)