In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

## Visualizing date-time data with UFOs!

In [2]:
# Load the UFO sightings CSV from the `data` folder located next to the
# current working directory, then list the columns that came with it.
ufo = pd.read_csv(Path.cwd().parent / 'data' / 'ufo2.csv')
ufo.columns

FileNotFoundError: [Errno 2] No such file or directory: '/Users/austinlasseter/atelier/generalassembly/intuit-ds-13/04-pandas-data-visualization/lecture/data/ufo2.csv'

In [None]:
# Summary statistics for the time variable
ufo.loc[:, 'Time'].describe()

In [None]:
# Inspect the dtype pandas assigned to the time column
ufo.Time.dtype

In [None]:
# Convert the `Time` strings to the pandas datetime format.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where format inference is already the default behavior.
ufo['Time'] = pd.to_datetime(ufo['Time'])
ufo['Time'].dtype  # Now it's dt format

In [None]:
# Derive a calendar-date column from the timestamp via the `.dt` accessor
ufo['Date'] = ufo.Time.dt.date
ufo.head()

In [None]:
# Break the timestamp into its individual components, one new column each
ufo = ufo.assign(
    Year=ufo['Time'].dt.year,
    Month=ufo['Time'].dt.month,
    Day=ufo['Time'].dt.day,
    Hour=ufo['Time'].dt.hour,
    Weekday=ufo['Time'].dt.weekday,
)
ufo.head()

In [None]:
# datetime "math": subtract the earliest timestamp from the latest one and
# read off the whole-day part of the difference
(ufo['Time'].max() - ufo['Time'].min()).days

## Creating a categorical variable with proper sorting

In [None]:
# Create a weekday variable using map and numeration.
# `Weekday` comes from `dt.weekday`, which numbers the days Monday=0 ... Sunday=6,
# so the lookup has to start at Monday; starting at Sunday would label every
# sighting with the wrong day.
ufo['weekday_name'] = ufo['Weekday'].map({0: 'Monday',
                                          1: 'Tuesday',
                                          2: 'Wednesday',
                                          3: 'Thursday',
                                          4: 'Friday',
                                          5: 'Saturday',
                                          6: 'Sunday'})
ufo.head()

In [None]:
# the mapped labels are stored as an "object" type variable
ufo['weekday_name'].dtype

In [None]:
# the drawback: the counts are not listed in day-of-week order.
ufo.weekday_name.value_counts()

In [None]:
# the same ordering problem shows up when grouping: the index is sorted
# as plain text rather than in day-of-week order.
daycounts = (
    ufo.groupby(['weekday_name'])[['weekday_name']]
    .count()
    .sort_index()
)
daycounts

In [None]:
# ...and the bars of the chart inherit that same order.
daycounts.plot.bar();

In [None]:
# the solution is to create a new var with dtype 'category'.
# The lookup starts at Monday because `dt.weekday` (the source of `Weekday`)
# numbers the days Monday=0 ... Sunday=6.
ufo['weekday_cats'] = ufo['Weekday'].map({0: 'Monday',
                                          1: 'Tuesday',
                                          2: 'Wednesday',
                                          3: 'Thursday',
                                          4: 'Friday',
                                          5: 'Saturday',
                                          6: 'Sunday'}).astype('category')  # this is the money line
ufo['weekday_cats'].dtypes

In [None]:
# Spell out the desired day order, then turn the categorical column into an
# ordered one that follows this list
day_order = [
    'Sunday',
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
]
ufo['weekday_cats'] = ufo['weekday_cats'].cat.set_categories(
    new_categories=day_order,
    ordered=True,
)

In [None]:
# now the days will appear in the proper order!
# value_counts() on its own orders the result by frequency; sorting by the
# categorical index is what lists the days in their category order.
ufo['weekday_cats'].value_counts().sort_index()

In [None]:
# this will also transfer to groupby statements.
# `observed=False` is passed explicitly so every category keeps a row: the
# implicit default is deprecated in recent pandas releases and is slated to change.
daycounts = (
    ufo.groupby(['weekday_cats'], observed=False)[['weekday_cats']]
    .count()
    .sort_index()
)
daycounts

In [None]:
# the category order carries over to the bar chart as well
daycounts.plot.bar();

## Plotting line charts with time series data

In [None]:
# The five dates with the most sightings (value_counts is descending by
# default and head() returns five rows by default)
ufo['Date'].value_counts().head()

In [None]:
# # Set the Time variable as the dataframe index (This will make plotting timelines easier)
# ufo['Time2']=ufo['Time']
# ufo=ufo.set_index('Time2')

In [None]:
# Create a variable to count the number of sightings by date
# ufo['sightings']=1
# dates = ufo.groupby('Date')['sightings'].sum()
# ufo = ufo.drop('sightings', axis=1)
# dates.sort_values(ascending=False).head()

In [None]:
# Create a new dataframe of sightings
# dates_df = pd.DataFrame(dates)
# dates_df= dates_df.reset_index()
# dates_df.sort_values('sightings', ascending=False).head()

In [None]:
# Merge the daily number of sightings back into the original dataframe
# ufo2 = pd.merge(ufo, dates_df, on='Date', how='left')
# ufo2.to_csv('../data/ufo2.csv')

In [None]:
# what's the average number of sightings on a given day?
# Assumes `sightings` holds the per-date total repeated on every row of that
# date (as in the commented-out merge that built ufo2.csv) — TODO confirm.
# Keep one row per date before averaging; a plain row-wise mean would count
# each busy day once per sighting. Dates without any sighting are not
# represented in the data, so this is the mean over days with at least one.
ufo.drop_duplicates(subset='Date')['sightings'].mean()

In [None]:
# Sightings by year, drawn on a wide 18x6 figure
sns.set(rc={'figure.figsize': (18, 6)})
sns.lineplot(data=ufo, x='Year', y='sightings');

In [None]:
# Trend over weekday
# Plot against the ordered categorical column so the x-axis follows the
# declared day order; the plain-string `weekday_name` column carries no order.
sns.lineplot(x='weekday_cats', y='sightings', data=ufo, ci=None);

In [None]:
# Sightings by hour of the day
sns.lineplot(data=ufo, x='Hour', y='sightings');

In [None]:
# Sightings by calendar month
sns.lineplot(data=ufo, x='Month', y='sightings');

## Plot multiple lines in color

In [None]:
# There are too many distinct color labels to plot; start by printing the
# 15 most frequent ones.
print(ufo['Colors Reported'].value_counts().iloc[:15])

In [None]:
# Collapse the reported colors into 4 categories: the three exact labels
# below keep their own (lower-cased) name, everything else becomes 'other'.
ufo['color'] = (
    ufo['Colors Reported']
    .map({'ORANGE': 'orange', 'RED': 'red', 'GREEN': 'green'})
    .fillna('other')
)
ufo['color'].value_counts()

In [None]:
# Alternative: we can create new columns for specific colors using numpy. This is called "one-hot encoding".
# (`np` must be imported with the other libraries at the top of the notebook.)
ufo['orange'] = np.where(ufo['Colors Reported'] == 'ORANGE', 1, 0)
ufo['red'] = np.where(ufo['Colors Reported'] == 'RED', 1, 0)
ufo['green'] = np.where(ufo['Colors Reported'] == 'GREEN', 1, 0)
# 1 for every row that matched none of the three colors above
ufo['other_color'] = np.where(ufo['Colors Reported'].isin(['ORANGE', 'RED', 'GREEN']), 0, 1)
ufo.head()

In [None]:
# Collapse the original dataframe to one row per (year, color) pair, counting
# sightings by summing a column of ones, then show the rows for 2009
ufo['sighting'] = 1
dates = ufo.groupby(['Year', 'color'])['sighting'].sum()
dates_df = dates.to_frame().reset_index()
dates_df.query('Year == 2009')

In [None]:
# Line chart of the yearly counts across all color groups
sns.set(rc={'figure.figsize': (18, 6)})
sns.lineplot(data=dates_df, x="Year", y="sighting");

In [None]:
# Keep only the years after 2000 and before 2015, and drop the catch-all
# 'other' color (this makes the chart a lot easier to read)
dates_df2 = dates_df.loc[
    dates_df['Year'].gt(2000)
    & dates_df['Year'].lt(2015)
    & dates_df['color'].ne('other')
]

In [None]:
# Same chart, restricted to the shorter time window
sns.set(rc={'figure.figsize': (18, 6)})
sns.lineplot(data=dates_df2, x="Year", y="sighting");

In [None]:
# Plot the lines in color!!
# The palette is given as a {hue level: color} mapping so each line gets its
# matching color no matter in which order the levels occur in the data; a
# plain list would be paired with the levels by position only.
sns.set(rc={'figure.figsize':(18, 6)})
sns.lineplot(x="Year", y="sighting", hue="color",
             palette={'green': 'green', 'orange': 'orange', 'red': 'red'},
             data=dates_df2);

In [None]:
# Same chart built one line at a time: filter the data per color and draw
# each subset with its own hex code.
hex_by_color = {'orange': '#FD9903', 'green': '#04D221', 'red': '#FD1604'}
for level, hex_code in hex_by_color.items():
    one_color = dates_df2[dates_df2['color'] == level]
    sns.lineplot(x="Year", y="sighting", color=hex_code, data=one_color);

In [None]:
# Distinguish the color groups by line style and marker rather than by hue.
sns.lineplot(
    data=dates_df2,
    x="Year",
    y="sighting",
    style='color',
    markers=True,
    dashes=True,
);