# Tate Temporary Exhibition Data Processing
This notebook details the process used to visualise the types and durations of the Tate temporary exhibitions.

In [None]:
#%pip install matplotlib
import datetime as dt

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

# Load the raw exhibition spreadsheet into a DataFrame (the path is relative, one level above the current directory).
exhibit_df = pd.read_excel('../data/tate_modern_temporary_exhibitions_2008to2016_public.xlsx')

In [None]:
# First look at the data: as the last expression of the cell, the frame itself is rendered as the output.

exhibit_df

It is noted that:
- empty cells are marked 'NaN'
- row 95 (DataFrame index label 94) is blank and can be deleted

In [None]:
# Remove the blank row (index label 94). Assigning the result back and passing
# errors='ignore' keeps this cell safe to re-run: once the label is gone, a second
# run does nothing instead of raising KeyError.
exhibit_df = exhibit_df.drop(index=94, errors='ignore')

In [None]:
# Summary statistics for every column, not only the numeric ones (include='all').
exhibit_df.describe(include = 'all')

In [None]:
# Print the frame summary produced by DataFrame.info(), to see how well populated each column is.
exhibit_df.info()

Columns of interest will be Exhibition, Era, Country, Medium, Start Day and End Day

The next commands confirm that only limited data is available in the other columns.

In [None]:
# Show how few values the Visitors and Co-Organisers columns contain by dropping
# their empty cells. Despite the *_df names, both results are pandas Series.
visitor_stats_df = exhibit_df['Visitors'].dropna()
print("These are the few populated visitor column values. Not sufficient data for useful analysis. \n")
print(visitor_stats_df)

Co_org_df = exhibit_df['Co-Organisers'].dropna()
print("These are the few populated Co-organiser values. Not sufficient data for useful analysis. \n")
print(Co_org_df)

In [None]:
# Medium is one of the key columns, so count the exhibitions recorded under each of its categories.
exhibitions_by_medium = exhibit_df.groupby('Medium')['Exhibition']
medium_df = exhibitions_by_medium.count()
print(medium_df)

In [None]:
# One Medium value is blank (index 32, Damien Hirst). That exhibition includes
# sculptures, so the blank is filled with 'Sculpture'.
# The filled column is assigned back to the frame: calling replace(inplace=True) on
# exhibit_df['Medium'] acts on a selection of the frame, which recent pandas versions
# warn about and which leaves the frame unchanged under Copy-on-Write.
exhibit_df['Medium'] = exhibit_df['Medium'].fillna('Sculpture')
print(exhibit_df.iloc[32])

In [None]:
# Some Medium values carry trailing spaces and some list several media, so the column
# needs cleaning. Each exhibition is reduced to one representative medium: Sculpture
# whenever it is among the listed media (it is the medium used for comparison),
# otherwise the first medium listed.
medium_replacements = {
    'Film, Installation': 'Film',
    'Painting, Drawing': 'Painting',
    'Painting, Sculpture': 'Sculpture',
    'Sculpture, Installation': 'Sculpture',
}

# Strip whitespace first so a combined value with stray spaces still matches a key
# above, then assign the result back to the frame. replace(inplace=True) on
# exhibit_df['Medium'] acts on a selection of the frame, which recent pandas versions
# warn about and which leaves the frame unchanged under Copy-on-Write.
exhibit_df['Medium'] = exhibit_df['Medium'].str.strip().replace(medium_replacements)

# Check the resulting categories
exhibit_df['Medium'].value_counts()

In [None]:
# To draw each exhibition as a horizontal bar, the bar width (the exhibition's duration)
# is needed. It is calculated here from Start Day and End Day; it should equal the
# existing duration column, which could serve as a quality check.

# Make sure both date columns are datetimes
exhibit_df['Start Day'] = pd.to_datetime(exhibit_df['Start Day'])
exhibit_df['End Day'] = pd.to_datetime(exhibit_df['End Day'])

# Day offsets measured from the start of the earliest exhibition
first_start = exhibit_df['Start Day'].min()
exhibit_df['days_to_start'] = (exhibit_df['Start Day'] - first_start).dt.days
exhibit_df['days_to_end'] = (exhibit_df['End Day'] - first_start).dt.days
exhibit_df['exhib_duration'] = exhibit_df['days_to_end'] - exhibit_df['days_to_start'] + 1  # +1 so the end date is included

# One colour per medium; also reused for the legend
medium_colors = {'Architecture': '#003f5c', 'Film': '#2f4b7c', 'Installation': '#665191', 'Mixed': '#a05195',
                 'Painting': '#d45087', 'Performance': '#f95d6a', 'Photography': '#ff7c43', 'Sculpture': '#ffa600'}

fig, ax = plt.subplots(figsize=(20, 10))

# One bar per exhibition, placed on its medium's row: it starts at the start date and
# is as wide as the duration in days.
for medium, duration, start in zip(exhibit_df['Medium'], exhibit_df['exhib_duration'], exhibit_df['Start Day']):
    ax.barh(y=medium, width=duration, left=start, color=medium_colors[medium])

# Legend: one coloured patch per entry in medium_colors
patches = [matplotlib.patches.Patch(color=colour, label=medium_name)
           for medium_name, colour in medium_colors.items()]
ax.legend(handles=patches, fontsize=11)

ax.set_title('Tate Temporary Exhibitions 2007 - 2018 by Medium', fontsize=28, color='#003f5c')
ax.set_ylabel('Medium', fontsize=18, color='#003f5c')
ax.set_xlabel('Date', fontsize=18, color='#003f5c')
ax.grid(True, axis='x', color='grey', linewidth=1.2)

# Save before showing so the file is written from the finished figure. The former
# orientation="landscape" argument is omitted: matplotlib documents it as supported
# only by the PostScript backend, so it had no effect on a PNG.
fig.savefig('../visualisations/tate_temp_exhibitions_by_Medium.png')
plt.show()

In [None]:
# Investigating exhibitions by start date. The exhibition ID is provided in descending
# order of start date.
# This cell uses the 'exhib_duration' column and the medium_colors mapping created in
# the preceding plotting cell.

fig, ax = plt.subplots(figsize=(20, 10))

# One bar per exhibition on its own ID row, coloured by medium
for exhibition_id, medium, duration, start in zip(exhibit_df['ID'], exhibit_df['Medium'],
                                                  exhibit_df['exhib_duration'], exhibit_df['Start Day']):
    ax.barh(y=exhibition_id, width=duration, left=start, color=medium_colors[medium])

# Legend: one coloured patch per entry in medium_colors
patches = [matplotlib.patches.Patch(color=colour, label=medium_name)
           for medium_name, colour in medium_colors.items()]
ax.legend(handles=patches, fontsize=11)

ax.set_title('Tate Temporary Exhibitions 2007 - 2018 by ID', fontsize=20, color='#003f5c')
ax.set_ylabel('Exhibition ID', fontsize=18, color='#003f5c')
ax.set_ylim(95, 0)  # larger limit first, so the axis is inverted and ID 0 sits at the top
ax.set_xlabel('Date', fontsize=18, color='#003f5c')
ax.grid(True, axis='x', color='grey', linewidth=1.2)

# Save before showing so the file is written from the finished figure
fig.savefig('../visualisations/tate_temp_exhibitions_by_id.png')
plt.show()

In [None]:
# Summary statistics again, with describe()'s default column selection this time.
exhibit_df.describe()

In [None]:
# Print the frame summary again, now that the cells above have modified the frame.
exhibit_df.info()

In [None]:
# Spot-check ten random rows; the fixed seed keeps the sample identical on every run.
exhibit_df.sample(10, random_state=42)

In [None]:
# Create a CSV with the data used for the Tate temporary exhibition analysis
# (index=False leaves the DataFrame index out of the file).

exhibit_df.to_csv("../data/tate_temp_exhibitions_dataset.csv", index = False)