# Import Statements

### I linked the google sheet to this notebook using this source: https://towardsdatascience.com/from-google-sheet-to-your-jupyter-notebook-ccdbf28fbf1b. Any updates to the google sheet will be reflected automatically in this notebook

In [123]:
import matplotlib.pyplot as plt 
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.graph_objects as go
import datetime

In [124]:
#Importing the module
import gspread
from df2gspread import df2gspread as d2g
from oauth2client.service_account import ServiceAccountCredentials

# Link to Google Sheets

Connect the Google Sheets API

In [125]:
#The scope is always look like this so we did not need to change anything
scope = [
   'https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']

#Name of our Service Account Key
google_key_file = 'jupyter-spring2020ctc-connect-3bde2ba97cd1.json'

credentials = ServiceAccountCredentials.from_json_keyfile_name(google_key_file, scope)
gc = gspread.authorize(credentials)

# Spring 2020 Data Cleaning

Open and save the "Spring 2020 CTC Recording" Google Sheets document, specifically the "Data-Analysis-Stats" sheet

In [126]:
#Opening the worksheet by using Worksheet ID
spring20_spreadsheet_key = '1OiGBYsa9eK26lAaznmiiczgSX17GHlOWoS0crNza-vo'
spring20_workbook = gc.open_by_key(spring20_spreadsheet_key)

#Selecting which sheet to pulling the data
spring20_sheet = spring20_workbook.worksheet('Data-Analysis-Stats')

#Pulling the data and transform it to the data frame
spring20_values = spring20_sheet.get_all_values()
spring20 = pd.DataFrame(spring20_values[1:], columns = spring20_values[0])

Open the dataframe

In [127]:
spring20.head()

Unnamed: 0,CURRENT STANDINGS BELOW,360,68,80,89,58.5,64.5,0,0.1,0.2,0.3,0.4,0.5,0.6
0,Fraternity,Current Total,Week 1 (2/3),Week 2 (2/10),Week 3 (2/17),Week 4 (2/14),Week 5 (3/2),SPR. BREAK(3/9),SPR. BREAK (3/16),Week 6 (3/23),Week 7 (3/30),Week 8 (4/6),Week 9 (4/13),Week 10 (4/20)
1,Chi Phi,0,,,,,,,,,,,,
2,Sigma Pi,0,,,,,,,,,,,,
3,SAE,4,,,,,4,,,,,,,
4,Elmo,0,,,,,,,,,,,,


Fill blank cells with NA's

In [128]:
spring20.replace(r'', np.NaN, inplace=True)
spring20.head()

Unnamed: 0,CURRENT STANDINGS BELOW,360,68,80,89,58.5,64.5,0,0.1,0.2,0.3,0.4,0.5,0.6
0,Fraternity,Current Total,Week 1 (2/3),Week 2 (2/10),Week 3 (2/17),Week 4 (2/14),Week 5 (3/2),SPR. BREAK(3/9),SPR. BREAK (3/16),Week 6 (3/23),Week 7 (3/30),Week 8 (4/6),Week 9 (4/13),Week 10 (4/20)
1,Chi Phi,0,,,,,,,,,,,,
2,Sigma Pi,0,,,,,,,,,,,,
3,SAE,4,,,,,4,,,,,,,
4,Elmo,0,,,,,,,,,,,,


Replace the header with the second row of the dataframe

In [129]:
new_header = spring20.iloc[0] # grab the first row for the header
spring20 = spring20[1:] # take the data less the header row
spring20.columns = new_header # set the header row as the spring20 header
spring20.head()

Unnamed: 0,Fraternity,Current Total,Week 1 (2/3),Week 2 (2/10),Week 3 (2/17),Week 4 (2/14),Week 5 (3/2),SPR. BREAK(3/9),SPR. BREAK (3/16),Week 6 (3/23),Week 7 (3/30),Week 8 (4/6),Week 9 (4/13),Week 10 (4/20)
1,Chi Phi,0,,,,,,,,,,,,
2,Sigma Pi,0,,,,,,,,,,,,
3,SAE,4,,,,,4.0,,,,,,,
4,Elmo,0,,,,,,,,,,,,
5,,22,,3.0,2.0,3.5,13.5,,,,,,,


Drop "Current Total" column- we will not be needing this

In [130]:
spring20.columns
spring20.drop(columns=['Current Total'], inplace=True) # drop irrelevant column
spring20.head()

Unnamed: 0,Fraternity,Week 1 (2/3),Week 2 (2/10),Week 3 (2/17),Week 4 (2/14),Week 5 (3/2),SPR. BREAK(3/9),SPR. BREAK (3/16),Week 6 (3/23),Week 7 (3/30),Week 8 (4/6),Week 9 (4/13),Week 10 (4/20)
1,Chi Phi,,,,,,,,,,,,
2,Sigma Pi,,,,,,,,,,,,
3,SAE,,,,,4.0,,,,,,,
4,Elmo,,,,,,,,,,,,
5,,,3.0,2.0,3.5,13.5,,,,,,,


Delete scrap notes at the bottom of the sheet

In [131]:
spring20 = spring20.iloc[0:44,] # keep only relevant rows

Fill NA's in "Fraternity" column using the forward fill method as appropriate

In [132]:
spring20['Fraternity'].fillna(method='ffill', inplace=True) # forward fill NA values for Fraternity name 
spring20.head()

Unnamed: 0,Fraternity,Week 1 (2/3),Week 2 (2/10),Week 3 (2/17),Week 4 (2/14),Week 5 (3/2),SPR. BREAK(3/9),SPR. BREAK (3/16),Week 6 (3/23),Week 7 (3/30),Week 8 (4/6),Week 9 (4/13),Week 10 (4/20)
1,Chi Phi,,,,,,,,,,,,
2,Sigma Pi,,,,,,,,,,,,
3,SAE,,,,,4.0,,,,,,,
4,Elmo,,,,,,,,,,,,
5,Elmo,,3.0,2.0,3.5,13.5,,,,,,,


Delete rows with empty weekly bags data (duplicate rows of each house caused by formats of the google sheet)

In [133]:
spring20 = spring20[~spring20['Fraternity'].duplicated(keep=False) |
                    spring20[spring20.columns[1:]].notnull().any(axis=1)]
spring20.head()

Unnamed: 0,Fraternity,Week 1 (2/3),Week 2 (2/10),Week 3 (2/17),Week 4 (2/14),Week 5 (3/2),SPR. BREAK(3/9),SPR. BREAK (3/16),Week 6 (3/23),Week 7 (3/30),Week 8 (4/6),Week 9 (4/13),Week 10 (4/20)
1,Chi Phi,,,,,,,,,,,,
2,Sigma Pi,,,,,,,,,,,,
3,SAE,,,,,4.0,,,,,,,
5,Elmo,,3.0,2.0,3.5,13.5,,,,,,,
7,Delta Sig,,,3.5,7.0,,,,,,,,


Tidy the dataframe by melting the week columns into a better format

In [134]:
spring20 = pd.melt(spring20, id_vars=['Fraternity'], value_vars=spring20.columns[1:])
spring20.columns=['Fraternity', 'Week', 'Bags']
spring20.head()

Unnamed: 0,Fraternity,Week,Bags
0,Chi Phi,Week 1 (2/3),
1,Sigma Pi,Week 1 (2/3),
2,SAE,Week 1 (2/3),
3,Elmo,Week 1 (2/3),
4,Delta Sig,Week 1 (2/3),


Convert "Week" column into standard date-time string, and convert to date-time type

In [135]:
spring20['Week'] = spring20['Week'].apply(lambda x: x[x.find('(')+1:-1] + '/20')
spring20['Week']= pd.to_datetime(spring20['Week'].astype(str), format='%m/%d/%y')
spring20.head()

Unnamed: 0,Fraternity,Week,Bags
0,Chi Phi,2020-02-03,
1,Sigma Pi,2020-02-03,
2,SAE,2020-02-03,
3,Elmo,2020-02-03,
4,Delta Sig,2020-02-03,


Fill NaN with 0's and convert "Bags" column to float type.

In [136]:
spring20.fillna(0, inplace=True)
spring20['Bags'] = pd.to_numeric(spring20['Bags'])
spring20.head()

Unnamed: 0,Fraternity,Week,Bags
0,Chi Phi,2020-02-03,0.0
1,Sigma Pi,2020-02-03,0.0
2,SAE,2020-02-03,0.0
3,Elmo,2020-02-03,0.0
4,Delta Sig,2020-02-03,0.0


Add "Spring 2020" label. Enjoy clean and tidy data!

In [137]:
spring20['Semester'] = 'Spring 2020'
spring20.head()

Unnamed: 0,Fraternity,Week,Bags,Semester
0,Chi Phi,2020-02-03,0.0,Spring 2020
1,Sigma Pi,2020-02-03,0.0,Spring 2020
2,SAE,2020-02-03,0.0,Spring 2020
3,Elmo,2020-02-03,0.0,Spring 2020
4,Delta Sig,2020-02-03,0.0,Spring 2020


# Fall 2019 Data Cleaning

Open and save the "Fall 2019 CTC Recording" Google Sheets document, specifically the "Data-Analysis-Stats" sheet

In [138]:
#Opening the worksheet by using Worksheet ID
fall19_spreadsheet_key = '14BYq15BfBUclNLbO8IRSAQeG98qCGDG3n59Hkfh3huU'
fall19_workbook = gc.open_by_key(fall19_spreadsheet_key)

#Selecting which sheet to pulling the data
fall19_sheet = fall19_workbook.worksheet('Data-Analysis-Stats')

#Pulling the data and transform it to the data frame
fall19_values = fall19_sheet.get_all_values()
fall19 = pd.DataFrame(fall19_values[1:], columns = fall19_values[0])

Open the dataframe

In [139]:
fall19.head()

Unnamed: 0,CURRENT STANDINGS BELOW,550.5,32,47.5,38.5,25,56.5,107.5,24.5,72,77,40.5,29.5
0,Fraternity,Current Total,Week 1 (9/8),Week 2 (9/15),Week 3 (9/22),Week 4 (9/29),Week 5 (10/13),Week 6 (10/20),Week 7 (10/27),Week 8 (11/3),Week 9 (11/10),Week 10 (11/17),Week 11 (11/24)
1,Chi Phi,0,,,,,,,,,,,
2,Sigma Pi,0,,,,,,,,,,,
3,SAE,9,,,,,,9,,,,,
4,,0,,,,,,,,,,,


Fill blank cells with NA's

In [140]:
fall19.replace(r'', np.NaN, inplace=True)
fall19.head()

Unnamed: 0,CURRENT STANDINGS BELOW,550.5,32,47.5,38.5,25,56.5,107.5,24.5,72,77,40.5,29.5
0,Fraternity,Current Total,Week 1 (9/8),Week 2 (9/15),Week 3 (9/22),Week 4 (9/29),Week 5 (10/13),Week 6 (10/20),Week 7 (10/27),Week 8 (11/3),Week 9 (11/10),Week 10 (11/17),Week 11 (11/24)
1,Chi Phi,0,,,,,,,,,,,
2,Sigma Pi,0,,,,,,,,,,,
3,SAE,9,,,,,,9,,,,,
4,,0,,,,,,,,,,,


Replace the header with the second row of the dataframe

In [141]:
new_header = fall19.iloc[0] # grab the first row for the header
fall19 = fall19[1:] # take the data less the header row
fall19.columns = new_header # set the header row as the spring20 header
fall19.head()

Unnamed: 0,Fraternity,Current Total,Week 1 (9/8),Week 2 (9/15),Week 3 (9/22),Week 4 (9/29),Week 5 (10/13),Week 6 (10/20),Week 7 (10/27),Week 8 (11/3),Week 9 (11/10),Week 10 (11/17),Week 11 (11/24)
1,Chi Phi,0,,,,,,,,,,,
2,Sigma Pi,0,,,,,,,,,,,
3,SAE,9,,,,,,9.0,,,,,
4,,0,,,,,,,,,,,
5,Elmo,0,,,,,,,,,,,


Drop "Current Total" column- we will not be needing this

In [142]:
fall19.columns
fall19.drop(columns=['Current Total'], inplace=True) # drop irrelevant column
fall19.head()

Unnamed: 0,Fraternity,Week 1 (9/8),Week 2 (9/15),Week 3 (9/22),Week 4 (9/29),Week 5 (10/13),Week 6 (10/20),Week 7 (10/27),Week 8 (11/3),Week 9 (11/10),Week 10 (11/17),Week 11 (11/24)
1,Chi Phi,,,,,,,,,,,
2,Sigma Pi,,,,,,,,,,,
3,SAE,,,,,,9.0,,,,,
4,,,,,,,,,,,,
5,Elmo,,,,,,,,,,,


In [143]:
fall19

Unnamed: 0,Fraternity,Week 1 (9/8),Week 2 (9/15),Week 3 (9/22),Week 4 (9/29),Week 5 (10/13),Week 6 (10/20),Week 7 (10/27),Week 8 (11/3),Week 9 (11/10),Week 10 (11/17),Week 11 (11/24)
1,Chi Phi,,,,,,,,,,,
2,Sigma Pi,,,,,,,,,,,
3,SAE,,,,,,9,,,,,
4,,,,,,,,,,,,
5,Elmo,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
99,,VASST,0,,,,,,,,,
100,,,0,,,,,,,,,
101,,,3,,,,,,,,,
102,,Club Rugby,0,,,,,,,,,


Delete scrap notes at the bottom of the sheet

In [144]:
fall19 = fall19.iloc[0:47,] # keep only relevant rows

Fill NA's in "Fraternity" column using the forward fill method as appropriate

In [145]:
fall19['Fraternity'].fillna(method='ffill', inplace=True) # forward fill NA values for Fraternity name 
fall19.head()

Unnamed: 0,Fraternity,Week 1 (9/8),Week 2 (9/15),Week 3 (9/22),Week 4 (9/29),Week 5 (10/13),Week 6 (10/20),Week 7 (10/27),Week 8 (11/3),Week 9 (11/10),Week 10 (11/17),Week 11 (11/24)
1,Chi Phi,,,,,,,,,,,
2,Sigma Pi,,,,,,,,,,,
3,SAE,,,,,,9.0,,,,,
4,SAE,,,,,,,,,,,
5,Elmo,,,,,,,,,,,


Delete rows with empty weekly bags data (duplicate rows of each house caused by formats of the google sheet)

In [146]:
fall19 = fall19[~fall19['Fraternity'].duplicated(keep=False) |
                    fall19[fall19.columns[1:]].notnull().any(axis=1)]
fall19.head()

Unnamed: 0,Fraternity,Week 1 (9/8),Week 2 (9/15),Week 3 (9/22),Week 4 (9/29),Week 5 (10/13),Week 6 (10/20),Week 7 (10/27),Week 8 (11/3),Week 9 (11/10),Week 10 (11/17),Week 11 (11/24)
1,Chi Phi,,,,,,,,,,,
2,Sigma Pi,,,,,,,,,,,
3,SAE,,,,,,9.0,,,,,
7,Elmo,8.5,,4.0,,9.0,1.0,,17.0,9.0,5.0,
9,Delta Sig,2.0,,,7.0,,3.5,,12.0,,1.5,


Tidy the dataframe by melting the week columns into a better format

In [147]:
fall19 = pd.melt(fall19, id_vars=['Fraternity'], value_vars=fall19.columns[1:])
fall19.columns=['Fraternity', 'Week', 'Bags']
fall19.head()

Unnamed: 0,Fraternity,Week,Bags
0,Chi Phi,Week 1 (9/8),
1,Sigma Pi,Week 1 (9/8),
2,SAE,Week 1 (9/8),
3,Elmo,Week 1 (9/8),8.5
4,Delta Sig,Week 1 (9/8),2.0


Convert "Week" column into standard date-time string, and convert to date-time type

In [148]:
fall19['Week'] = fall19['Week'].apply(lambda x: x[x.find('(')+1:-1] + '/19')
fall19['Week']= pd.to_datetime(fall19['Week'].astype(str), format='%m/%d/%y')
fall19.head()

Unnamed: 0,Fraternity,Week,Bags
0,Chi Phi,2019-09-08,
1,Sigma Pi,2019-09-08,
2,SAE,2019-09-08,
3,Elmo,2019-09-08,8.5
4,Delta Sig,2019-09-08,2.0


Fill NaN with 0's and convert "Bags" column to float type.

In [149]:
fall19.fillna(0, inplace=True)
fall19['Bags'] = pd.to_numeric(fall19['Bags'])
fall19.head()

Unnamed: 0,Fraternity,Week,Bags
0,Chi Phi,2019-09-08,0.0
1,Sigma Pi,2019-09-08,0.0
2,SAE,2019-09-08,0.0
3,Elmo,2019-09-08,8.5
4,Delta Sig,2019-09-08,2.0


Add "Fall 2019" label. Enjoy clean and tidy data!

In [150]:
fall19['Semester'] = 'Fall 2019'
fall2019.head()

Unnamed: 0,Fraternity,Week,Bags,Semester
0,Chi Phi,2019-09-08,0.0,Fall 2019
1,Sigma Pi,2019-09-08,0.0,Fall 2019
2,SAE,2019-09-08,0.0,Fall 2019
3,Elmo,2019-09-08,8.5,Fall 2019
4,Delta Sig,2019-09-08,2.0,Fall 2019


### Quick Graphs for total bags across all houses

In [151]:
totals = pd.concat([spring20, fall2019], axis=0, sort=False)

NameError: name 'spring2020' is not defined

In [None]:
weekly = spring20.groupby(['Week']).sum()

In [None]:
weekly.reset_index(level=0, inplace=True)

In [None]:
weekly['Cumulative Bags'] = weekly['Bags'].cumsum()

In [None]:
sns.lineplot(weekly['Week'], weekly['Bags'], linewidth=2.0)

In [None]:
sns.lineplot(weekly['Week'], weekly['Cumulative Bags'], linewidth=2.0)