## Import Statements

In [68]:
import matplotlib.pyplot as plt 
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.graph_objects as go
import datetime

In [69]:
#Importing the module
import gspread
from df2gspread import df2gspread as d2g
from oauth2client.service_account import ServiceAccountCredentials

## Link to Google Sheets

Connect API

In [70]:
#The scope is always look like this so we did not need to change anything
scope = [
   'https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']

#Name of our Service Account Key
google_key_file = 'jupyter-spring2020ctc-connect-3bde2ba97cd1.json'

credentials = ServiceAccountCredentials.from_json_keyfile_name(google_key_file, scope)
gc = gspread.authorize(credentials)

Open and save the "Spring 2020 CTC Recording" Google Sheets document, specifically the "Cans-Recording" sheet

In [71]:
#Opening the worksheet by using Worksheet ID
spreadsheet_key = '1OiGBYsa9eK26lAaznmiiczgSX17GHlOWoS0crNza-vo'
workbook = gc.open_by_key(spreadsheet_key)

#Selecting which sheet to pulling the data
sheet = workbook.worksheet('Cans-Recording')

#Pulling the data and transform it to the data frame
values = sheet.get_all_values()
df = pd.DataFrame(values[1:], columns = values[0])

## Data Cleaning

Open the dataframe

In [72]:
df.head()

Unnamed: 0,Unnamed: 1,BLUE BINNZZZ STUFF,Unnamed: 3,CURRENT STANDINGS BELOW,Unnamed: 5,Unnamed: 6,Unnamed: 7,TOTAL TOTAL->,360,68,...,89,58.5,64.5,0,0.1,0.2,0.3,0.4,0.5,0.6
0,Contacted?,Who is reaching out?,Status,Fraternity,Contact Name,Contact Email,In GroupMe?,Contact Number,Current Total,Week 1 (2/3),...,Week 3 (2/17),Week 4 (2/14),Week 5 (3/2),SPR. BREAK(3/9),SPR. BREAK (3/16),Week 6 (3/23),Week 7 (3/30),Week 8 (4/6),Week 9 (4/13),Week 10 (4/20)
1,,,,Chi Phi,Garrett Hanrahan,gph4fs@virginia.edu,y,1-410-924-9497,0,,...,,,,,,,,,,
2,,,,Sigma Pi,Ben Rurka,jbr4pc@virginia.edu,y,301-922-6454,0,,...,,,,,,,,,,
3,texted reese,Sterchi,wait on reeese to follow through,SAE,Reese Bowling,rcb2bb@virginia.edu,y,804-335-7891,4,,...,,,4,,,,,,,
4,,,,Elmo,Charlie Jones,caj3pb@virginia.edu,y,703-623-7011,0,,...,,,,,,,,,,


Redo the header with the second row of the dataframe

In [73]:
new_header = df.iloc[0] # grab the first row for the header
df = df[1:] # take the data less the header row
df.columns = new_header # set the header row as the df header
df.head()

Unnamed: 0,Contacted?,Who is reaching out?,Status,Fraternity,Contact Name,Contact Email,In GroupMe?,Contact Number,Current Total,Week 1 (2/3),...,Week 3 (2/17),Week 4 (2/14),Week 5 (3/2),SPR. BREAK(3/9),SPR. BREAK (3/16),Week 6 (3/23),Week 7 (3/30),Week 8 (4/6),Week 9 (4/13),Week 10 (4/20)
1,,,,Chi Phi,Garrett Hanrahan,gph4fs@virginia.edu,y,1-410-924-9497,0,,...,,,,,,,,,,
2,,,,Sigma Pi,Ben Rurka,jbr4pc@virginia.edu,y,301-922-6454,0,,...,,,,,,,,,,
3,texted reese,Sterchi,wait on reeese to follow through,SAE,Reese Bowling,rcb2bb@virginia.edu,y,804-335-7891,4,,...,,,4.0,,,,,,,
4,,,,Elmo,Charlie Jones,caj3pb@virginia.edu,y,703-623-7011,0,,...,,,,,,,,,,
5,,,,,Drew Quigley,ajq2vb@virginia.edu,y,704-780-0647,22,,...,2.0,3.5,13.5,,,,,,,


Drop all columns except "Fraternity" and weekly data

In [74]:
df.columns
df =  df.drop(columns=['Contacted?', 'Who is reaching out?', 'Status','Contact Name', 'Contact Email', 'In GroupMe?', 'Contact Number','Current Total']) # drop irrelevant columns
df.head()

Unnamed: 0,Fraternity,Week 1 (2/3),Week 2 (2/10),Week 3 (2/17),Week 4 (2/14),Week 5 (3/2),SPR. BREAK(3/9),SPR. BREAK (3/16),Week 6 (3/23),Week 7 (3/30),Week 8 (4/6),Week 9 (4/13),Week 10 (4/20)
1,Chi Phi,,,,,,,,,,,,
2,Sigma Pi,,,,,,,,,,,,
3,SAE,,,,,4.0,,,,,,,
4,Elmo,,,,,,,,,,,,
5,,,3.0,2.0,3.5,13.5,,,,,,,


Delete scrap notes at the bottom of the sheet

In [76]:
df = df.iloc[0:44,] # keep only relevant rows

Fill NA's in "Fraternity" column using the forward fill method as appropriate

In [78]:
df['Fraternity'].fillna(method='ffill', inplace=True) # forward fill NA values for Fraternity name 
df.head()

Unnamed: 0,Fraternity,Week 1 (2/3),Week 2 (2/10),Week 3 (2/17),Week 4 (2/14),Week 5 (3/2),SPR. BREAK(3/9),SPR. BREAK (3/16),Week 6 (3/23),Week 7 (3/30),Week 8 (4/6),Week 9 (4/13),Week 10 (4/20)
1,Chi Phi,,,,,,,,,,,,
2,Sigma Pi,,,,,,,,,,,,
3,SAE,,,,,4.0,,,,,,,
4,Elmo,,,,,,,,,,,,
5,,,3.0,2.0,3.5,13.5,,,,,,,


Delete rows with empty weekly bags data

In [None]:
new_cols_list = ['Fraternity']
for week_column in df.columns[1:]:
  new_col = week_column[week_column.find('/')-1:-1] + '/20'
  new_cols_list.append(new_col)
new_cols_list
df.columns = new_cols_list
df.head()

In [None]:
df = df[~df['Fraternity'].duplicated(keep=False) | df[df.columns[1:]].notnull().any(axis=1)]

In [None]:
df = pd.melt(df, id_vars=['Fraternity'], value_vars=df.columns[1:])
df = df.rename(columns={'variable': 'Week', 'value': 'Bags'})

In [None]:
df['Week']= pd.to_datetime(df['Week'].astype(str), format='%m/%d/%y')

In [None]:
df['Bags'] = pd.to_numeric(df['Bags'])

In [None]:
df = df.fillna(0)

Rename column names with "Fraternity" as first column name and a standard date format for each week (i.e. "2/3/20")

In [None]:
df.dtypes

In [None]:
df

In [None]:
weekly = df.groupby(['Week']).sum()

In [None]:
weekly.reset_index(level=0, inplace=True)

In [None]:
weekly['Cumulative Bags'] = weekly['Bags'].cumsum()

In [None]:
df

In [None]:
sns.lineplot(weekly['Week'], weekly['Bags'], linewidth=2.0)