In [118]:
import pandas as pd

import numpy as np

import glob, os

# Relative path of the directory that holds the 2016 survey CSV files.
data_root = "../datasets/surveys/2016/"
# Glob pattern appended to data_root to select which files get loaded.
normal_data = "*.csv"
# NOTE(review): disabled reference to a single file; presumably it needs
# separate handling -- confirm, or delete this line.
# outlier_data = "week8.csv"

In [127]:
# Resolve the glob pattern to concrete file paths. glob returns matches in
# arbitrary (filesystem-dependent) order, so sort them to make the order of
# the loaded frames reproducible from run to run.
FILENAMES = sorted(glob.glob(data_root + normal_data))

# Grab unique columns from all 2016 data
We collect every column name that appears in any file so that we can rename them consistently.

In [120]:
# Collect every column label that appears in any frame of `dfs`.
# NOTE: `dfs` is assigned in a later cell of this notebook, so that cell has
# to be executed before this one.
unique_cols = set()

for df in dfs:
    unique_cols |= set(df.columns)

# Also include the two column names that get added to all 2016 DataFrames
unique_cols |= {"Week", "Location"}
unique_cols

{'Location',
 'Week',
 'location',
 'pacing',
 'satisfaction',
 'timestamp',
 'track',
 'week'}

In [121]:
# Maps each raw survey column header to the normalized column name we want.
# Several differently worded headers intentionally map to the same name.
# (Each key must appear only once: a repeated key in a dict literal is
# silently overwritten by the last occurrence.)
new_names = {
    'How well are the tutorials paced?': 'pacing',
    'How likely is it that you would recommend the Make School Summer Academy to a friend?': 'satisfaction',
    'How well is the schedule paced?': 'pacing',
    'How would you rate your overall satisfaction with the Summer Academy this week?': 'satisfaction',
    'What track are you in?': 'track',
    'Timestamp': 'timestamp',
    'Unnamed: 0': 'timestamp',
    'Week': 'week',
    'Location': 'location',
}

# Helper functions to pull week number and location from filenames

In [123]:
import re

# TODO: DRYify
def extract_week(filename):
    """Return the week number embedded in a single survey filename.

    Looks for ``Week`` followed by one or more underscores and then one or
    more digits, so multi-digit weeks (e.g. 10) are returned in full.

    Example:
        '../datasets/surveys/2016/Anon_Week_6_Feedback_-_SG.csv' -> '6'

    Arguments:
        filename: path (or bare name) of one survey file, as a string.
    Returns:
        The week number as a string, or np.nan when the filename contains
        no ``Week_<digits>`` marker.
    """
    match = re.search(r'Week_+(\d+)', filename)
    if match is None:
        # np.nan (lowercase) is the spelling that exists in every NumPy
        # version; the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    return match.group(1)

def extract_location(filename):
    """Return the location code embedded in a single survey filename.

    The location is the run of ASCII letters that sits immediately before
    the ``.csv`` extension.

    Example:
        '../datasets/surveys/2016/Anon_Week_6_Feedback_-_SG.csv' -> 'SG'

    Arguments:
        filename: path (or bare name) of one survey file, as a string.
    Returns:
        The location as a string, or np.nan when no letters directly
        precede ``.csv`` in the filename.
    """
    match = re.search(r'([A-Za-z]+)\.csv', filename)
    if match is None:
        # np.nan (lowercase) is the spelling that exists in every NumPy
        # version; the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    return match.group(1)
        

# Normalizing columns
- drop the ones we don't want
- add the ones we do and fill the empty spaces with NaNs

# Some Helpful Docs
- [Renaming columns](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rename.html)
- [Drop columns](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html)
- [Add columns](https://erikrood.com/Python_References/add_new_col_df_default_value_final.html)
- [np.nan](https://www.numpy.org/devdocs/reference/constants.html#numpy.NAN)

In [124]:
from functools import reduce
import numpy as np

def df_normalizer(df, cols_list, new_names=None):
    """Drop columns we don't need and add the missing ones filled with NaN.

    Renaming is NOT done here; the caller is expected to have renamed the
    columns already.

    Arguments:
        1. df: the dataframe to normalize (modified in place and returned)
        2. cols_list: list of columns the df should have by the end
        3. new_names: dict of possible column names -> normalized names;
           only its values are used. When omitted, the notebook-level
           ``new_names`` dict is used, as in earlier versions.
    Steps:
        1. removes unnecessary columns: every column that is neither a
           value of new_names nor listed in cols_list
        2. adds necessary columns filled with NaN: every column in
           cols_list that the df does not have yet
    Returns:
        The same dataframe object that was passed in.
    """
    if new_names is None:
        # Backward compatibility: existing callers pass only two arguments
        # and rely on the module-level mapping.
        try:
            new_names = globals()["new_names"]
        except KeyError:
            raise NameError("name 'new_names' is not defined") from None

    # Columns in cols_list are kept as well, so requested data is never
    # dropped only to be re-added as NaN below.
    wanted = set(new_names.values()) | set(cols_list)
    unwanted = [col for col in df.columns if col not in wanted]
    # DataFrame.drop returns a new frame unless inplace=True; discarding that
    # return value would leave df unchanged.
    df.drop(columns=unwanted, inplace=True)

    # add nonexistent columns
    for col in cols_list:
        if col not in df.columns:
            df[col] = np.nan
    return df

# Special treatment for 2017

In [131]:
# Display-only preview of the 2017 survey file; the result is not assigned,
# so nothing below can depend on it.
pd.read_csv('../datasets/surveys/2017/2017.csv')

Unnamed: 0,ID,Location,Track,Week,Rating (Num),Schedule Pacing
0,134,San Francisco,"Apps, Explorer",Week 1,3,Just right
1,36,Los Angeles,Apps,Week 1,4,A little too fast
2,117,San Francisco,Games,Week 1,4,Way too slow
3,253,,,Week 2,4,A little too fast
4,350,New York City,"Apps, Explorer",Week 1,4,Just right
5,23,Redwood City,Apps,Week 1,5,Just right
6,28,Los Angeles,Apps,Week 7,5,Just right
7,65,San Francisco,Apps,Week 1,5,A little too slow
8,101,Santa Clara,Apps,Week 1,5,A little too slow
9,124,Santa Clara,Apps,Week 1,5,Just right


# And for my final trick
## the dirty work
- iterate through all the FILENAMES we saved earlier
- get the week and location from each filename
- load the file in as a dataframe
- rename the columns using our new_names dictionary
- normalize the columns: drop the ones that shouldn't exist and add the ones that should

In [128]:
# Loads in each file as a dataframe, adds week and location, then normalizes
# it; finally stacks all frames into `cleaned`.

# Raw survey header -> normalized column name. Defined once, before the
# loop, because the mapping is identical for every file.
new_names = {
    'How well are the tutorials paced?': 'pacing',
    'How well is the schedule paced?':'pacing',
    'How would you rate your overall satisfaction with the Summer Academy this week?': 'satisfaction',
    'What track are you in?':'track',
    'Timestamp':'timestamp',
    'Unnamed: 0': 'timestamp',
    'Week':'week',
    'Location': 'location'}

# Columns every frame must have once it has been normalized
cols = ['week', 'location', 'pacing', 'track', 'timestamp']

dfs = list()
for file in FILENAMES:
    # create a dataframe from the csv
    df = pd.read_csv(file)

    # add week and location columns (both parsed from the filename)
    week = extract_week(file)
    location = extract_location(file)
    df['location'] = location
    df['week'] = week

    # rename cols
    df = df.rename(columns=new_names)

    # normalize it: drop unwanted columns, add the missing ones as NaN
    df = df_normalizer(df, cols)
    dfs.append(df)

# Stack the per-file frames; sort=False keeps the column order as encountered
cleaned = pd.concat(dfs, sort=False)