<a href="https://colab.research.google.com/github/kat-tian/Allting/blob/master/allting_insights.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from itertools import chain
from typing import Dict, List

import numpy as np
import pandas as pd
from google.cloud import firestore
import os

import functools
from itertools import chain
from typing import Dict, List, TypeVar
from datetime import datetime

from google.cloud import firestore
import matplotlib.pyplot as plt
%matplotlib inline 



os.environ["GCLOUD_PROJECT"] = "stage-allting"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./google_credentials.json"

In [0]:

def normalize(val, min, max):
    return (val - min) / (max - min)


def get_deviation(mean, compare_num, min, max):
    return normalize(compare_num, min, max) - normalize(mean, min, max)


def none_fill(arr):
    new_list = []
    if len([i for i in arr if i]) == 0:
        return []
    for i, v in enumerate(arr):
        if i == 0:
            new_list.append(recursive_none_fill(v, i, arr))
        else:
            new_list.append(recursive_none_fill(new_list[-1], i, arr))
    return new_list


def recursive_none_fill(prev, idx, arr):
    current = arr[idx]
    if len(arr) == idx + 1:
        if current is not None:
            return current
        else:
            if prev is None:
                return np.mean([i for i in arr if i])
            return prev

    if current is None:
        next_val = recursive_none_fill(current, idx + 1, arr)
        if prev is None:
            return next_val
        return (next_val + prev) / 2
    return current



def meeting_mean(meeting_idx) -> Dict[str, float]:
    """
    Parameters:
    arg1: List of
    {
    'Description': str,
    'FeedbackId': str,
    'Order': int,
    'UserId': str',
    'Value': str'
    }

    Returns:
    dict:keys are questionId and value avg values for each question in question_group

    """
    values_dict = {}
    for response in meeting_idx:
        feedback_id = response['FeedbackId']

        value = response["Value"]
        if feedback_id not in values_dict:
            values_dict[feedback_id] = [value]
        else:
            values_dict[feedback_id].append(value)

    for key, value in values_dict.items():
        if len(value) != 0:
            values_dict[key] = np.mean(value).item()

    return values_dict


def parse_raw_feedback(meetings) -> Dict[str, List[float]]:
    """
    :param meetings: raw meetings type from firestore
    :return: a parsed dict of radio and slider results merged
    """
    all_means: Dict[str, List[float]] = {}

    for idx, meeting in enumerate(meetings):
        mean_obj = meeting_mean(meetings[idx]["SliderResults"])
        radio_res = meetings[idx]["RadioResults"]
        for val in radio_res:
            if val["Value"] == 'no':
                val["Value"] = 0
            else:
                val["Value"] = 1

        mean_obj.update(meeting_mean(radio_res))

        for question_id, mean in mean_obj.items():
            if question_id not in all_means:
                all_means[question_id] = [mean]
            else:
                all_means[question_id].append(mean)

    return all_means


def parse_special_types(meetings) -> Dict[str, List]:
    """
      :param meetings: raw meetings type from firestore
      :return: a parsed dict of special value types with a list containing None and floats
      """
    all_means: Dict[str, List[float]] = {
        "Hour": [],
        "Weekday": [],
        "SpeakerDistribution": [],
        "NumParticipants": [],
        "Length": [],
    }

    # TODO: Deal with missing values?
    for idx, meeting in enumerate(meetings):

        all_means["Length"].append(meeting.get("Length"))
        all_means["NumParticipants"].append(meeting.get("NumParticipants"))

        timestamp = meeting.get("MeetingTimestamp")
        if timestamp:
            date_time = datetime.fromtimestamp(timestamp / 1000)  # Divide by 1000 since value is in milliseconds
            hour = date_time.hour
            weekday = date_time.isoweekday()
            all_means["Hour"].append(hour)
            all_means["Weekday"].append(weekday)
        else:
            all_means["Hour"].append(None)
            all_means["Weekday"].append(None)

        speaker_distribution = meeting.get("SpeakerDistribution")
        if speaker_distribution:
            even_dist = 1 / len(speaker_distribution)
            dist_num = functools.reduce(lambda store, nextVal: store + abs(nextVal - even_dist),
                                        [0, *chain(speaker_distribution)])

            # results in a score from 0 to 1 where 0 is perfect speaker distribution and 1 is one person talking only
            distribution_score = normalize(dist_num, 0, 1 - even_dist)
            all_means["SpeakerDistribution"].append(distribution_score)
        else:
            all_means["SpeakerDistribution"].append(None)

    return all_means


def fill_mean(values: List[float], length: int) -> List[float]:
    list_len = len(values)
    if list_len == 0:
        return []
    if length < list_len:
        raise Exception("length must be longer than list, list length: {}, length {}".format(list_len, length))

    mean = np.mean(values)
    return [*chain([mean for i in range(length - list_len)], values)]


def get_min_max_data(raw):
    """
       :param
       :return:
       """

    results = {}
    for idx, meeting in enumerate(raw):
        slider_list = raw[idx]["SliderResults"]
        radio_list = raw[idx]["RadioResults"]
        for s in slider_list:
            obj = {}
            obj["Min"] = s["Min"]
            obj["Max"] = s["Max"]
            results[s["FeedbackId"]] = obj

        for r in radio_list:
            obj = {}
            obj["Min"] = 0
            obj["Max"] = 1
            results[r["FeedbackId"]] = obj

    return results


def get_correlations(means: Dict[str, List[float]]) -> Dict[str, List[Dict[str, float]]]:
    corr_matrix = np.corrcoef([*means.values()])
    keys = [*means.keys()]
    output = {}
    for idx, key in enumerate(keys):
        output[key] = []
        for row_idx, value in enumerate(corr_matrix[idx]):
            if keys[row_idx] != key and not np.isnan(value):
                output[key].append({keys[row_idx]: value})

    return output



def main(meeting_id, team_id, org_id, meeting_timestamp):



    print(meeting_id, team_id, org_id, meeting_timestamp)

    db = firestore.Client()

    raw_feedback = db.collection(u'feedback-results') \
        .where(u"TeamId", u"==", team_id) \
        .where(u"OrgId", u"==", org_id) \
        .where(u"MeetingTimestamp", u"<=", meeting_timestamp).stream()

    feedback_list = []
    for feedback in raw_feedback:
        meet = feedback.to_dict()
        feedback_list.append(meet)

    if len(feedback_list) == 0:
        print("No meetings to compare with")
        return

    question_values_dict = parse_raw_feedback(feedback_list)
    special_types_dict = parse_special_types(feedback_list)

    for k, v in special_types_dict.items():
      print(k,v , none_fill(v))
      special_types_dict[k] = none_fill(v)
    
    
    len_dict = {}

    longest_count = 0
    for key, value in question_values_dict.items():
        length = len(value)
        len_dict[key] = length
        if length > longest_count:
            longest_count = length

    for question_id, values in question_values_dict.items():
        if len(values) < longest_count:
            question_values_dict[question_id] = fill_mean(values, longest_count)

    correlations = get_correlations(question_values_dict)

    results_list = []
    # Here we add all relevant items to the results_list that is to be returned
    for key, val in correlations.items():
        corr_list = []
        for obj in val:
            sub_key = list(obj.keys())[0]
            value = list(obj.values())[0]
            corr_list.append({"QuestionId": sub_key, "Value": value})
        results_list.append({"QuestionId": key, "Values": corr_list, "DataPoints": len_dict[key]})

    series_data = []
    min_max_data = get_min_max_data(feedback_list)

    for key, val in question_values_dict.items():
        prevMean = np.mean(val[:-1])
        min_max = min_max_data[key]
        series_data.append(
            {"QuestionId": key, "Values": val, "PreMean": prevMean, "Mean": np.mean(val), "CurrentVal": val[-1],
             "Min": min_max["Min"], "Max": min_max["Max"],
             "Deviation": get_deviation(prevMean, val[-1], min_max["Min"], min_max["Max"])})

    return [correlations, series_data, special_types_dict]


In [31]:
db = firestore.Client()
current_feedback_doc_id = u'5ZSG8ESc5TDsVVl78KrX'
current_meeting = db.collection(u'feedback-results').document(current_feedback_doc_id).get().to_dict()
latest_meeting_timestamp = current_meeting[u'MeetingTimestamp']
team_id = current_meeting[u'TeamId']
org_id = current_meeting[u'OrgId']
print(latest_meeting_timestamp, team_id, org_id)


1588057349425 lLot1shBDwLRD2CR46PX iQFX5Oe0nHv7OisCV3V9


In [32]:
[correlations, feedbackData, special_types_dict] = main(current_feedback_doc_id, team_id, org_id, latest_meeting_timestamp )

5ZSG8ESc5TDsVVl78KrX lLot1shBDwLRD2CR46PX iQFX5Oe0nHv7OisCV3V9 1588057349425
Hour [17, 15, 15, 10, 21, 7] [17, 15, 15, 10, 21, 7]
Weekday [3, 4, 3, 6, 5, 2] [3, 4, 3, 6, 5, 2]
SpeakerDistribution [0.6656034795215656, 0.9907852564102564, 0.7283502198432421, 0.2276007215874924, None, 0.25919356745890976] [0.6656034795215656, 0.9907852564102564, 0.7283502198432421, 0.2276007215874924, 0.24339714452320108, 0.25919356745890976]
NumParticipants [4, 4, 4, 4, 4, 4] [4, 4, 4, 4, 4, 4]
Length [3800, 4000, 8000, 2000, 3600, 2580] [3800, 4000, 8000, 2000, 3600, 2580]


  c /= stddev[:, None]
  c /= stddev[None, :]


## Continuous Predictions

In [0]:
  #NOTE: This particular regression example doesn't work; need to onehot encode 
  def new_predicted_y(x_var, y_var, x_pred, input_dict, print_relation=True): 
    """
    this function is the same as the previous predicted y function, 
    but made this more general. Can use with any input dict. 
    """
    x = input_dict.get(x_var)
    y = input_dict.get(y_var)

    model = np.polyfit(x,y,2)
    predict = np.poly1d(model)

    if print_relation: 
      print("When {} has a value of {}, {} has a value of: {}."
      .format(x_var, x_pred, y_var, round(predict(x_pred),2)))

    return predict(x_pred)

In [0]:
new_predicted_y("Weekday", "SpeakerDistribution", 4, special_types_dict)

When Weekday has a value of 4, SpeakerDistribution has a value of: 0.77.


0.7675625995640554

In [0]:
def scatter_plot(x_var, y_var, input_dict):
  """
  x_var: (str) x-var from dict key 
  y_var: (str) y-var from dict key
  input_dict: (dict) dict to extract data 

  returns: (plt) scatter plot of x,y 
  """
  plt.scatter(input_dict.get(x_var), input_dict.get(y_var))
  plt.xlabel(x_var)
  plt.ylabel(y_var)
  plt.title("Scatter: {} by {}".format(x_var, y_var))

  return plt.show()


In [0]:
def get_strong_correlations(correlations_dict, thresh=0.5):
  """
  correlations_dict: (dict) 
  thresh: (float)

  returns: (dict) correlations where magnitude 
  is less/greater than thresh 
  """
  temp = {}
  for idx,val in enumerate(correlations_dict):
    iter_range = len(correlations_dict[idx])
    question_id = val['QuestionId']
    
    if question_id not in temp.keys():
      temp[question_id] = [x.get("QuestionId") for x in val["Values"] if x.get("Value") > thresh or x.get("Value") < -thresh]
  
  strong_corr = {k:v for k, v in temp.items() if len(v) > 0}
  
  return strong_corr

In [0]:
def get_regressor_vals(feedback_dict):
  """
  feedback_dict: (dict)
  
  returns: (dict) contains only question_id and 
  values for regression 
  """
  regressor_vals = {}
  for idx, val in enumerate(feedbackData):
    question_id = feedbackData[idx].get("QuestionId")
    
    if question_id not in regressor_vals.keys():
      regressor_vals[question_id] = feedbackData[idx]["Values"] 
  
  return regressor_vals

In [0]:
def predicted_y (x_var, y_var, x_pred, print_relation = False):
  """
  x_var: (str) x-val question_id key
  y_var: (str) y-val question_id key 

  x_pred: (float/int) x-val to predict on 
  print_relation: (bool) True = print on, False = print off

  returns: (float/int) predicted y-val for given x-val
  """
  regressor_vals = get_regressor_vals(feedbackData)
  x = regressor_vals[x_var]
  y = regressor_vals[y_var]

  model = np.polyfit(x,y,1)
  predict = np.poly1d(model)

  if print_relation: 
    print("When {} has a value of {}, {} has a value of: {}."
    .format(x_var, x_pred, y_var, round(predict(x_pred),2)))

  return predict(x_pred)

In [0]:
def relative_difference(low_val, high_val, x_var, y_var, print_relation=False):
  #TODO: Define directions of relations [higher/lower, increase/decrease]
  """
  low_val: (float/int) low number for x-val
  high_val: (float/int) high number for x-val

  x_var: (str) x-val question_id key
  y_var: (str) y-val question_id key 

  returns: (float) percent difference of y_var when x_var has high or low values 
  """
  
  low = predicted_y(x_var, y_var, low_val)
  high = predicted_y(x_var, y_var, high_val)

  difference = round(((high-low)/low)*100, 2)

  if difference > 0:
    relation = "higher"
  else: 
    relation = "lower"


  if print_relation: 
    print("When {} has a value of {}, {} has a {}% {} score compared to when {} has a value of {}. "
    .format(x_var, high_val, y_var, difference, relation, x_var, low_val))

  return difference

In [0]:
def store_differences(low_val, high_val, strong_corr_dict):
  """
  low_val: (float/int) low number for x-val
  high_val: (float/int) high number for x-val

  strong_corr_dict: (dict) cleaned correlations with only ids 

  returns: (dict) keys = ids, value = relative difference in values 
  """
  relations = {}

  for k, v in strong_corr_dict.items():
    x_var = k
    if len(v) > 1:
      for i in range(len(v)):
        y_var = v[i]
        update_key = "{}:{}".format(x_var, y_var)
        if update_key not in relations.keys():
          relations[update_key] = relative_difference(low_val, high_val, x_var, y_var)
    else: 
      y_var = v[0]
      update_key = "{}:{}".format(x_var, y_var)
      if update_key not in relations.keys():
        relations[update_key] = relative_difference(low_val,high_val, x_var, y_var)

  return relations
    

In [0]:
store_differences(1, 5, get_strong_correlations(correlations))

KeyError: ignored

## Categorical Predictions 

#### Group by categorical iv, get mean of dv

In [0]:
#use me to combine special dict and regressor dict 
def combine_dict(dict1, dict2):
  """
  dict1: (dict) first dictonary to combine
  dict2: (dict) second dictonary to combine

  returns: merged dict of dict1, dict2
  """
  return {**dict1, **dict2}

In [0]:
#send combined dict here, get mean values for dv grouped by categorical iv 
def categorical_means(dict, iv, dv, print_relation=False, return_highest=False):
  """
  dict: (dict) contains categorical predictors 
  iv: (str) label for categorical predictor
  dv: (str) label for value to predict on 

  return_highest=True: 
  returns: (dict) {key: categorical, value: mean of dv}
  for only the highest mean value for dv
  
  return_highest=False: 
  returns: (dict) {key: categorical, value: mean of dv}
  for all values 
  """
  df = pd.DataFrame.from_dict(dict)
  grouped = df.groupby(iv).mean()
  dv_ol = grouped[dv].sort_values(ascending=False).to_dict()

  if print_relation:
    for k, v in dv_ol.items(): 
      print("When {} is {}, the average value for {} is {}".format(iv, k, dv, v))

  if return_highest: 
    return {list(dv_ol.keys())[0], list(dv_ol.values())[0]}
  else: 
    return dv_ol

In [0]:
#call the functions
merged = combine_dict(special_types_dict, get_regressor_vals(feedbackData))
categorical_means(merged, 'Weekday', '1a')

{2: 6.25,
 3: 4.416666666666666,
 4: 4.666666666666667,
 5: 5.0,
 6: 4.666666666666667}

### Linear Regression with categoricals 

In [0]:
#imports
from sklearn.linear_model import LinearRegression

In [0]:
def create_dummy(dict, var, return_df=False):
  """
  dict: (dict) dict to get dummies
  var: (str) label for the categorical var

  return_df = True: 
  returns: (df) categorical dummy coded
  return_df=False:
  returns: (dict) categorical dummy coded
  """

  df = pd.DataFrame.from_dict(special_types_dict)
  ohe = pd.get_dummies(df[var])

  df.drop(columns=[var], inplace=True)
  final_df = df.join(ohe)

  ohe_dict = final_df.to_dict()

  #removes nested dicts 
  new_dict = {}
  for k, v in ohe_dict.items(): 
    x = list(v.values())
    if k not in new_dict:
      new_dict[k] = x

  if return_df: 
    return pd.DataFrame.from_dict(new_dict)
  else: 
    return new_dict

In [0]:
#create dict for categorical regression 
cat_regression = pd.DataFrame.from_dict({**get_regressor_vals(feedbackData), 
                       **create_dummy(special_types_dict, 'Weekday')})

#update column names to str 
cat_regression.columns = cat_regression.columns.astype(str)
cat_regression

In [0]:
def get_predictions(ivs, dv, predictors,df):
  """
  iv:(list) feature list 
  dv:(str) outcome to predict
  predictors: (list) specific values of 
  iv to predict on 

  returns:(float) prediction
  """
  X = df[ivs]
  y = df[dv]
  reg = LinearRegression().fit(X, y)
  
  return reg.predict([predictors])

In [85]:
get_predictions(['2', '3', '4', '5', '6'], '3a', [0, 0, 0, 0, 1] , cat_regression)

array([4.33333333])

### Scrapwork 

In [0]:
df = pd.DataFrame.from_dict(special_types_dict) #convert dict to df
gropued_weekday = df.groupby('Weekday').mean() # create df where grouped by mean


one_hot = pd.get_dummies(df['Weekday']) #create one_hot encoding
one_hot

new_df = df.drop(columns= ['Weekday']).join(one_hot) #join one_hot encoding
one_hot_dict = new_df.to_dict() #create dict with one-hot encoded vals

one_hot_dict

new_dict = {}
for k, v in one_hot_dict.items(): 
    x = list(v.values())
    if k not in new_dict:
      new_dict[k] = x

#very simple approach to the weekdays problem, groupby day of the week, 
#calculate mean of dv, compare to others and find optimal...
#too much chance? 
#remember to drop one dummy variable column to avoid multicolinarity (since data can be derived)


regressor_df = pd.DataFrame.from_dict(get_regressor_vals(feedbackData))
one_hot_df = pd.DataFrame.from_dict(one_hot_dict)

pd.concat([regressor_df, one_hot_df], ignore_index=True, axis=0)
# one_hot_df = pd.DataFrame.from_dict(combined)
# from sklearn.linear_model import LinearRegression
# model = LinearRegression()
# one_hot_df.head()

#TODO 
#I need to get these things from the one hot encoding into non-nested dicts
#so I can combine it with the the get_regressor_vals dict
#this can be used to do regression on categorical data

categorical = ["Hour", "Weekday"]
#TODO: calculate relations between categorical and continious vars
#TODO: calculate relations between categorical and categorical vars

#Idea, group by continous DV mean for each level of categorical (e.g., weekday)

#group by all the days of the week 
#take the mean for the corresponding dependent variable 