In [1]:
import numpy as np
import pandas as pd

We want to create a population where 10% of the people have a sensitivity to some specific food. First we will create a dataframe of  n users under a binomial distribution.

In [21]:
n = 100    # population size
p = 0.05    # probability person has a food sensitivity
user_id = np.arange(n)    # user id array

We will create a list of foods. If a person has a food sensitivity we will randomly select (from a uniform distribution) which food they are sensitive to.

In [22]:
x = np.random.binomial(1,p,size = n)  # this x array will trigger which person has a food sensitivity 
food_list = ['wheat','corn','dairy','beef','chicken','rice','apple','potatoes','broccoli','carrots'] 

In [23]:
target = []
for items in x:
    if items == 1:
        target.append(food_list[np.random.randint(len(food_list))])
    else:
        target.append('none')

### distribution of ratings based on sensitivity to specific food
Now that we have a list of foods and a food that the user is sensitive to we will distribute "wellness" ratings based on the food eaten.  For now all users have eaten all foods 


In [25]:
    poss_ratings = np.arange(1,6)  # create ratings 1-5
    none_likely = 3                # likelihood of rating if not sensitive
    sense_likely = 2               # likelihood of raiting if sensitive
    how_likely = 3                 # how much more likely
    total = len(poss_ratings) + how_likely-1
    
    # probability arrays for sensitive and not sensitive
    prob_none = [1/total if x != none_likely else how_likely/total for x in poss_ratings ]  
    prob_sens = [1/total if x != sense_likely else how_likely/total for x in poss_ratings]

In [26]:
def rate_it(food,food_list):
    """
    Creates an array of ratings based on a weighted probability of choosing a specific rating
    1. food = food that user is sensitive to
    2. food_list = list of foods available to eat
         - compare food to food list
         - if food is on the list chose from the sensitivity probability array for making a rating
         - else chose from the none probability array for making a rating
    """
    ratings = []
    for items in food_list:
        if items == food:
            ratings.append(np.random.choice(poss_ratings,p=prob_sens))
        else:
            ratings.append(np.random.choice(poss_ratings,p=prob_none))
    
    return ratings

Create a user ratings matrix   (this is super slow, need more efficient way of doing this)

In [27]:
user_ratings = []
for user in user_id:
    for food in target:
        ratings = rate_it(food,food_list)
    user_ratings.append(ratings)
    

And Finally, create a dataframe of user food ratings

In [28]:
user_data = pd.DataFrame(data=user_ratings, index= user_id, columns=food_list)
user_data['target'] = target

In [29]:
user_data.head()

Unnamed: 0,wheat,corn,dairy,beef,chicken,rice,apple,potatoes,broccoli,carrots,target
0,3,3,3,3,4,2,1,3,4,2,none
1,3,2,5,4,3,2,1,1,2,3,none
2,4,4,3,5,2,4,1,1,2,3,none
3,3,2,3,5,3,3,3,5,4,1,none
4,4,3,3,3,3,3,4,4,5,1,none


In [30]:
user_data.describe()

Unnamed: 0,wheat,corn,dairy,beef,chicken,rice,apple,potatoes,broccoli,carrots
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,3.14,2.79,2.84,2.88,3.09,2.88,2.99,2.91,2.95,2.85
std,1.247381,1.19168,1.228615,1.273506,1.181465,1.216719,1.27521,1.172884,1.166667,1.175293
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,2.0,2.0,2.0,2.0,2.75,2.0,2.0,2.0,2.0,2.0
50%,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
75%,4.0,3.0,4.0,3.25,4.0,4.0,4.0,3.0,4.0,3.0
max,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [31]:
# show the average rating for each target sensitivity
by_targ = user_data.groupby('target')
by_targ.mean()

Unnamed: 0_level_0,wheat,corn,dairy,beef,chicken,rice,apple,potatoes,broccoli,carrots
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
beef,3.0,5.0,5.0,5.0,3.0,4.0,3.0,5.0,1.0,5.0
broccoli,2.0,1.5,3.0,3.0,5.0,2.5,3.0,2.5,2.5,3.0
corn,1.0,2.0,4.0,2.0,3.0,4.0,4.0,3.0,5.0,2.0
dairy,2.0,5.0,5.0,2.0,5.0,3.0,3.0,3.0,4.0,4.0
none,3.2,2.778947,2.778947,2.873684,3.031579,2.863158,2.978947,2.894737,2.947368,2.821053


 Hmmm, somethings amiss...

In [32]:
pwd

'C:\\Users\\hasem\\Documents\\Python Scripts\\DataSci_class\\Python-Data-Science-and-Machine-Learning-Bootcamp\\Machine Learning Sections\\Recommender-Systems'