# Understanding Descriptive Statistics

Import the necessary libraries here:

In [1]:
# Libraries
import pandas as pd
import numpy as np
from scipy import stats

pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
%matplotlib inline

## Challenge 1
#### 1.- Define a function that simulates rolling a dice 10 times. Save the information in a dataframe.
**Hint**: you can use the *choices* function from module *random* to help you with the simulation.

In [2]:
# your code here
import random

def roll_dice_10_times():
    dice = [1, 2, 3, 4, 5, 6]
    outcomes = random.choices(dice, k=10)
    return outcomes

dice_lst = roll_dice_10_times()
columns_dice = ['Outcomes']
dice_df = pd.DataFrame(dice_lst, columns=columns_dice)


In [3]:
dice_df.sort_values(by='Outcomes')

Unnamed: 0,Outcomes
3,1
4,1
9,2
1,3
2,3
6,3
8,3
0,4
7,4
5,5


#### 2.- Plot the results sorted by value.

In [4]:
fig = px.histogram(dice_df, nbins=6, labels={'x': 'Outcomes'}, category_orders={'x': [1, 2, 3, 4, 5, 6]})

fig.update_traces(marker=dict(line=dict(color='black', width=1)))

bin_labels = [f'{i+1}' for i in range(6)]
fig.update_xaxes(
    tickmode='array',
    tickvals=list(range(1, 7)),
    ticktext=bin_labels,
    title_text='Outcomes')

fig.update_yaxes(title_text='Rolls frequency', range=(0, 10), 
                tickvals=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10))



#### 3.- Calculate the frequency distribution and plot it. What is the relation between this plot and the plot above? Describe it with words.

In [5]:
# your code here
freq_dice_df = pd.DataFrame(dice_df.value_counts())
freq_dice_df.rename(columns={0:'frequency'}, inplace=True)
freq_dice_df


Unnamed: 0_level_0,frequency
Outcomes,Unnamed: 1_level_1
3,4
1,2
4,2
2,1
5,1


In [6]:
fig_2 = px.histogram(freq_dice_df, labels={'x': 'Frequency'}, category_orders={'x': [1, 2, 3, 4, 5, 6]})

fig_2.update_traces(marker=dict(line=dict(color='black', width=1)))

bin_labels_2 = [f'{i+1} times' for i in range(6)]
fig_2.update_xaxes(
    tickmode='array',
    tickvals=list(range(1, 7)),
    ticktext=bin_labels_2,
    title_text='Frequency')

fig_2.update_yaxes(title_text='rolls', range=(0, 10), 
                tickvals=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10))

In [7]:
fig = px.histogram(freq_dice_df, nbins=6, labels={'x': 'Outcomes'}, category_orders={'x': [1, 2, 3, 4, 5, 6]})

fig.update_traces(marker=dict(line=dict(color='black', width=1)))

bin_labels = [f'{i+1}' for i in range(6)]
fig.update_xaxes(
    tickmode='array',
    tickvals=list(range(1, 7)),
    ticktext=bin_labels,
    title_text='Outcomes')

fig.update_yaxes(title_text='Rolls frequency', range=(0, 10), 
                tickvals=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10))

In [8]:
"""
In the first plot, the frequency is on the y-axis: this number correspond to the number on the x axis of the second plot. 
So, for example, if the first plot has two bars of the same height - aka it shows that 2 outcomes with frequency 1 -, 
the second plot will show a bar for '1 times' with height equal to 2.

"""

"\nIn the first plot, the frequency is on the y-axis: this number correspond to the number on the x axis of the second plot. \nSo, for example, if the first plot has two bars of the same height - aka it shows that 2 outcomes with frequency 1 -, \nthe second plot will show a bar for '1 times' with height equal to 2.\n\n"

## Challenge 2
Now, using the dice results obtained in *challenge 1*, your are going to define some functions that will help you calculate the mean of your data in two different ways, the median and the four quartiles. 

#### 1.- Define a function that computes the mean by summing all the observations and dividing by the total number of observations. You are not allowed to use any methods or functions that directly calculate the mean value. 

In [9]:
# your code here
def get_mean(col):
    mean_outcome = sum(col.values) / len(col.values)
    return mean_outcome

In [10]:
get_mean(dice_df['Outcomes'])

2.9

In [11]:
dice_df['Outcomes']

0    4
1    3
2    3
3    1
4    1
5    5
6    3
7    4
8    3
9    2
Name: Outcomes, dtype: int64

#### 2.- First, calculate the frequency distribution. Then, calculate the mean using the values of the frequency distribution you've just computed. You are not allowed to use any methods or functions that directly calculate the mean value. 

In [12]:
# your code here
def get_frequency(col):
    for value in col:
        freq_dict = dict(col.value_counts())
        return freq_dict

In [13]:
freq_outcomes = get_frequency(dice_df['Outcomes'])
freq_df = pd.DataFrame.from_dict(freq_outcomes, orient='index', columns=['Frequency'])
freq_df = freq_df.reset_index().rename(columns={'index':'Outcomes'})
freq_df

Unnamed: 0,Outcomes,Frequency
0,3,4
1,4,2
2,1,2
3,5,1
4,2,1


In [14]:
get_mean(freq_df['Frequency'])

2.0

#### 3.- Define a function to calculate the median. You are not allowed to use any methods or functions that directly calculate the median value. 
**Hint**: you might need to define two computation cases depending on the number of observations used to calculate the median.

In [15]:
# your code here
def get_median(df, name_col):
    sorted_outcomes = sorted(df[name_col])
    count_outcomes = len(sorted_outcomes)

    if count_outcomes % 2 == 0:
        median_col = (sorted_outcomes[count_outcomes//2] + sorted_outcomes[(count_outcomes//2)-1])//2
    else:
        median_col = sorted_outcomes[count_outcomes//2]
        
    return median_col


#in case the col has odd len the median is the middle one
#in case of even len the the mean of the two middle values

In [16]:
get_median(dice_df, 'Outcomes')

3

#### 4.- Define a function to calculate the four quartiles. You can use the function you defined above to compute the median but you are not allowed to use any methods or functions that directly calculate the quartiles. 

In [17]:
def get_quartiles(df, name_col):
    import math
    sorted_outcomes = sorted(df[name_col])
    num_outcomes =len(sorted_outcomes)
    
    index1 = math.ceil(num_outcomes * 1/4)
    index2 = math.ceil(get_median(df, name_col))
    index3 = math.ceil(num_outcomes * 3/4)

    q1 = sorted_outcomes[index1]
    q2 = sorted_outcomes[index2]
    q3 = sorted_outcomes[index3]

    return q1, q2, q3



In [18]:
get_quartiles(dice_df, 'Outcomes')

(3, 3, 4)

## Challenge 3
Read the csv `roll_the_dice_hundred.csv` from the `data` folder.
#### 1.- Sort the values and plot them. What do you see?

In [19]:
# your code here
roll_dice_100 = pd.read_csv(r'C:\Users\zluca\OneDrive\LAURA\Ironhack - data analyst\GitHub\week 4\Descriptive-Stats\data\roll_the_dice_hundred.csv')
roll_dice_100.drop(columns='Unnamed: 0', inplace=True)
sorted_values_100 = roll_dice_100.sort_values(by='value')
sorted_values_100


Unnamed: 0,roll,value
0,0,1
47,47,1
56,56,1
9,9,1
73,73,1
...,...,...
17,17,6
11,11,6
24,24,6
21,21,6


In [20]:

fig=px.histogram(sorted_values_100, x='value',nbins=15, title='Roll the dice 100 times')
fig.update_xaxes(range=[0,7])
fig.update_yaxes (range=[0, 101])
custom_tickvals = list(range(1, 7, 1))
fig.update_xaxes(tickvals=custom_tickvals)

In [21]:
"""
It seems that the outcomes appear with a very similar frequencey (aka have a close frequency distribution).

"""

'\nIt seems that the outcomes appear with a very similar frequencey (aka have a close frequency distribution).\n\n'

#### 2.- Using the functions you defined in *challenge 2*, calculate the mean value of the hundred dice rolls.

In [22]:
# your code here
mean_dice_100 = get_mean(roll_dice_100['value'])
median_dice_100 = get_median(roll_dice_100, 'value')
mean_dice_100, median_dice_100

(3.74, 4)

#### 3.- Now, calculate the frequency distribution.


In [23]:
freq_100 = get_frequency(roll_dice_100['value'])
freq_100 = pd.DataFrame.from_dict(freq_100, orient='index', columns=['Frequency']).reset_index()
freq_100 = freq_100.rename(columns={'index':'Outcomes'})
freq_100



Unnamed: 0,Outcomes,Frequency
0,6,23
1,4,22
2,2,17
3,3,14
4,1,12
5,5,12


In [24]:
mean_freq_100 = get_mean(freq_100['Frequency'])
median_freq_100 = get_median(freq_100, 'Frequency')
mean_freq_100, median_freq_100

(16.666666666666668, 15)

#### 4.- Plot the histogram. What do you see (shape, values...) ? How can you connect the mean value to the histogram? 

In [25]:
fig_3 = px.bar(freq_100, x='Outcomes', y='Frequency', labels={'Outcomes': 'Dice Outcomes', 'Frequency': 'Frequency'})

colors = ['red', 'blue', 'green', 'purple', 'orange', 'pink'] 
fig_3.update_traces(marker_color=colors)

fig_3.update_xaxes(title_text='Dice Outcome')
fig_3.update_yaxes(title_text='Frequency')

fig_3.add_shape(
    type='line',
    x0=mean_dice_100,
    x1=mean_dice_100,
    y0=0,
    y1=max(freq_100['Frequency']),
    line=dict(color='black', width=2))

fig_3.add_shape(
    type='line',
    y0=mean_freq_100,
    y1=mean_freq_100,
    x0=0,
    x1=max(freq_100['Outcomes']),
    line=dict(color='black', width=2))

# Show the plot
fig_3.show()

In [26]:
'''
6 and 4 are the outcomes with the highest frequency, but there is no big difference though all frequency values (considering the range of 100 times). 
In fact, we know that the mean of the outcomes is 3,74, while the median is 4. 
Values are close and are both indication that outcomes bigger than 3 appears slightly often than outcomes smaller than 3 - 
and this is also confirmed by the mean and median of the frequency: the mean is skewed in the direction of the highest frequencies.
To locate the mean of values and frequencies I added two line. The vertical one is the mean of the outcomes in the first dataset.
It is closer to the most frequent values and to the median too. The horizontal line is the mean of the frequency,
which is also close to the median - we can see its central position between the borders of the shortest and highest bars.'
'''

"\n6 and 4 are the outcomes with the highest frequency, but there is no big difference though all frequency values (considering the range of 100 times). \nIn fact, we know that the mean of the outcomes is 3,74, while the median is 4. \nValues are close and are both indication that outcomes bigger than 3 appears slightly often than outcomes smaller than 3 - \nand this is also confirmed by the mean and median of the frequency: the mean is skewed in the direction of the highest frequencies.\nTo locate the mean of values and frequencies I added two line. The vertical one is the mean of the outcomes in the first dataset.\nIt is closer to the most frequent values and to the median too. The horizontal line is the mean of the frequency,\nwhich is also close to the median - we can see its central position between the borders of the shortest and highest bars.'\n"

#### 5.- Read the `roll_the_dice_thousand.csv` from the `data` folder. Plot the frequency distribution as you did before. Has anything changed? Why do you think it changed?

In [27]:
roll_dice_1000 = pd.read_csv(r'C:\Users\zluca\OneDrive\LAURA\Ironhack - data analyst\GitHub\week 4\Descriptive-Stats\data\roll_the_dice_thousand.csv')

In [28]:
# your code here
roll_dice_1000.drop(columns='Unnamed: 0', inplace=True)
sorted_values_1000 = roll_dice_1000.sort_values(by='value')
sorted_values_1000

Unnamed: 0,roll,value
564,564,1
922,922,1
560,560,1
213,213,1
214,214,1
...,...,...
855,855,6
360,360,6
857,857,6
388,388,6


In [29]:
fig_4=px.histogram(sorted_values_1000, x='value',nbins=15, title='Roll the dice 1000 times')
fig_4.update_xaxes(range=[0,7])
fig_4.update_yaxes (range=[0, 1001])
custom_tickvals = list(range(1, 7, 1))
fig_4.update_xaxes(tickvals=custom_tickvals)

In [30]:
freq_1000 = get_frequency(roll_dice_1000['value'])
freq_1000 = pd.DataFrame.from_dict(freq_1000, orient='index', columns=['Frequency']).reset_index()
freq_1000 = freq_1000.rename(columns={'index':'Outcomes'})
freq_1000

Unnamed: 0,Outcomes,Frequency
0,1,175
1,3,175
2,4,168
3,2,167
4,6,166
5,5,149


In [31]:
mean_freq_1000 = get_mean(freq_1000['Frequency'])
median_freq_1000 = get_median(freq_1000, 'Frequency')
mean_freq_1000, median_freq_1000

(166.66666666666666, 167)

In [32]:
mean_dice_1000 = get_mean(roll_dice_1000['value'])
median_dice_1000 =get_median(roll_dice_1000, 'value')
mean_dice_1000, mean_dice_1000

(3.447, 3.447)

In [33]:
fig_5 = px.bar(freq_1000, x='Outcomes', y='Frequency', labels={'Outcomes': 'Dice Outcomes', 'Frequency': 'Frequency'})

colors = ['red', 'blue', 'green', 'purple', 'orange', 'pink'] 
fig_5.update_traces(marker_color=colors)

fig_5.update_xaxes(title_text='Dice Outcome')
fig_5.update_yaxes(title_text='Frequency')

fig_5.add_shape(
    type='line',
    x0=mean_dice_1000,
    x1=mean_dice_1000,
    y0=0,
    y1=max(freq_1000['Frequency']),
    line=dict(color='black', width=2))

fig_5.add_shape(
    type='line',
    y0=mean_freq_1000,
    y1=mean_freq_1000,
    x0=0,
    x1=max(freq_1000['Outcomes']),
    line=dict(color='black', width=2))

fig_5.show()

In [34]:
"""
The frequency distribution seems to become even more evenly distributed by increasing the 'rolling times'. 
The mean values seem also closer to the median values, in fact the mean of the outcomes and the median have the same value. 
Increasing the number of rolling times flatten the probability of each of the six outcomes to the same level.
"""

"\nThe frequency distribution seems to become even more evenly distributed by increasing the 'rolling times'. \nThe mean values seem also closer to the median values, in fact the mean of the outcomes and the median have the same value. \nIncreasing the number of rolling times flatten the probability of each of the six outcomes to the same level.\n"

## Challenge 4
In the `data` folder of this repository you will find three different files with the prefix `ages_population`. These files contain information about a poll answered by a thousand people regarding their age. Each file corresponds to the poll answers in different neighbourhoods of Barcelona.

#### 1.- Read the file `ages_population.csv`. Calculate the frequency distribution and plot it as we did during the lesson. Try to guess the range in which the mean and the standard deviation will be by looking at the plot. 

In [35]:
# your code here
ages_pop = pd.read_csv(r'C:\Users\zluca\OneDrive\LAURA\Ironhack - data analyst\GitHub\week 4\Descriptive-Stats\data\ages_population.csv')
ages_pop.sort_values(by='observation')

Unnamed: 0,observation
489,1.0
209,1.0
301,2.0
451,2.0
338,4.0
...,...
523,69.0
437,70.0
493,71.0
339,73.0


In [36]:
freq_ages = get_frequency(ages_pop['observation'])
freq_ages = pd.DataFrame.from_dict(freq_ages, orient='index', columns=['observation']).reset_index()
freq_ages = freq_ages.rename(columns={'index':'ages'})
freq_ages

Unnamed: 0,ages,observation
0,39.0,45
1,41.0,36
2,30.0,34
3,35.0,33
4,43.0,32
...,...,...
67,73.0,1
68,82.0,1
69,70.0,1
70,71.0,1


In [37]:
fig_6=px.histogram(freq_ages, x='ages', y='observation',nbins=15, title='Ages per population')
fig_6.update_xaxes(range=[0,100])
fig_6.update_yaxes (range=[0, 500])
custom_tickvals = list(range(1, 100, 2))
fig_6.update_xaxes(tickvals=custom_tickvals)

In [38]:
''' This seems like a normal distribution; my guess is that mean is around 37/38. 
The standard deviation should be represented as the distance between the first quartile (or the third quartile)
from the mean. Supposing the first quartile is around 20 and the third around 53, 
I would say that the standard deviation should be between 17 and 22 (too big range to make a guess?)'''

' This seems like a normal distribution; my guess is that mean is around 37/38. \nThe standard deviation should be represented as the distance between the first quartile (or the third quartile)\nfrom the mean. Supposing the first quartile is around 20 and the third around 53, \nI would say that the standard deviation should be between 17 and 22 (too big range to make a guess?)'

#### 2.- Calculate the exact mean and standard deviation and compare them with your guesses. Do they fall inside the ranges you guessed?

In [39]:
# your code here
get_mean(freq_ages['ages'])

37.611111111111114

In [40]:
np.std(freq_ages)


ages           21.091438
observation    11.183514
dtype: float64

In [41]:
"""
Guesses were close!
"""

'\nGuesses were close!\n'

#### 3.- Now read the file `ages_population2.csv` . Calculate the frequency distribution and plot it.

In [42]:
# your code here
ages_pop_2 = pd.read_csv(r'C:\Users\zluca\OneDrive\LAURA\Ironhack - data analyst\GitHub\week 4\Descriptive-Stats\data\ages_population2.csv')

In [43]:
ages_pop_2

Unnamed: 0,observation
0,25.0
1,31.0
2,29.0
3,31.0
4,29.0
...,...
995,26.0
996,22.0
997,21.0
998,19.0


In [44]:
freq_ages_2 = get_frequency(ages_pop_2['observation'])
freq_ages_2 = pd.DataFrame.from_dict(freq_ages_2, orient='index', columns=['observation']).reset_index()
freq_ages_2 = freq_ages_2.rename(columns={'index':'ages'})
freq_ages_2

Unnamed: 0,ages,observation
0,28.0,139
1,27.0,125
2,26.0,120
3,29.0,115
4,25.0,98
5,30.0,90
6,24.0,78
7,31.0,61
8,23.0,41
9,22.0,35


In [45]:
fig_7=px.histogram(freq_ages_2, x='ages', y='observation',nbins=15, title='Ages per population')
fig_7.update_xaxes(range=[0,100])
fig_7.update_yaxes (range=[0, 500])
custom_tickvals = list(range(1, 100, 2))
fig_7.update_xaxes(tickvals=custom_tickvals)

####  4.- What do you see? Is there any difference with the frequency distribution in step 1?

In [46]:
"""
The shape of the plot is similar to the plot in step 1 - so close to a normal distribution, but a bit skewed to the right. 
The range for age values is much smaller, so the standard deviation would be for example also smaller than in the first plot.
That can be also guessed by the fact that the curve is 'tighter' than the one in step 1.
The mean seems around 28.
"""

"\nThe shape of the plot is similar to the plot in step 1 - so close to a normal distribution. \nThe range for age values is much smaller, so the standard deviation would be for example also smaller than in the first plot.\nThat can be also guessed by the fact that the curve is 'tighter' than the one in step 1.\nThe mean seems around 28.\n"

#### 5.- Calculate the mean and standard deviation. Compare the results with the mean and standard deviation in step 2. What do you think?

In [47]:
# your code here
get_mean(freq_ages_2['ages'])

27.5

In [48]:
np.std(freq_ages_2['ages'])

5.188127472091127

In [49]:
"""
The guesses were in the right direction...
"""

'\nThe guesses were in the right direction...\n'

## Challenge 5
Now is the turn of `ages_population3.csv`.

#### 1.- Read the file `ages_population3.csv`. Calculate the frequency distribution and plot it.

In [50]:
# your code here
ages_pop_3 = pd.read_csv(r'C:\Users\zluca\OneDrive\LAURA\Ironhack - data analyst\GitHub\week 4\Descriptive-Stats\data\ages_population3.csv')

In [51]:
freq_ages_3 = get_frequency(ages_pop_3['observation'])
freq_ages_3 = pd.DataFrame.from_dict(freq_ages_3, orient='index', columns=['observation']).reset_index()
freq_ages_3 = freq_ages_3.rename(columns={'index':'ages'})
freq_ages_3

Unnamed: 0,ages,observation
0,32.0,37
1,35.0,31
2,37.0,31
3,39.0,29
4,36.0,26
...,...,...
70,76.0,1
71,8.0,1
72,9.0,1
73,1.0,1


In [52]:
fig_8=px.histogram(freq_ages_3, x='ages', y='observation',nbins=15, title='Ages per population')
fig_8.update_xaxes(range=[0,100])
fig_8.update_yaxes (range=[0, 500])
custom_tickvals = list(range(1, 100, 2))
fig_8.update_xaxes(tickvals=custom_tickvals)

#### 2.- Calculate the mean and standard deviation. Compare the results with the plot in step 1. What is happening?

In [53]:
# your code here
get_mean(freq_ages_3['ages'])

39.92

In [54]:
np.std(freq_ages_3['ages'])

21.783639120526516

In [55]:
"""
Mean and standard deviation values are close in plot on ages_pop_1 and in this plot. 
However the plot looks different; there are a lot more values in the older side of the population. 
Visually, it seems that the difference between mean and the quartiles must be bigger here than in the first plot.
Maybe this will be shown better with a boxplot?
"""

'\nMean and standard deviation values are close in plot on ages_pop_1 and in this plot. \nHowever the plot looks different; there are a lot more values in the older side of the population. \nVisually, it seems that the difference between mean and the quartiles must be bigger here than in the first plot.\nMaybe this will be shown better with a boxplot?\n'

#### 3.- Calculate the four quartiles. Use the results to explain your reasoning for question in step 2. How much of a difference is there between the median and the mean?

In [56]:
# your code here
get_quartiles(freq_ages_3, 'ages')

(22.0, 43.0, 60.0)

In [57]:
get_quartiles(freq_ages, 'ages')

(20.0, 39.0, 56.0)

In [58]:
"""
The median and the third quartile are considerably higher in the third ages df. 
This is compatible with the many more values in the older side of the population showd by the plot.
In the first df median and mean are very close (like in a normal distribution curve). In this last case, the median is higher than the mean,
so the curve is negatively skewed.
"""

'\nThe median and the third quartile are considerably higher in the third ages df. \nThis is compatible with the many more values in the older side of the population showd by the plot.\nIn the first df median and mean are very close (like in a normal distribution curve). In this last case, the median is higher than the mean,\nso the curve is negatively skewed.\n'

#### 4.- Calculate other percentiles that might be useful to give more arguments to your reasoning.

In [65]:
print(np.percentile(freq_ages_3, 90))
print(np.percentile(freq_ages_3, 95))

62.099999999999994
69.54999999999998


In [66]:
print(np.percentile(freq_ages, 90))
print(np.percentile(freq_ages, 95))

58.70000000000002
65.85


In [59]:
"""
By showing the  values in the 90th and in the 95th percentile for the third and the first dataframe respectively,
we can see that in the third dataframe they are higher than in the first. This means that there are more older people 
represented in the third dataframe than in the first, as the percentiles are an indication of how the values are distributed.
The higher are the 90th and the 95th percentile, the higher is the number of ages values higher than q3 and the curve will be skewed to the left.
"""

'\nyour comments here\n'

## Bonus challenge
Compare the information about the three neighbourhoods. Prepare a report about the three of them. Remember to find out which are their similarities and their differences backing your arguments in basic statistics.

In [60]:
# your code here

In [61]:
"""
your comments here
"""

'\nyour comments here\n'