# Statistics in Python (NumPy)

### Install libraries

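Prefix a line with `!` to run it as a shell command from the notebook. For installing packages, the `%pip` magic is preferred because it installs into the environment of the running kernel.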

In [3]:
!pip install numpy
!pip install pandas
!pip install scipy



### Import libraries

In [29]:
# Import numpy for statistical functions
import numpy as np
# Import pandas for easy CSV reading
import pandas as pd

### Read the CSV

In [3]:
df = pd.read_csv('./StudentsPerformance.csv')
# Display only the top 5 elements
df.head(5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


### Shape tells you rows and columns of the dataframe

In [28]:
df.shape

(1000, 8)

In [27]:
df[df["gender"]=="female"].shape

(518, 8)

#### Reorder columns

In [40]:
cols = list(df.columns.values)
cols.sort(reverse=True)
df[cols]

Unnamed: 0,writing score,test preparation course,reading score,race/ethnicity,parental level of education,math score,lunch,gender
0,74,none,72,group B,bachelor's degree,72,standard,female
1,88,completed,90,group C,some college,69,standard,female
2,93,none,95,group B,master's degree,90,standard,female
3,44,none,57,group A,associate's degree,47,free/reduced,male
4,75,none,78,group C,some college,76,standard,male
...,...,...,...,...,...,...,...,...
995,95,completed,99,group E,master's degree,88,standard,female
996,55,none,55,group C,high school,62,free/reduced,male
997,65,completed,71,group C,high school,59,free/reduced,female
998,77,completed,78,group D,some college,68,standard,female


### Create an array from any column
- Getting math scores column as an array

In [6]:
math_scores = np.array(df['math score'])
math_scores[:10]

array([72, 69, 90, 47, 76, 71, 88, 40, 64, 38])

## Numpy & Pandas Statistics

#### Mean

- The mean (average) of a data set is found by adding all numbers in the data set and then dividing by the number of values in the set



$${\displaystyle A={\frac {1}{n}}\sum _{i=1}^{n}a_{i}={\frac {a_{1}+a_{2}+\cdots +a_{n}}{n}}}$$

In [7]:
# Using sum and dividing by total
mean = np.sum(math_scores) / len(math_scores)
print(mean)
# Using built in mean function
mean = np.mean(math_scores)
print(mean)

66.089
66.089


***Using Pandas:*** We can do the same operation with the dataframe

In [10]:
df["math score"].mean()

66.089

#### Median
- The median is the middle value when a data set is ordered from least to greatest

  $$m\left(x\right)
=\begin{cases}
  x_\frac{n+1}{2}                                    & n\text{ odd}\\
  \frac {1}{2}\left(x_{\frac{n}{2}} + x_{\frac{n}{2} + 1}\right) & n \text{ even}
\end{cases}$$


In [8]:
def get_median(arr):
    """Return the median of ``arr``.

    The values are sorted in ascending order. For an odd number of values the
    middle one is returned; for an even number the average of the two middle
    values is returned.

    Args:
        arr: A non-empty sequence (list, NumPy array, ...) of numbers.

    Returns:
        The middle element for odd-length input, or the mean of the two
        middle elements (a float) for even-length input.

    Raises:
        ValueError: If ``arr`` is empty.
    """
    n = len(arr)
    if n == 0:
        raise ValueError("cannot compute the median of an empty sequence")
    # Sort the data that was passed in, so the function works for any input
    # and does not depend on a variable defined elsewhere in the notebook.
    sorted_arr = sorted(arr)
    mid = n // 2
    if n % 2 == 0:
        # Even count: average the two values on either side of the centre.
        return (sorted_arr[mid - 1] + sorted_arr[mid]) / 2
    return sorted_arr[mid]

In [9]:
# Using own formula
print(get_median(math_scores))
# Using built in median function
print(np.median(math_scores))

66.0
66.0


***Using Pandas:*** We can do the same operation with the dataframe

In [20]:
df["math score"].median()

66.0

#### Variance
- How far the data points in a population are from the population mean.

 $$\sigma^2 = \frac{\sum\limits_{i=1}^N (x_i -\mu)^2}{N}$$



In [10]:
def get_variance(arr):
    """Return the population variance of ``arr``.

    Computes the mean of the squared deviations from the mean, dividing by
    the number of values (N), using vectorised NumPy operations.

    Args:
        arr: Sequence or NumPy array of numbers.

    Returns:
        The sum of squared deviations from the mean divided by ``len(arr)``.
    """
    center = np.mean(arr)
    # Vectorised: subtract the mean from every value and square the result.
    squared_deviations = np.square(arr - center)
    return np.sum(squared_deviations) / len(arr)

In [11]:
print(get_variance(math_scores))
print(np.var(math_scores))

229.68907899999996
229.68907899999996


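***Using Pandas:*** We can do a similar operation with the dataframe. Note that pandas computes the *sample* variance by default (`ddof=1`, dividing by $N-1$ instead of $N$), so the result differs slightly from `np.var`. Use `df["math score"].var(ddof=0)` to get the population variance shown above.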

In [17]:
df["math score"].var()

229.91899799799847

#### Standard Deviation
- The standard deviation measures how spread out the values are around the mean
- Obtained from the square root of variance

$$\sigma = \sqrt{\frac{\sum\limits_{i=1}^N (x_i -\mu)^2}{N}}$$


In [12]:
# Using just sqrt of variance
print(np.sqrt(np.var(math_scores)))
# Using built in numpy function
print(np.std(math_scores))

15.155496659628149
15.155496659628149


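***Using Pandas:*** We can do a similar operation with the dataframe. As with the variance, pandas uses the *sample* standard deviation by default (`ddof=1`), so the result differs slightly from `np.std`. Use `df["math score"].std(ddof=0)` to match the population value above.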

In [18]:
df["math score"].std()

15.163080096009468

### Other Stats concepts

In [58]:
## scipy is needed for zscore
from scipy import stats

#### Z-score

- A z-score measures exactly how many standard deviations above or below the mean a data point is

$$ z =\frac{x_i-\mu}{\sigma}$$

In [60]:
z_scores = stats.zscore(math_scores)

In [62]:
# zscore calculates how far every point is from the mean, in terms of standard deviations
z_scores[:10]

array([ 0.39002351,  0.19207553,  1.57771141, -1.25954302,  0.65395415,
        0.32404085,  1.44574609, -1.72142165, -0.13783778, -1.85338697])

#### Covariance

- How two variables change in relation to each other over the data set
- A positive value means they tend to move in the same direction: when one increases, the other tends to increase too
- A negative value means they tend to move in opposite directions: when one increases, the other tends to decrease, and vice versa

$$cov_{x,y}=\frac{\sum_{i=1}^{N}(x_{i}-\bar{x})(y_{i}-\bar{y})}{N-1}$$

In [67]:
# getting reading scores, to evaluate how correlated these values are with math scores
reading_scores = np.array(df['reading score'])

In [68]:
def get_covariance(x, y):
    """Return the sample covariance of ``x`` and ``y``.

    Multiplies the deviations of each variable from its own mean pairwise,
    sums the products and divides by ``len(x) - 1``.

    Args:
        x: NumPy array of numbers.
        y: NumPy array of numbers, paired element-wise with ``x``.

    Returns:
        The sum of the products of paired deviations divided by
        ``len(x) - 1``.
    """
    x_centered = x - np.mean(x)
    y_centered = y - np.mean(y)
    # Sample covariance divides by N - 1 rather than N.
    degrees_of_freedom = len(x) - 1
    return np.sum(x_centered * y_centered) / degrees_of_freedom

In [70]:
get_covariance(math_scores, reading_scores)

180.99895795795797

In [66]:
# Numpy creates a matrix of all covariances, it can handle more than 2 sets of values
np.cov(math_scores, reading_scores)

array([[229.918998  , 180.99895796],
       [180.99895796, 213.1656046 ]])

#### Correlation coefficient

- Measures the direction and strength of a linear relationship
- This coefficient is normalized, meaning its values are between -1 and 1
- The closer to -1 or 1, the more strongly the variables are correlated
- The closer to 0, the weaker the linear correlation

$$ r = \frac{N\sum{xy}-\sum{x}\sum{y}}{\sqrt{\left[N\sum{x^2}-\left(\sum{x}\right)^2\right]\left[N\sum{y^2}-\left(\sum{y}\right)^2\right]}} $$


In [71]:
def get_correlation_coeficient(x, y):
    """Return the Pearson correlation coefficient of ``x`` and ``y``.

    Uses the computational formula

        r = (N*sum(xy) - sum(x)*sum(y))
            / sqrt((N*sum(x^2) - sum(x)^2) * (N*sum(y^2) - sum(y)^2))

    Args:
        x: Sequence or NumPy array of numbers.
        y: Sequence or NumPy array of numbers with the same shape as ``x``.

    Returns:
        The correlation coefficient as a float.

    Raises:
        ValueError: If ``x`` and ``y`` do not have the same shape.
    """
    # Convert to float64 arrays: this accepts plain lists and prevents the
    # element-wise products and squares from overflowing when the inputs use
    # a narrow integer dtype (e.g. int8).
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.shape != y.shape:
        # Without this check NumPy could silently broadcast the inputs.
        raise ValueError("x and y must have the same shape")
    n = x.size
    sum_x_y = np.sum(x * y)
    sum_x = np.sum(x)
    sum_y = np.sum(y)
    sum_x_2 = np.sum(x ** 2)
    sum_y_2 = np.sum(y ** 2)
    numerator = n * sum_x_y - (sum_x * sum_y)
    denominator = np.sqrt((n * sum_x_2 - sum_x ** 2) * (n * sum_y_2 - sum_y ** 2))
    return numerator / denominator

In [72]:
get_correlation_coeficient(math_scores, reading_scores)

0.817579663672054

In [73]:
# NumPy returns the full correlation matrix; the off-diagonal entries are r.
# Pass the same reading_scores array that was given to the manual function.
np.corrcoef(math_scores, reading_scores)

array([[1.        , 0.81757966],
       [0.81757966, 1.        ]])