# Statistics in Python (NumPy)

### Install libraries

Use `!` at the start of a line to run a shell command in the terminal from a notebook cell

In [None]:
!pip install numpy
!pip install pandas
!pip install scipy
!pip install mysql-connector-python

### Import libraries

In [1]:
# Import numpy for statistical functions
import numpy as np
import pandas as pd
import mysql.connector as sql


### Read the Salaries as a Pandas Series

In [27]:
# Load up to 1000 salary values from the local MySQL `employees` database.
# NOTE(review): the credentials are hardcoded (root with an empty password);
# acceptable for a local demo only — do not reuse this against a shared server.
db_connection = sql.connect(host='127.0.0.1', database='employees', user='root', password='')
try:
    db_cursor = db_connection.cursor()
    try:
        db_cursor.execute("""SELECT salary FROM salaries limit 1000""")
        # fetchall() returns one tuple per row; each tuple holds the single salary value
        table_rows = db_cursor.fetchall()
    finally:
        # Release the cursor even if the query fails
        db_cursor.close()
finally:
    # Always give the connection back to the server, so re-running this cell
    # does not pile up open connections
    db_connection.close()
# No column names are passed, so the single column gets the default integer label 0
salaries_df = pd.DataFrame(table_rows)

In [28]:
# Preview the first 10 rows to confirm the query returned salary values
salaries_df.head(10)

Unnamed: 0,0
0,60117
1,62102
2,66074
3,66596
4,66961
5,71046
6,74333
7,75286
8,75994
9,76884


### Create a numpy array
- Getting the salary column as an array

In [29]:
# Column 0 is the only column of salaries_df and holds the salary values;
# copy it into a 1-D numpy array for the statistics below
salaries = np.array(salaries_df[0])
# Show the first 10 entries as a sanity check
salaries[:10]

array([60117, 62102, 66074, 66596, 66961, 71046, 74333, 75286, 75994,
       76884])

## Numpy Statistics

#### Mean

- The mean (average) of a data set is found by adding all numbers in the data set and then dividing by the number of values in the set



$${\displaystyle A={\frac {1}{n}}\sum _{i=1}^{n}a_{i}={\frac {a_{1}+a_{2}+\cdots +a_{n}}{n}}}$$

In [30]:
# By hand: add up every salary, then divide by how many salaries there are
mean = salaries.sum() / salaries.size
print(mean)
# Same figure straight from numpy's ready-made mean
mean = salaries.mean()
print(mean)

64419.765
64419.765


#### Median
- The median is the middle value when a data set is ordered from least to greatest

  $$m\left(x\right)
=\begin{cases}
  x_{\frac{n+1}{2}}                                  & n\text{ odd}\\
  \frac {1}{2}\left(x_{\frac{n}{2}} + x_{\frac{n}{2} + 1}\right) & n \text{ even}
\end{cases}$$


In [31]:
def get_median(arr):
    """Return the median of the values in ``arr``.

    The values are sorted; for an odd count the middle element is returned,
    for an even count the average of the two middle elements is returned.

    Args:
        arr: A non-empty sequence (list, numpy array, ...) of numbers.

    Returns:
        The middle element for an odd number of values, or the mean of the
        two middle elements (a float) for an even number of values.

    Raises:
        ValueError: If ``arr`` is empty.
    """
    n = len(arr)
    if n == 0:
        raise ValueError("get_median() requires at least one value")
    # Sort the argument that was passed in, not the notebook-level `salaries`,
    # so the function works for any input.
    sorted_arr = sorted(arr)
    if n % 2 == 0:
        # Even count: average the two values on either side of the centre
        return (sorted_arr[n//2] + sorted_arr[n//2 - 1])/2
    else:
        # Odd count: the centre element is the median
        return sorted_arr[n//2]

In [32]:
# Median from the hand-written get_median helper
print(get_median(salaries))
# Median from numpy's built-in function, for comparison
print(np.median(salaries))

62736.0
62736.0


#### Variance
- How far the data points in a population are from the population mean.

 $$\sigma^2 = \frac{\sum\limits_{i=1}^N (X_i -\mu)^2}{N}$$



In [33]:
def get_variance(arr):
    """Return the population variance of ``arr``.

    Computes the mean of the squared deviations from the mean, dividing by
    the number of values (N), not N - 1.

    Args:
        arr: A sequence of numbers, typically a numpy array.

    Returns:
        The sum of squared deviations from the mean divided by ``len(arr)``.
    """
    # Vectorised: subtracting the scalar mean broadcasts over every element
    centered = arr - np.mean(arr)
    squared = centered ** 2
    return np.sum(squared) / len(arr)

In [34]:
# Population variance from the hand-written get_variance helper
print(get_variance(salaries))
# numpy's built-in variance for comparison (by default it also divides by N)
print(np.var(salaries))

248320058.89377502
248320058.89377502


#### Standard Deviation
- The standard deviation is a measure of how close the numbers are to the mean
- Obtained from the square root of variance
$$\sigma = \sqrt{\frac{\sum\limits_{i=1}^N (X_i -\mu)^2}{N}}$$


In [35]:
# Standard deviation computed as the square root of the variance
print(np.sqrt(np.var(salaries)))
# Same value from numpy's built-in standard deviation function
print(np.std(salaries))

15758.174351547676
15758.174351547676


### Other Stats concepts

In [36]:
## scipy is needed for zscore
from scipy import stats

#### Z-score

- A z-score measures exactly how many standard deviations above or below the mean a data point is

$$ z =\frac{x_i-\mu}{\sigma}$$

In [37]:
# Compute the z-score of every salary in the array with scipy
z_scores = stats.zscore(salaries)

In [38]:
# Each z-score says how far a point is from the mean, in terms of standard deviations
z_scores[:10]

array([-0.27304971, -0.14708335,  0.10497631,  0.13810197,  0.16126456,
        0.42049509,  0.62908525,  0.6895618 ,  0.73449086,  0.79096948])