# `Standard Imports`

In [7]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns

import env

* from pydataset import data
* mpg = data('mpg')
* mpg.head()

# `Stats Tests`

> discrete - discrete = chi 2

> discrete - continuous = means test

> continuous - continuous = correlation

> 2 or more independent groups = ANOVA

* # $Chi^2$ Test

Lets us test the hypothesis that one group is independent of another

$H_0$: is always that there is no association between the groups (they are independent)

$H_a$: is that there is an association (they are not independent) between the groups

> `chi^2: is run on discrete vs discrete`

1. Form hypothesis
2. Make contingency table (pd.crosstab)
3. Use stats.chi2_contingency

In [33]:
def chi2_test(table):
    """
    Run a chi-squared test of independence on a contingency table and print the outcome.

    Parameters
    ----------
    table : array-like
        Observed frequencies, e.g. the result of ``pd.crosstab``.

    Prints the observed counts, the expected counts (truncated to integers for
    display), the chi^2 statistic, the p-value, and a conclusion at α = 0.05.
    Returns None.
    """
    α = 0.05
    chi2, pval, _degf, expected = stats.chi2_contingency(table)
    print('Observed')
    # The observed counts are the table passed in; np.asarray lets both a
    # crosstab DataFrame and a plain nested list / ndarray be displayed.
    print(np.asarray(table))
    print('\nExpected')
    print(expected.astype(int))
    print('\n----')
    print(f'chi^2 = {chi2:.4f}')
    print(f'p-value = {pval:.4f}')
    print('----')
    if pval < α:
        print('We reject the null hypothesis.')
    else:
        print("We fail to reject the null hypothesis.")

* # One Sample \ Two-Tailed

> discrete - continuous = means test
1. Plot Distributions
2. Establish Hypotheses
3. Set alpha
4. Verify Assumptions
    * Normal Distribution
        * `run wilcoxon signed rank test if non-parametric`
    * Sample Size > 30
    * Check Variance (variance is the "spread" of the data)
        * `Standard Deviation is the square root of variance`
5. Compute t-statistic and p-value (stats.ttest_1samp)
6. Conclude

In [22]:
def conclude_1samp_tt(group1, group_mean):
    """
    One-sample, two-tailed t-test with a printed conclusion.

    Parameters
    ----------
    group1 : array-like
        Sample observations.
    group_mean : float
        The mean the sample is compared against (passed to ``stats.ttest_1samp``).

    Prints the t-statistic, the two-sided p-value, and the conclusion at α = 0.05.
    Returns None.
    """
    α = 0.05
    tstat, p = stats.ttest_1samp(group1, group_mean)
    print('t-stat')
    print(tstat)
    print('P-Value')
    print(p)
    print('\n----')
    # Two-tailed: the direction of the difference does not matter, so only the
    # two-sided p-value is compared with α (no sign check on the t-statistic).
    if p < α:
        print("we can reject the null hypothesis.")
    else:
        print('We fail to reject the null hypothesis.')

* # One Sample \ One-Tailed (Greater Than)

In [13]:
def conclude_1samp_gt(group1, group_mean):
    """
    One-sample, one-tailed ("greater than") t-test with a printed conclusion.

    Runs ``stats.ttest_1samp(group1, group_mean)``, prints the t-statistic and
    the two-sided p-value it returns, then rejects the null hypothesis only
    when half of that p-value is below 0.05 AND the t-statistic is positive.
    Returns None.
    """
    alpha = 0.05
    t_value, p_two_sided = stats.ttest_1samp(group1, group_mean)
    # Same seven output lines as individual print calls would produce.
    print('t-stat', t_value, 'P-Value', p_two_sided, '\n----', sep='\n')
    significant = (p_two_sided / 2) < alpha
    verdict = (
        "we can reject the null hypothesis."
        if significant and (t_value > 0)
        else 'We fail to reject the null hypothesis.'
    )
    print(verdict)

* # One Sample \ One-Tailed (Less Than)

In [14]:
def conclude_1samp_lt(group1, group_mean):
    """
    One-sample, one-tailed ("less than") t-test with a printed conclusion.

    Runs ``stats.ttest_1samp(group1, group_mean)``, prints the t-statistic and
    the two-sided p-value it returns, then rejects the null hypothesis only
    when half of that p-value is below 0.05 AND the t-statistic is negative.
    Returns None.
    """
    alpha = 0.05
    t_value, p_two_sided = stats.ttest_1samp(group1, group_mean)
    for line in ('t-stat', t_value, 'P-Value', p_two_sided, '\n----'):
        print(line)
    # Guard clause: anything that is not "significant and below" fails to reject.
    if not (((p_two_sided / 2) < alpha) and (t_value < 0)):
        print('We fail to reject the null hypothesis.')
        return
    print("we can reject the null hypothesis.")

* # Two Sample \ Two-Tailed

1. Plot Distributions
2. Establish Hypotheses
3. Set alpha
4. Verify Assumptions
    * Normal Distribution
        * `run mann-whitney test if non-parametric`
    * Sample Size > 30
    * Independent Samples?
5. Compute t-statistic and p-value (stats.ttest_ind)
6. Conclude

In [15]:
def conclude_2samp_tt(sample1, sample2, equal_var=True):
    """
    Independent two-sample, two-tailed t-test with a printed conclusion.

    Parameters
    ----------
    sample1, sample2 : array-like
        The two independent samples to compare.
    equal_var : bool, default True
        Forwarded to ``stats.ttest_ind``. Pass False to run Welch's t-test
        when the two samples do not have similar variances.

    Prints the t-statistic, the two-sided p-value, and the conclusion at α = 0.05.
    Returns None.
    """
    α = 0.05
    # One name for the statistic throughout: it is unpacked and printed as `tstat`.
    tstat, p = stats.ttest_ind(sample1, sample2, equal_var=equal_var)
    print('stat')
    print(tstat)
    print('P-Value')
    print(p)
    print('\n----')
    if p < α:
        print("we can reject the null hypothesis.")
    else:
        print('We fail to reject the null hypothesis.')

* # Two Sample \ One-Tailed (Greater Than)

In [18]:
def conclude_2samp_gt(sample1, sample2, equal_var=True):
    """
    Independent two-sample, one-tailed ("sample1 greater than sample2") t-test.

    Parameters
    ----------
    sample1, sample2 : array-like
        The two independent samples to compare.
    equal_var : bool, default True
        Forwarded to ``stats.ttest_ind``. Pass False to run Welch's t-test
        when the two samples do not have similar variances.

    Prints the t-statistic and the two-sided p-value, then rejects the null
    hypothesis only when half of that p-value is below α = 0.05 and the
    t-statistic is positive. Returns None.
    """
    α = 0.05
    # One name for the statistic throughout: it is unpacked, printed and tested as `tstat`.
    tstat, p = stats.ttest_ind(sample1, sample2, equal_var=equal_var)
    print('stat')
    print(tstat)
    print('P-Value')
    print(p)
    print('\n----')
    # One-tailed: halve the two-sided p-value and require the expected direction.
    if ((p / 2) < α) and (tstat > 0):
        print("we can reject the null hypothesis.")
    else:
        print('We fail to reject the null hypothesis.')

* # Two Sample \ One-Tailed (Less Than)

In [19]:
def conclude_2samp_lt(sample1, sample2, equal_var=True):
    """
    Independent two-sample, one-tailed ("sample1 less than sample2") t-test.

    Parameters
    ----------
    sample1, sample2 : array-like
        The two independent samples to compare.
    equal_var : bool, default True
        Forwarded to ``stats.ttest_ind``. Pass False to run Welch's t-test
        when the two samples do not have similar variances.

    Prints the t-statistic and the two-sided p-value, then rejects the null
    hypothesis only when half of that p-value is below α = 0.05 and the
    t-statistic is negative. Returns None.
    """
    α = 0.05
    # One name for the statistic throughout: it is unpacked, printed and tested as `tstat`.
    tstat, p = stats.ttest_ind(sample1, sample2, equal_var=equal_var)
    print('stat')
    print(tstat)
    print('P-Value')
    print(p)
    print('\n----')
    # One-tailed: halve the two-sided p-value and require the expected direction.
    if ((p / 2) < α) and (tstat < 0):
        print("we can reject the null hypothesis.")
    else:
        print('We fail to reject the null hypothesis.')

* # ANOVA

1. Plot Distributions
2. Establish Hypotheses
    * $H_0$: The means are all equal
    * $H_a$: At least 2 groups' means are not equal
3. Set alpha
4. Verify Assumptions
    * Normal Distribution
        * `run kruskal-wallis test if non-parametric`
    * Sample Size > 30
    * Independent Samples?
5. Compute F-statistic and p-value (stats.f_oneway)
6. Conclude

In [21]:
def conclude_anova(theoretical_mean, group1, group2):
    """
    One-way ANOVA across three samples with a printed conclusion.

    All three arguments are passed, in order, to ``stats.f_oneway`` as the
    samples to compare. Prints the statistic it returns, the p-value, and
    whether the null hypothesis is rejected at the 0.05 level. Returns None.
    """
    alpha = 0.05
    samples = (theoretical_mean, group1, group2)
    f_stat, p_value = stats.f_oneway(*samples)
    print('stat', f_stat, 'P-Value', p_value, '----', sep='\n')
    if not (p_value < alpha):
        print('We fail to reject the null hypothesis.')
        return
    print("We can reject the null hypothesis.")

* # `Correlation`

![image.png](attachment:image.png)

### Hypothesis Testing Procedure:

1. Determine the appropriate test
    * what variables you are comparing?
        > `continuous - continuous = correlation`
    * are your variables continuous or categorical?
2. Setup
    * for normal distributions: stats.pearsonr
    * for nonparametric test: stats.spearmanr
    * set null hypothesis
    * $H_0$: "there is no linear relationship between the two variables"
3. Visualize
4. Calculate test statistic + p-value
5. Conclude


* ## `PEARSON (parametric)`

In [28]:
def conclude_pearsonr(floats1, floats2):
    """
    Pearson correlation test with a printed conclusion.

    Runs ``stats.pearsonr(floats1, floats2)``, prints the correlation
    coefficient and p-value it returns, then states whether the null
    hypothesis is rejected at the 0.05 level. Returns None.
    """
    alpha = 0.05
    corr, p_value = stats.pearsonr(floats1, floats2)
    for label, value in (('r =', corr), ('p =', p_value)):
        print(label, value)
    print('----')
    verdict = (
        "We can reject the null hypothesis."
        if p_value < alpha
        else "We fail to reject the null hypothesis."
    )
    print(verdict)

* ## `SPEARMAN (non-parametric)`

In [31]:
def conclude_spearmanr(floats1, floats2):
    """
    Spearman rank-correlation test with a printed conclusion.

    Runs ``stats.spearmanr(floats1, floats2)``, prints the correlation
    coefficient and p-value it returns, then states whether the null
    hypothesis is rejected at the 0.05 level. Returns None.
    """
    alpha = 0.05
    rho, p_value = stats.spearmanr(floats1, floats2)
    print('r =', rho)
    print('p =', p_value)
    print('----')
    # Look the verdict up by the outcome of the significance comparison.
    verdicts = {
        True: "We can reject the null hypothesis.",
        False: "We fail to reject the null hypothesis.",
    }
    print(verdicts[bool(p_value < alpha)])

# `SQL -> Python`

In [27]:
from env import username, password, host

def get_connection(db, user=username, host=host, password=password):
    """
    Build the connection URL string for database `db`.

    The result has the form ``mysql+pymysql://user:password@host/db``; `user`,
    `host` and `password` default to the values imported from `env`.
    """
    return 'mysql+pymysql://{user}:{password}@{host}/{db}'.format(
        user=user, password=password, host=host, db=db
    )


# Usage reminder printed when this cell runs.
for hint in (
    'set url equal to function call',
    'insert url into pd.read_sql("query", url)',
):
    print(hint)

set url equal to function call
insert url into pd.read_sql("query", url)


In [24]:
# Build the connection URL string for the 'employees' database.
url = get_connection('employees')

In [17]:
# Read every row of the employees table through `url`, then preview the first rows.
emp = pd.read_sql('select * from employees', url)
emp.head()

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date
0,10001,1953-09-02,Georgi,Facello,M,1986-06-26
1,10002,1964-06-02,Bezalel,Simmel,F,1985-11-21
2,10003,1959-12-03,Parto,Bamford,M,1986-08-28
3,10004,1954-05-01,Chirstian,Koblick,M,1986-12-01
4,10005,1955-01-21,Kyoichi,Maliniak,M,1989-09-12


# `TESTING AREA`

In [19]:
def new_data(SQL_query, db): # verified!
    """
    Run `SQL_query` against database `db` and return the result as a DataFrame.

    The connection URL comes from ``env.get_connection(db)`` and is handed
    straight to ``pd.read_sql`` together with the query.
    """
    return pd.read_sql(SQL_query, env.get_connection(db))

In [9]:
# Query the whole passengers table from titanic_db; the returned frame is this cell's output.
new_data('select * from passengers', 'titanic_db')

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,1
887,887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,1
888,888,0,3,female,,1,2,23.4500,S,Third,,Southampton,0
889,889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,1


In [20]:
def get_SQL_data(SQL_query, db, filename):
    """
    Return the result of `SQL_query` as a DataFrame, cached in a local csv file.

    This function will:
    - Check whether the csv file `filename` exists
        - if it does, read it and return the df
    - IF the csv does not exist
        - create a df of the SQL_query via new_data (remember to keep query BROAD)
        - write the df to `filename`
    - Return df

    Parameters
    ----------
    SQL_query : str
        Query to run when there is no cached csv.
    db : str
        Database name passed to new_data.
    filename : str
        Path of the csv cache; a relative path is resolved against the
        current working directory.
    """
    # Check the same path that read_csv / to_csv use, i.e. the `filename`
    # argument itself rather than a file literally named "filename".
    if os.path.isfile(filename):
        # The cache is written with the index as its first column, so read that
        # column back as the index instead of as an extra "Unnamed: 0" column.
        return pd.read_csv(filename, index_col=0)
    df = new_data(SQL_query, db)
    df.to_csv(filename)
    return df