# Before you start:
- Read the README.md file
- Comment as much as you can and use the resources (README.md file)
- Happy learning!

In [2]:
# import numpy and pandas
import pandas as pd
import numpy as np
from scipy.stats import trim_mean, mode, skew, gaussian_kde, pearsonr, spearmanr, beta
from statsmodels.stats.weightstats import ztest as ztest

from scipy.stats import ttest_ind, norm, t
from scipy.stats import f_oneway
from scipy.stats import sem

# Challenge 1 - Exploring the Data

In this challenge, we will examine all salaries of employees of the City of Chicago. We will start by loading the dataset and examining its contents.

In [7]:
# Run this code:
# Load the City of Chicago employee salary CSV into the `salaries` DataFrame.
# NOTE(review): this is an absolute path under one user's home directory, so the
# cell only runs on a machine with the same layout — consider a repo-relative path.
salaries_csv_path = '/Users/sylviaperez-montero/Desktop/lab-hypothesis-testing-en-main/data/Current_Employee_Names__Salaries__and_Position_Titles.csv'
salaries = pd.read_csv(salaries_csv_path)

Examine the `salaries` dataset using the `head` function below.

In [12]:
# Preview the first five rows of the salaries dataset (five is also head()'s default).
print(salaries.head(n=5))


                  Name                              Job Titles  \
0    AARON,  JEFFERY M                                SERGEANT   
1      AARON,  KARINA   POLICE OFFICER (ASSIGNED AS DETECTIVE)   
2  AARON,  KIMBERLEI R                CHIEF CONTRACT EXPEDITER   
3  ABAD JR,  VICENTE M                       CIVIL ENGINEER IV   
4    ABASCAL,  REECE E             TRAFFIC CONTROL AIDE-HOURLY   

         Department Full or Part-Time Salary or Hourly  Typical Hours  \
0            POLICE                 F           Salary            NaN   
1            POLICE                 F           Salary            NaN   
2  GENERAL SERVICES                 F           Salary            NaN   
3       WATER MGMNT                 F           Salary            NaN   
4              OEMC                 P           Hourly           20.0   

   Annual Salary  Hourly Rate  
0       101442.0          NaN  
1        94122.0          NaN  
2       101592.0          NaN  
3       110064.0          NaN  
4   

# Challenge 2 - Hypothesis Tests

In this section of the lab, we will test whether the mean hourly wage of all hourly workers is significantly different from $30/hr. Import the correct one-sample test function from SciPy and perform a two-sided hypothesis test at the 95% confidence level.

In [18]:
# One-sample, two-sided t-test on the `Hourly Rate` column:
# is the mean hourly wage of hourly workers different from $30/hr?
#   H0: the mean hourly wage equals 30
#   H1: the mean hourly wage differs from 30

from scipy import stats

# Boolean mask selecting the employees who are paid by the hour.
is_hourly = salaries['Salary or Hourly'] == 'Hourly'
hourly_workers = salaries[is_hourly]

# Their hourly rates, with missing entries dropped before testing.
hourly_wages = hourly_workers['Hourly Rate'].dropna()

# ttest_1samp compares the sample mean against the hypothesised value of 30
# (two-sided by default) and returns the t statistic and the p-value.
t_stat, p_value = stats.ttest_1samp(hourly_wages, 30)

# A 95% confidence level corresponds to a 5% significance threshold.
alpha = 0.05

# Reject H0 only when the p-value falls below alpha; otherwise the data do not
# give enough evidence that the mean differs from $30/hr.
result = (
    "Reject the null hypothesis: The mean hourly wage is significantly different from $30/hr."
    if p_value < alpha
    else "Fail to reject the null hypothesis: The mean hourly wage is not significantly different from $30/hr."
)

# Report the decision.
print(result)

Reject the null hypothesis: The mean hourly wage is significantly different from $30/hr.


# Challenge 3 - Constructing Confidence Intervals

While testing our hypothesis is a great way to gather empirical evidence for accepting or rejecting the hypothesis, another way to gather evidence is by creating a confidence interval. A confidence interval gives us information about the true mean of the population. So for a 95% confidence interval, we are 95% confident that the mean of the population lies within the interval; more precisely, if we repeated the sampling many times, about 95% of the intervals constructed this way would contain the true mean.

To read more about confidence intervals, click [here](https://en.wikipedia.org/wiki/Confidence_interval).


In the cell below, we will construct a 95% confidence interval for the mean hourly wage of all hourly workers. 

The confidence interval is computed in SciPy using the `t.interval` function. You can read more about this function [here](https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.t.html).

To compute the confidence interval of the hourly wage, use the 0.95 for the confidence level, number of rows - 1 for degrees of freedom, the mean of the sample for the location parameter and the standard error for the scale. The standard error can be computed using [this](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.sem.html) function in SciPy.

In [20]:
# Constructs a 95% confidence interval for the mean hourly wage of all hourly workers.

from scipy import stats

# Select the hourly workers and keep their non-missing hourly rates.
hourly_workers = salaries[salaries['Salary or Hourly'] == 'Hourly']
hourly_wages = hourly_workers['Hourly Rate'].dropna()

# Sample size and sample mean (the mean is the location parameter of the interval).
n = len(hourly_wages)
sample_mean = hourly_wages.mean()

# Standard error of the mean (the scale parameter of the interval).
# stats.sem uses ddof=1 by default, i.e. sample standard deviation / sqrt(n),
# so it replaces the hand-written std / sqrt(n) computation.
standard_error = stats.sem(hourly_wages)

# Degrees of freedom of the t distribution. Named `dof` rather than `df`
# because `df` is conventionally used for DataFrames in pandas code.
dof = n - 1

# Two-sided 95% interval around the sample mean.
confidence_level = 0.95
lower, upper = stats.t.interval(confidence_level, dof, loc=sample_mean, scale=standard_error)

# Store plain Python floats so the tuple prints as "(low, high)" on every
# NumPy version (NumPy >= 2 would otherwise render np.float64(...) wrappers).
confidence_interval = (float(lower), float(upper))

# Print the result
print(f"The 95% confidence interval for the mean hourly wage is: {confidence_interval}")

The 95% confidence interval for the mean hourly wage is: (32.52345834488425, 33.05365708767623)


# Challenge 4 - Hypothesis Tests of Proportions

Another type of one-sample test is a hypothesis test of proportions. In this test, we examine whether the proportion of a group in our sample is significantly different from a given fraction. 

You can read more about one sample proportion tests [here](http://sphweb.bumc.bu.edu/otlt/MPH-Modules/BS/SAS/SAS6-CategoricalData/SAS6-CategoricalData2.html).

In the cell below, use the `proportions_ztest` function from `statsmodels` to perform a hypothesis test that will determine whether the proportion of hourly workers in the City of Chicago is significantly different from 25% at the 95% confidence level.

In [26]:
# Uses the proportions_ztest function from statsmodels to perform a hypothesis test
# that will determine whether the proportion of hourly workers in the City of Chicago is
# significantly different from 25% at the 95% confidence level.

# Null hypothesis, Ho: the proportion of hourly workers in Chicago is 25%, p = 0.25.
# Alternative hypothesis, Ha: the proportion of hourly workers is different from 25% (p not equal to 0.25).

from statsmodels.stats.proportion import proportions_ztest

# Reuse the `salaries` DataFrame already loaded in this notebook instead of
# reading the same CSV a second time from a hard-coded absolute path.
hourly_workers = salaries[salaries['Salary or Hourly'] == 'Hourly']

# Number of observations: every employee in the dataset.
total_workers = len(salaries)

# Number of "successes": employees paid by the hour.
count_hourly_workers = len(hourly_workers)

# Expected proportion under the null hypothesis
expected_proportion = 0.25

# Two-sided one-sample z-test of the observed proportion against 0.25.
# Passing scalar counts makes the function return scalar results, so no
# one-element arrays need to be unwrapped afterwards.
# NOTE: with the default prop_var=False the variance is estimated from the
# sample proportion rather than from the null value of 0.25.
z_stat, p_value = proportions_ztest(
    count_hourly_workers,
    total_workers,
    value=expected_proportion,
    alternative='two-sided',
)

# Print the results
print(f"Z-statistic: {z_stat:.4f}")
print(f"P-value: {p_value:.4f}")

# Decision based on the p-value: reject Ho when it falls below the 5% significance level.
alpha = 0.05
if p_value < alpha:
    result = "Reject the null hypothesis: The proportion of hourly workers is significantly different from 25%."
else:
    result = "Fail to reject the null hypothesis: The proportion of hourly workers is not significantly different from 25%."

print(result)

Z-statistic: -3.5100
P-value: 0.0004
Reject the null hypothesis: The proportion of hourly workers is significantly different from 25%.
