In [1]:
import numpy as np
import pandas as pd
from scipy import stats

# EXERCISE 1. The hourly wages in a particular industry are normally distributed with mean $13.20 and standard deviation $2.50. A company in this industry employs 40 workers, paying them an average of $12.20 per hour. Can this company be accused of paying substandard wages? Use an α = .01 level test.

In [2]:
mean1 = 13.20
mean2 = 12.20
std = 2.50
sample = 40

In [3]:
z_test = (mean2 - mean1) / (std / np.sqrt(sample))
z_test

-2.5298221281347035

-z0.01 = -2.326

-2.53 < -z0.01. So, H0 is rejected: there is evidence that the company is paying substandard wages.

# EXERCISE 2. Shear strength measurements derived from unconfined compression tests for two types of soils gave the results shown in the following document (measurements in tons per square foot). Do the soils appear to differ with respect to average shear strength, at the 1% significance level?

In [4]:
soil = pd.read_csv("/content/soil - Sheet1.csv")
soil.head()

Unnamed: 0,Soil1,Soil2
0,1.442,1.364
1,1.943,1.878
2,1.11,1.337
3,1.912,1.828
4,1.553,1.371


In [5]:
stats.ttest_ind(soil["Soil1"], soil["Soil2"], nan_policy="omit")

Ttest_indResult(statistic=5.1681473319343345, pvalue=2.5932287323528147e-06)

With α = .01, the test rejects if |t| > 2.576. So, we can reject the hypothesis that the soils have equal mean shear strengths.

# EXERCISE 3. The following dataset is based on data provided by the World Bank (https://datacatalog.worldbank.org/dataset/education-statistics). World Bank Edstats.

1. Get descriptive statistics (the central tendency, dispersion and shape of a dataset’s distribution) for each continent group (AS, EU, AF, NA, SA, OC).

In [6]:
import numpy as np
import pandas as pd
from scipy import stats

In [7]:
df = pd.read_csv("/content/2015 PISA Test - Sheet1.csv")
df.head()

Unnamed: 0,Country Code,Continent_Code,internet_users_per_100,Math,Reading,Science
0,ALB,EU,63.252933,413.157,405.2588,427.225
1,ARE,AS,90.5,427.4827,433.5423,436.7311
2,ARG,SA,68.043064,409.0333,425.3031,432.2262
3,AUS,OC,84.560519,493.8962,502.9006,509.9939
4,AUT,EU,83.940142,496.7423,484.8656,495.0375


In [8]:
df.groupby(["Continent_Code"], dropna=False).describe()

Unnamed: 0_level_0,internet_users_per_100,internet_users_per_100,internet_users_per_100,internet_users_per_100,internet_users_per_100,internet_users_per_100,internet_users_per_100,internet_users_per_100,Math,Math,Math,Math,Math,Math,Math,Math,Reading,Reading,Reading,Reading,Reading,Reading,Reading,Reading,Science,Science,Science,Science,Science,Science,Science,Science
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Continent_Code,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2
AF,2.0,43.359918,7.297226,38.2,40.779959,43.359918,45.939877,48.519836,2.0,363.2121,5.099513,359.6062,361.40915,363.2121,365.01505,366.818,2.0,355.4574,7.916909,349.8593,352.65835,355.4574,358.25645,361.0555,2.0,381.07425,7.536556,375.7451,378.409675,381.07425,383.738825,386.4034
AS,17.0,68.455613,21.08606,21.976068,50.3,74.0,84.948353,92.884826,17.0,466.216647,64.35649,380.259,403.8332,459.816,531.2961,564.1897,17.0,454.059682,57.048962,346.549,408.1022,433.5423,508.6905,535.1002,17.0,467.945847,56.671371,386.4854,417.6112,456.4836,523.2774,555.5747
EU,37.0,77.274888,12.425773,53.744979,68.6329,76.184,87.479056,98.2,37.0,477.981449,35.150403,371.3114,464.0401,489.7287,503.722,521.2506,37.0,476.164608,37.720545,351.7415,452.5143,487.2501,499.8146,526.4247,37.0,478.299381,34.450616,383.6824,460.7749,490.225,501.9369,534.1937
OC,2.0,86.391704,2.589686,84.560519,85.476112,86.391704,87.307296,88.222889,2.0,494.55975,0.938401,493.8962,494.227975,494.55975,494.891525,495.2233,2.0,506.08565,4.504341,502.9006,504.493125,506.08565,507.678175,509.2707,2.0,511.6487,2.340241,509.9939,510.8213,511.6487,512.4761,513.3035
SA,7.0,60.180494,9.772455,40.9,57.116462,64.289,66.321532,69.198471,7.0,402.8877,18.128894,377.0695,388.1022,409.0333,417.61765,422.6714,7.0,425.359229,19.709688,397.5414,416.1269,425.3031,431.9227,458.5709,7.0,421.747186,18.470319,396.6836,408.20545,424.5905,433.7946,446.9561
,5.0,66.886792,14.364359,54.215766,57.431043,59.76295,74.554202,88.47,5.0,424.25096,71.717058,327.702,400.2534,408.0235,469.6285,515.6474,5.0,446.42092,66.614744,357.7377,423.2765,427.4875,496.9351,526.6678,5.0,438.18076,76.789918,331.6388,415.7099,419.608,496.2424,527.7047


In [55]:
df_mean = df.groupby(["Continent_Code"], dropna=False).mean()
df_mean

Unnamed: 0_level_0,internet_users_per_100,Math,Reading,Science
Continent_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AF,43.359918,363.2121,355.4574,381.07425
AS,68.455613,466.216647,454.059682,467.945847
EU,77.274888,477.981449,476.164608,478.299381
OC,86.391704,494.55975,506.08565,511.6487
SA,60.180494,402.8877,425.359229,421.747186
,66.886792,424.25096,446.42092,438.18076


In [56]:
df_median = df.groupby(["Continent_Code"], dropna=False).median()
df_median

Unnamed: 0_level_0,internet_users_per_100,Math,Reading,Science
Continent_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AF,43.359918,363.2121,355.4574,381.07425
AS,74.0,459.816,433.5423,456.4836
EU,76.184,489.7287,487.2501,490.225
OC,86.391704,494.55975,506.08565,511.6487
SA,64.289,409.0333,425.3031,424.5905
,59.76295,408.0235,427.4875,419.608


In [57]:
df_mode = df.groupby(["Continent_Code"], dropna=False).agg(lambda x:x.value_counts().index[0])
df_mode

Unnamed: 0_level_0,Country Code,internet_users_per_100,Math,Reading,Science
Continent_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AF,TUN,38.2,366.818,361.0555,375.7451
AS,KAZ,21.976068,446.1098,535.1002,524.6445
EU,LVA,71.378,492.3254,416.2293,464.7819
OC,AUS,84.560519,495.2233,509.2707,513.3035
SA,TTO,40.9,417.9919,407.3486,446.9561
,DOM,59.76295,515.6474,423.2765,527.7047


In [58]:
df_std = df.groupby(["Continent_Code"], dropna=False).std()
df_std

Unnamed: 0_level_0,internet_users_per_100,Math,Reading,Science
Continent_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AF,7.297226,5.099513,7.916909,7.536556
AS,21.08606,64.35649,57.048962,56.671371
EU,12.425773,35.150403,37.720545,34.450616
OC,2.589686,0.938401,4.504341,2.340241
SA,9.772455,18.128894,19.709688,18.470319
,14.364359,71.717058,66.614744,76.789918


In [59]:
df_variance = df.groupby(["Continent_Code"], dropna=False).var()
df_variance

Unnamed: 0_level_0,internet_users_per_100,Math,Reading,Science
Continent_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AF,53.249507,26.00503,62.677447,56.799679
AS,444.621918,4141.757822,3254.584049,3211.644321
EU,154.399841,1235.550805,1422.83954,1186.844969
OC,6.706475,0.880597,20.289087,5.476726
SA,95.500869,328.656783,388.471816,341.152678
,206.334814,5143.336351,4437.524053,5896.691519


In [14]:
Q1 = df.groupby(["Continent_Code"], dropna=False).quantile([0.25])
Q1

Unnamed: 0_level_0,Unnamed: 1_level_0,internet_users_per_100,Math,Reading,Science
Continent_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AF,0.25,40.779959,361.40915,352.65835,378.409675
AS,0.25,50.3,403.8332,408.1022,417.6112
EU,0.25,68.6329,464.0401,452.5143,460.7749
OC,0.25,85.476112,494.227975,504.493125,510.8213
SA,0.25,57.116462,388.1022,416.1269,408.20545
,0.25,57.431043,400.2534,423.2765,415.7099


In [15]:
Q3 = df.groupby(["Continent_Code"], dropna=False).quantile([0.75])
Q3

Unnamed: 0_level_0,Unnamed: 1_level_0,internet_users_per_100,Math,Reading,Science
Continent_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AF,0.75,45.939877,365.01505,358.25645,383.738825
AS,0.75,84.948353,531.2961,508.6905,523.2774
EU,0.75,87.479056,503.722,499.8146,501.9369
OC,0.75,87.307296,494.891525,507.678175,512.4761
SA,0.75,66.321532,417.61765,431.9227,433.7946
,0.75,74.554202,469.6285,496.9351,496.2424


In [16]:
grouper = df.groupby("Continent_Code", dropna=False)
iqr = grouper.quantile([0.75, 0.25]).groupby("Continent_Code").agg(np.subtract.reduce)
iqr

Unnamed: 0_level_0,internet_users_per_100,Math,Reading,Science
Continent_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AF,5.159918,3.6059,5.5981,5.32915
AS,34.648353,127.4629,100.5883,105.6662
EU,18.846156,39.6819,47.3003,41.162
OC,1.831185,0.66355,3.18505,1.6548
SA,9.20507,29.51545,15.7958,25.58915
,17.123159,69.3751,73.6586,80.5325


In [17]:
df.groupby("Continent_Code", dropna=False).count()

Unnamed: 0_level_0,Country Code,internet_users_per_100,Math,Reading,Science
Continent_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AF,2,2,2,2,2
AS,17,17,17,17,17
EU,37,37,37,37,37
OC,2,2,2,2,2
SA,7,7,7,7,7
,5,5,5,5,5


2. Determine whether there is any difference (on the average) for the math scores among European (EU) and Asian (AS) countries (assume normality and equal variances). Draw side-by-side box plots.

In [98]:
import seaborn as sns

In [102]:
avg_math = df.groupby(["Continent_Code", "Country Code"])[["Math"]].mean()
avg_math

Unnamed: 0_level_0,Unnamed: 1_level_0,Math
Continent_Code,Country Code,Unnamed: 2_level_1
AF,DZA,359.6062
AF,TUN,366.8180
AS,ARE,427.4827
AS,CHN,531.2961
AS,GEO,403.8332
...,...,...
SA,CHL,422.6714
SA,COL,389.6438
SA,PER,386.5606
SA,TTO,417.2434


In [273]:
avg_math.loc["AS"].mean()

Math    466.216647
dtype: float64

In [274]:
avg_math.loc["EU"].mean()

Math    477.981449
dtype: float64

In [275]:
stats.ttest_ind(avg_math.loc["EU"], avg_math.loc["AS"])

Ttest_indResult(statistic=array([0.87005532]), pvalue=array([0.38826888]))

The low t value indicates that there is no significant difference between European and Asian countries for mean mathematics scores.

# EXERCISE 4. The sample dataset has placement test scores (out of 100 points) for four subject areas: English, Reading, Math, and Writing. Students in the sample completed all 4 placement tests when they enrolled in the university. Suppose we are particularly interested in the English and Math sections, and want to determine whether students tended to score higher on their English or Math test, on average. We could use a paired t test to test if there was a significant difference in the average of the two tests.

In [30]:
grade = pd.read_csv("/content/students_2014 - students_2014.csv")
grade.head()

Unnamed: 0,ids,bday,enrolldate,expgradate,Rank,Major,Gender,Athlete,Height,Weight,Smoking,Sprint,MileMinDur,English,Reading,Math,Writing,State,LiveOnCampus,HowCommute,CommuteTime,SleepTime,StudyTime
0,43783,3/22/1995,,,,,0.0,0,72.35,,0.0,7.978,,88.24,81.5,60.02,81.44,In state,1.0,,,7.0,1.0
1,20278,1/1/1995,,,,Philosophy,0.0,0,70.66,179.2,0.0,8.004,0:06:21,89.45,85.25,70.19,73.27,,1.0,,,5.0,2.0
2,20389,12/31/1994,,,,,0.0,0,70.68,198.52,0.0,,0:07:00,96.73,86.88,71.2,84.24,In state,,,,8.0,7.0
3,22820,12/1/1994,,,,business administration,1.0,0,,198.34,,8.473,0:12:44,74.06,88.68,55.89,73.16,In state,1.0,,,2.0,6.0
4,24559,11/10/1994,,,,,1.0,1,67.43,128.17,2.0,,0:06:25,82.61,77.3,65.52,80.45,Out of state,1.0,,,7.0,3.0


In [39]:
grade_non_null = grade.loc[:, ["English", "Math"]].dropna()

In [40]:
grade_non_null.shape

(398, 2)

In [49]:
grade_non_null[["English", "Math"]].corr()

Unnamed: 0,English,Math
English,1.0,0.243014
Math,0.243014,1.0


In [51]:
stats.ttest_rel(grade_non_null["English"], grade_non_null["Math"])

Ttest_relResult(statistic=36.312568981719856, pvalue=3.0710987192210606e-128)

In [52]:
grade_non_null.mean()

English    82.744095
Math       65.446834
dtype: float64

In [53]:
grade_non_null["English"].mean() - grade_non_null["Math"].mean()

17.297261306532633

-English and Math scores were weakly and positively correlated (r = 0.243, p < 0.001).

-There was a significant average difference between English and Math scores (t = 36.313, p < 0.001).

-On average, English scores were 17.30 points higher than Math scores