In [11]:
# Importing
import pandas as pd
import numpy as np
from scipy.stats import t

# Read the two speed-test samples (one CSV per platform) into DataFrames.
androidData_df = pd.read_csv('android_data_sample.csv')
iphoneData_df = pd.read_csv('iphone_data_sample.csv')

# Preview the first rows of each frame to see which columns are available.
# Each print emits a heading line followed by the frame's text representation.
print("Android Data:", androidData_df.head(), sep="\n")
print("\niPhone Data:", iphoneData_df.head(), sep="\n")

Android Data:
      test_id  android_device_id  \
0  4123331744          164588951   
1  4123331748          181276407   
2  4123331750          152528883   
3  4123331753          217323104   
4  4123331754          170748723   

                                 android_fingerprint            test_date  \
0  samsung/heroltexx/herolte:8.0.0/R16NW/G930FXXS...  2018-08-01 00:00:00   
1  hardkernel/cm_odroidxu3/odroidxu3:6.0.1/MOB31K...  2018-08-01 00:00:00   
2  motorola/athene_f/athene_f:7.0/NPJS25.93-14-15...  2018-08-01 00:00:00   
3  xiaomi/ugg/ugg:7.1.2/N2G47H/V9.5.6.0.NDKCNFA:u...  2018-08-01 00:00:00   
4  vivo/1610/1610:6.0.1/MMB29M/compiler06071715:u...  2018-08-01 00:00:00   

  client_ip_address  download_kbps  upload_kbps  latency server_name  \
0     37.47.xxx.xxx          37133        21762       20      Warsaw   
1   188.152.xxx.xxx           5605          802       58        Rome   
2    42.111.xxx.xxx           4380         1178       72     Chennai   
3    112.97.xxx.xx

In [23]:
#Question 1
# 90% confidence interval for the difference in mean download speed
# (Android minus iPhone) using the pooled-variance two-sample t interval.
# The pooled form assumes both groups share a common variance.

# values for android
# Missing speeds are dropped up front: pandas' mean()/std() skip NaN by default
# while len() counts every row, so without this the sample size used below
# could disagree with the statistics it is combined with.
andSpd = androidData_df['download_kbps'].dropna()
andMean = andSpd.mean()
andSTD = andSpd.std()  # pandas default is the sample std (ddof=1)
andLen = len(andSpd)

# values for iphone
iphoSpd = iphoneData_df['download_kbps'].dropna()
iphoMean = iphoSpd.mean()
iphoSTD = iphoSpd.std()
iphoLen = len(iphoSpd)

# degrees of freedom shared by the pooled variance and the t critical value
pooledDF = andLen + iphoLen - 2

# pooled standard deviation: df-weighted average of the two sample variances
pooledSTD = np.sqrt(((andLen - 1) * andSTD**2 + (iphoLen - 1) * iphoSTD**2) / pooledDF)

# standard error of the difference in means
pooledStdE = pooledSTD*np.sqrt(1/andLen+1/iphoLen)

# margin of error at 90% confidence: two-sided, so 5% in each tail -> 0.95 quantile
t90 = t.ppf(0.95, df=pooledDF)
margE = pooledStdE*t90

# pooled confidence interval as (lower, upper)
confidIntP = (andMean - iphoMean - margE, andMean - iphoMean + margE)

print(f"pooled confidence interval at 90% is: {confidIntP[0]:.0f} to {confidIntP[1]:.0f} kbps")

pooled confidence interval at 90% is: -2464 to 3834 kbps


In [27]:
#Question 2
# 99% confidence interval for the difference in mean iPhone download speed
# (United States minus India) using the non-pooled (Welch) t interval,
# which does not assume equal variances.

# Filtering download speeds for US and India.
# .loc selects rows and column in one step; missing speeds are dropped because
# pandas' mean()/std() skip NaN by default while len() would still count them.
usSpd = iphoneData_df.loc[iphoneData_df['server_country'] == 'United States', 'download_kbps'].dropna()
indSpd = iphoneData_df.loc[iphoneData_df['server_country'] == 'India', 'download_kbps'].dropna()

# Values for US
usMean = usSpd.mean()
usSTD = usSpd.std()
usLen = len(usSpd)
# Values for India
indMean = indSpd.mean()
indSTD = indSpd.std()
indLen = len(indSpd)

# A sample standard deviation needs at least two observations; fail with a
# clear message instead of letting NaN / division by zero surface further down.
if usLen < 2 or indLen < 2:
    raise ValueError(
        "Need at least two download_kbps observations for both 'United States' "
        f"and 'India' (got {usLen} and {indLen})"
    )

# per-group variance of the sample mean (s^2 / n)
usVarOfMean = usSTD**2 / usLen
indVarOfMean = indSTD**2 / indLen

# non-pooled standard error (NP = non pooled)
StdE_NP = np.sqrt(usVarOfMean + indVarOfMean)

# degrees of freedom (Welch-Satterthwaite approximation)
df_NP = (usVarOfMean + indVarOfMean)**2 / (usVarOfMean**2/(usLen-1) + indVarOfMean**2/(indLen-1))

# t-value for 99% confidence (two-sided -> 0.995 quantile).
# The fractional df is floored with int(), the conservative t-table convention;
# it yields an interval that is never narrower than the fractional-df one.
t99 = t.ppf(0.995, df=int(df_NP))

# margin of error
margE_NP = StdE_NP*t99

# confidence interval as (lower, upper)
confidInt_NP = (usMean - indMean - margE_NP, usMean - indMean + margE_NP)

print(f"Non-pooled confidence interval at 99% is: {confidInt_NP[0]:.0f} to {confidInt_NP[1]:.0f} kbps")
# "Yes" is reported when the whole interval lies above zero, i.e. the US mean is
# higher at the 99% level; how much faster is read from the interval bounds.
print(f"Are download speeds much faster in US compared to India? {'Yes' if confidInt_NP[0] > 0 else 'No'}")


Non-pooled confidence interval at 99% is: 13233 to 35128 kbps
Are download speeds much faster in US compared to India? Yes


In [37]:
#question 3
# Paired salary observations taken from the slides: element i of each array
# belongs to the same pair. The printed result labels the values as $k.
privSalary = np.array([
    125.7, 85.3, 130.8, 119.5, 85, 137.8, 75.9, 75.8, 40.3, 90.8,
    66.8, 65.4, 141.2, 89, 94.6, 129, 114.3, 70, 104.6, 114.8,
    59.6, 134.2, 115.7, 90.3, 116.9, 79.5, 118.3, 99.2, 82.2, 92.9,
])
pubSalary = np.array([
    108.7, 69.3, 113.5, 100, 75.3, 135.9, 66.5, 64, 35.4, 73.1,
    51.6, 44.2, 116.9, 85.5, 86, 113.8, 97.6, 55.7, 85.4, 86.4,
    52.4, 112.3, 100, 66.4, 107.3, 67.6, 98.4, 78.3, 70.5, 76.6,
])

# A paired analysis works on the per-pair gap (private minus public),
# treating the gaps as a single sample.
diff = privSalary - pubSalary

# Summary statistics of the gaps; ddof=1 gives the sample standard deviation.
diffLen = diff.size
diffMean = diff.mean()
diffSTD = diff.std(ddof=1)

# Standard error of the mean gap.
diffStdE = diffSTD / np.sqrt(diffLen)

# Two-sided 98% interval leaves 1% in each tail, hence the 0.99 quantile
# of the t distribution with n - 1 degrees of freedom.
tcrit98 = t.ppf(0.99, df=diffLen - 1)

# Margin of error, then the interval bounds around the mean gap.
margE = diffStdE * tcrit98
confidIntLower, confidIntUpper = diffMean - margE, diffMean + margE

print(f"98% Confidence Interval of the paired salaries is: ${confidIntLower:.1f}k to ${confidIntUpper:.1f}k")



98% Confidence Interval of the paired salaries is: $12.2k to $17.9k
