# Final Project for DSC 530


http://thinkstats2.com

Copyright 2016 Allen B. Downey

MIT License: https://opensource.org/licenses/MIT

The dataset contains people's heights along with other attributes that may be related to height (earnings, sex, education, and age).

In [None]:
from __future__ import print_function, division

%matplotlib inline

import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

import numpy as np
import pandas as pd
import seaborn as sns
import random
import statistics
import thinkstats2
import thinkplot

In [None]:
# Read the dataset from the working directory and preview its first five rows.
df = pd.read_csv("heights.csv")
df.head(5)

In [None]:
# Box plot of the 'earn' column.
sns.boxplot(x=df.earn)

In [None]:
# Box plot of the 'sex' column.
sns.boxplot(x=df.sex)

In [None]:
# Box plot of the 'ed' column.
sns.boxplot(x=df.ed)

In [None]:
# Box plot of the 'age' column.
sns.boxplot(x=df.age)

In [None]:
# Box plot of the 'height' column.
sns.boxplot(x=df.height)


In [None]:
# Histogram of earnings.  thinkstats2.Hist tallies how often each value occurs
# on its own, so it must be given the raw column: feeding it value_counts()
# would instead tally how often each *count* occurs, which is not the
# distribution of 'earn'.
hist = thinkstats2.Hist(df.earn, label='earn')
thinkplot.Hist(hist)
thinkplot.Config(xlabel='Earn', ylabel='Count')

In [None]:
# Round heights down to whole units so nearby values share a bar, then plot
# how many rows fall on each rounded height.
heights = np.floor(df['height'])
hist = thinkstats2.Hist(heights, label='height')
thinkplot.Hist(hist)
thinkplot.Config(
    xlabel='Height',
    ylabel='Count',
)

In [None]:
# Histogram of the 'sex' column.  thinkstats2.Hist does the frequency counting
# itself, so pass the raw column; passing value_counts() would histogram the
# counts rather than the values.
hist = thinkstats2.Hist(df.sex, label='sex')
thinkplot.Hist(hist)
thinkplot.Config(xlabel='sex', ylabel='Count')

In [None]:
# Histogram of the 'ed' column.  thinkstats2.Hist does the frequency counting
# itself, so pass the raw column; passing value_counts() would histogram the
# counts rather than the values.
hist = thinkstats2.Hist(df.ed, label='ed')
thinkplot.Hist(hist)
thinkplot.Config(xlabel='Education', ylabel='Count')

In [None]:
# Histogram of the 'age' column.  thinkstats2.Hist does the frequency counting
# itself, so pass the raw column; passing value_counts() would histogram the
# counts rather than the values.
hist = thinkstats2.Hist(df.age, label='age')
thinkplot.Hist(hist)
thinkplot.Config(xlabel='Age', ylabel='Count')

In [None]:
# Central-tendency summary of the 'height' column.
print("Mean: ", statistics.mean(df['height']))
print("Median: ", statistics.median(df['height']))
print("Mode: ", statistics.mode(df['height']))

In [None]:
# Central-tendency summary of the 'earn' column.
print("Mean: ", statistics.mean(df['earn']))
print("Median: ", statistics.median(df['earn']))
print("Mode: ", statistics.mode(df['earn']))

In [None]:
# Central-tendency summary of the 'ed' column.
print("Mean: ", statistics.mean(df['ed']))
print("Median: ", statistics.median(df['ed']))
print("Mode: ", statistics.mode(df['ed']))

In [None]:
# Central-tendency summary of the 'age' column.
print("Mean: ", statistics.mean(df['age']))
print("Median: ", statistics.median(df['age']))
print("Mode: ", statistics.mode(df['age']))

In [None]:
# Central-tendency summary of the 'sex' column.
# NOTE(review): mean/median only make sense if 'sex' is numerically coded --
# confirm against the CSV.
print("Mean: ", statistics.mean(df['sex']))
print("Median: ", statistics.median(df['sex']))
print("Mode: ", statistics.mode(df['sex']))

In [None]:
# Box plot of the 'ed' column (same plot as drawn earlier in the notebook).
sns.boxplot(x=df.ed)


In [None]:
# Box plot of the 'age' column (same plot as drawn earlier in the notebook).
sns.boxplot(x=df.age)

In [None]:
# Cumulative distribution of the 'height' column.
cdf = thinkstats2.Cdf(df['height'], label='height')
thinkplot.Cdf(cdf)
thinkplot.Config(
    xlabel='Height',
    ylabel='CDF',
    loc='upper left',
)

In [None]:
# Probability mass function of the observed ages.
actual_pmf = thinkstats2.Pmf(df['age'], label='actual')
thinkplot.Pmf(actual_pmf)
thinkplot.Config(
    xlabel='Age',
    ylabel='PMF',
)
def BiasPmf(pmf, label):
    """Return a relabelled copy of `pmf` with each entry scaled by its own value.

    pmf: object providing Copy(label=...), Items(), Mult() and Normalize()
         (a thinkstats2.Pmf in this notebook)
    label: label given to the returned copy

    The input `pmf` is only read; all changes are made on the copy, which is
    renormalized before being returned.
    """
    biased = pmf.Copy(label=label)

    # Iterate over the original so the copy can be modified safely.
    for value, _ in pmf.Items():
        biased.Mult(value, value)

    biased.Normalize()
    return biased
# Build the biased version of the age PMF and draw it alongside the original.
biased_pmf = BiasPmf(actual_pmf, label='biased')
thinkplot.PrePlot(2)
pmfs_to_plot = [actual_pmf, biased_pmf]
thinkplot.Pmfs(pmfs_to_plot)
thinkplot.Config(
    xlabel='Age',
    ylabel='PMF',
)

In [None]:
# Scatter plot of height against sex.
height, sex = df.height, df.sex
# Plot the `height` column bound on the line above; the original referenced
# `heights`, a name this cell never defines, so the result depended on
# whatever another cell had last stored under that name.
thinkplot.Scatter(sex, height, alpha=1)
thinkplot.Config(xlabel='Sex',
                 ylabel='Height',
                 legend=False)

In [None]:
# Scatter plot of height (y axis) against age (x axis).
heights = df['height']
ages = df['age']
thinkplot.Scatter(ages, heights, alpha=1)
thinkplot.Config(
    xlabel='Age',
    ylabel='Height',
    legend=False,
)

In [None]:
# Bind the columns used by the correlation analysis in this cell.
heights = df['height']
ages = df['age']
height = df['height']
sex = df['sex']
def Cov(xs, ys, meanx=None, meany=None):
    """Compute the covariance of two equal-length sequences.

    xs, ys: sequences of numbers (converted with np.asarray)
    meanx, meany: optional precomputed means; each is computed from the data
                  when left as None

    returns: sum of the products of deviations divided by len(xs)
             (i.e. normalized by n, not n - 1)
    """
    x_arr, y_arr = np.asarray(xs), np.asarray(ys)

    mu_x = np.mean(x_arr) if meanx is None else meanx
    mu_y = np.mean(y_arr) if meany is None else meany

    return np.dot(x_arr - mu_x, y_arr - mu_y) / len(x_arr)
def Corr(xs, ys):
    """Compute the correlation coefficient of two sequences.

    xs, ys: sequences of numbers (converted with np.asarray)

    returns: Cov(xs, ys) divided by the square root of the product of the
             two variances reported by thinkstats2.MeanVar
    """
    x_arr = np.asarray(xs)
    y_arr = np.asarray(ys)

    mu_x, var_x = thinkstats2.MeanVar(x_arr)
    mu_y, var_y = thinkstats2.MeanVar(y_arr)

    # Reuse the means from MeanVar so Cov does not recompute them.
    covariance = Cov(x_arr, y_arr, mu_x, mu_y)
    return covariance / np.sqrt(var_x * var_y)
def SpearmanCorr(xs, ys):
    """Compute the rank correlation of two sequences.

    xs, ys: sequences of values

    returns: Corr applied to the pandas ranks of xs and ys
    """
    rank_x, rank_y = (pd.Series(values).rank() for values in (xs, ys))
    return Corr(rank_x, rank_y)
# Report both correlation measures between age and height.
age_height_corr = Corr(ages, heights)
print("Cor", age_height_corr)
age_height_rank_corr = SpearmanCorr(ages, heights)
print("SpearmanCor", age_height_rank_corr)


In [None]:
class DiffMeansPermute(thinkstats2.HypothesisTest):
    """Permutation test whose statistic is the absolute difference in means.

    `self.data` is expected to be a pair of NumPy arrays (both `.mean()` and
    `len()` are used on each element).
    """

    def TestStatistic(self, data):
        """Return |mean(first) - mean(second)| for a pair of groups."""
        first, second = data
        return abs(first.mean() - second.mean())

    def MakeModel(self):
        """Record both group sizes and pool the two groups into one array."""
        first, second = self.data
        self.n = len(first)
        self.m = len(second)
        self.pool = np.hstack((first, second))

    def RunModel(self):
        """Shuffle the pool in place and split it at the first group's size."""
        np.random.shuffle(self.pool)
        return self.pool[:self.n], self.pool[self.n:]

# Run the permutation test and plot the CDF of the simulated test statistics.
# NOTE(review): the two groups are the 'age' and 'height' columns, which
# measure different quantities; a difference-in-means test is normally run on
# one variable split into two groups -- confirm this is the intended comparison.
data = df.age.values, df.height.values
ht = DiffMeansPermute(data)
pvalue = ht.PValue()
# A bare expression is only echoed when it is the last statement in a cell,
# so the p-value has to be printed explicitly to be seen.
print("p-value:", pvalue)
ht.PlotCdf()
thinkplot.Config(xlabel='test statistic',
                 ylabel='CDF')

In [None]:
import statsmodels.formula.api as smf

# Ordinary least squares fit with 'age' as the response and 'height' as the
# single explanatory variable; the summary table is the cell's output.
formula = 'age ~ height'
model = smf.ols(formula=formula, data=df)
results = model.fit()
results.summary()