# **Lab 2: Inferential Statistics in Python**
# RPAD 676: Data Science for the Public Good

## Author:
## Date:

In [None]:
# load libraries
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

In [None]:
# Load the dataset
#
# Read the lab's CSV file into the `data` DataFrame used by every later cell.
# The path is relative, so it is resolved against the current working directory.
data = pd.read_csv(filepath_or_buffer='digital_services_data.csv')

In [None]:
# Display the first few rows of the dataframe
# (head() returns the first five rows by default)
print(data.head())

### Visualize Data

In [None]:
# Bar Chart
#
# Average the continuous variable within each category and order the
# categories from the smallest mean to the largest before plotting.
mean_by_category = (
    data.groupby('categoricalvariable')['continuousvariable']
    .mean()
    .sort_values()
)
mean_by_category.plot.bar(color='skyblue', figsize=(8, 6))

# Axis labels, title, and slanted tick labels so category names stay readable.
plt.xlabel("categoricalvariable")
plt.ylabel("Continuous Variable ")
plt.title("Title of Bar Chart")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Pie Chart
#
# Count how many rows fall in each category, then draw each category's
# share of the total with a one-decimal percentage label.
categorical_counts = data['categoricalvariable'].value_counts()
categorical_counts.plot.pie(
    autopct='%1.1f%%',
    colors=["lightblue", "lightgreen", "coral"],
    figsize=(6, 6),
)
plt.title("Distribution of Categorical Variable")
plt.ylabel("")  # blank out the y-axis label on the pie
plt.show()

In [None]:
# Box plot
#
# One box per category, showing the distribution of the continuous variable.
sns.boxplot(data=data, x='categoricalvariable', y='continuousvariable')
plt.title("Title of Box Plot")
plt.show()

## Calculating Descriptive Statistics

In [None]:
# Descriptive Statistics
#
# Print a heading followed by the summary table that DataFrame.describe()
# produces for the dataset.
print("Descriptive Statistics for Numeric Variables:", data.describe(), sep="\n")

In [None]:
# Frequency Distributions for Categorical Variables
#
# Treat every object-dtype column as categorical and print how often each
# distinct value occurs, one column at a time.
print("\nFrequency Distribution for Categorical Variables:")
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    # Heading line for the column, then its value counts on the following lines.
    print(f"\n{col}:", data[col].value_counts(), sep="\n")

In [None]:
# Calculate the mean of a continuous variable
#
# The column name is kept in a variable so the same cell can be reused for
# any numeric column; the mean is rounded to two decimals for reporting.
continuous_variable = 'continuousvariable'  # Change variable name as needed
mean_value = np.round(data[continuous_variable].mean(), 2)
print(f"\nMean of {continuous_variable}: {mean_value}")

In [None]:
# Calculate the means of continuous variables across groups
grouping_variable = 'categoricalvariable'  # Change to appropriate categorical variable

# Collect the numeric columns to average. The grouping column itself is
# excluded so that a numerically coded grouping variable (e.g. 0/1) is not
# averaged within its own groups.
numeric_cols = data.select_dtypes(include='number').columns.drop(grouping_variable, errors='ignore')

# Mean of every numeric column within each group, rounded to two decimals.
grouped_means = data.groupby(grouping_variable)[numeric_cols].mean().round(2)
print(f"\nMean values of continuous variables grouped by {grouping_variable}:")
print(grouped_means)

## Hypothesis Testing

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multicomp import pairwise_tukeyhsd

#### T-Tests - Comparing Means Across Two Groups

In [None]:
# Calculate the mean of a specific continuous variable across one categorical variable
#
# Both column names are kept in variables so they can be changed in one place.
outcome_variable = 'continuousvariable'  # Change to the variable of interest
categorical_variable = 'categoricalvariable'  # Change to the desired categorical variable

# Group mean of the outcome for each category, rounded to two decimals.
specific_grouped_means = (
    data.groupby(categorical_variable)[outcome_variable]
    .mean()
    .round(2)
)

print(
    f"\nMean of {outcome_variable} grouped by {categorical_variable}:",
    specific_grouped_means,
    sep="\n",
)

In [None]:
# T-Test: Comparing means across two groups
#
# Independent-samples t-test of the continuous variable between the rows
# coded 1 and the rows coded 0 on the categorical variable.

# Split the outcome into the two groups. Missing values are dropped first:
# with ttest_ind's default NaN handling, a single NaN in either group makes
# both the t-statistic and the p-value NaN.
group_one = data.loc[data['categoricalvariable'] == 1, 'continuousvariable'].dropna()
group_zero = data.loc[data['categoricalvariable'] == 0, 'continuousvariable'].dropna()

t_stat, p_val = stats.ttest_ind(group_one, group_zero)
print(f"T-test: Effect of Disability on Time Taken (in Seconds), t-statistic={t_stat:.4f}, p-value={p_val:.4f}")


#### ANOVA - Comparing Means Across Three or More Groups

In [None]:
# Means across more than two groups
#
# Same summary as for the two-group case, but intended for a categorical
# variable with three or more levels.
outcome_variable = 'continuousvariable'  # Change to the variable of interest
categorical_variable = 'categoricalvariable'  # Change to the desired categorical variable

# Group mean of the outcome for each category, rounded to two decimals.
specific_grouped_means = (
    data.groupby(categorical_variable)[outcome_variable]
    .mean()
    .round(2)
)
print(
    f"\nMean of {outcome_variable} grouped by {categorical_variable}:",
    specific_grouped_means,
    sep="\n",
)

In [None]:
# ANOVA: Comparing time taken across agency types
#
# Fit an OLS model with the grouping variable wrapped in C(...) so it is
# treated as categorical, then build the ANOVA table (typ=2) from that fit.
aov = smf.ols(
    formula=f'{outcome_variable} ~ C({categorical_variable})',
    data=data,
).fit()
anova_table = sm.stats.anova_lm(aov, typ=2)

print("\nANOVA Results:", anova_table, sep="\n")

In [None]:
# Conduct post hoc test (Tukey HSD) if ANOVA is significant
alpha = 0.05

# The ANOVA table is indexed by term labels, so the first row must be read by
# position with .iloc. Using [0] on a label-indexed Series relies on a
# deprecated positional fallback that newer pandas versions reject.
anova_p_value = anova_table['PR(>F)'].iloc[0]

if anova_p_value < alpha:
    # Pairwise comparison of every pair of groups at the same alpha level.
    print("\nANOVA is significant. Conducting post hoc Tukey HSD test:")
    tukey_results = pairwise_tukeyhsd(endog=data[outcome_variable], groups=data[categorical_variable], alpha=alpha)
    print(tukey_results)
else:
    print("\nANOVA is not significant. No post hoc test needed.")


## Regression Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Simple Linear Regression: Predicting outcome with one predictor
#
# y is the outcome column; X is a one-column predictor frame with a constant
# column added so the fitted model includes an intercept term.
y = data['dependentvariable']
X = sm.add_constant(data[['independentvariable']])
model = sm.OLS(endog=y, exog=X).fit()

In [None]:
# Display Regression Results in a Nice Table
#
# Print a heading followed by the fitted model's summary report.
print("\nSimple Linear Regression Results:", model.summary(), sep="\n")

In [None]:
# Visualizing Regression Results
#
# Scatter plot of the predictor against the outcome with semi-transparent
# points and the fitted regression line drawn in red.
plt.figure(figsize=(8, 6))
sns.regplot(
    data=data,
    x='independentvariable',
    y='dependentvariable',
    scatter_kws={'alpha': 0.5},
    line_kws={'color': 'red'},
)
plt.xlabel("X Label")
plt.ylabel("Y Label")
plt.title("Plot Title")
plt.show()

#### Multiple Regression

In [None]:
# Multiple Linear Regression: Predicting outcome using multiple predictors
#
# Each predictor must be a different column. Listing the same column twice
# produces a perfectly collinear design matrix, so the individual
# coefficients cannot be interpreted.
predictor_variables = ['independentvariable1', 'independentvariable2']  # Change to the predictors of interest
X = data[predictor_variables]
y = data['dependentvariable']
X = sm.add_constant(X)  # Adds an intercept term
model = sm.OLS(y, X).fit()

In [None]:
# Display Regression Results in a Nice Table
#
# Print a heading followed by the fitted model's summary report.
print("\nMultiple Linear Regression Results:", model.summary(), sep="\n")