# A/B Test: San Francisco vs. New York City
This notebook tests whether the average sales amount per record (`Quantity Ordered` × `Price Each`) differs significantly between San Francisco and New York City, using Welch's two-sample t-test (which does not assume equal variances).

In [None]:
import pandas as pd
from scipy import stats

## Load and Clean Data

In [None]:
# Load dataset. The raw frame is kept under its own name so this cell can be
# re-run without depending on an already-mutated `all_data`.
raw_orders = pd.read_csv("all_data.csv")

# Clean data
# 1. Drop rows in which every column is empty.
non_empty_orders = raw_orders.dropna(how='all')

# 2. Remove bad headers: rows whose 'Order Date' text starts with "Or".
#    A missing 'Order Date' compares as False here, so such rows are kept,
#    exactly as with the original `!= 'Or'` filter.
is_header_row = non_empty_orders['Order Date'].str[0:2] == 'Or'

# 3. Take an explicit copy so the column assignments below write to an
#    independent frame instead of a slice (avoids SettingWithCopyWarning).
all_data = non_empty_orders.loc[~is_header_row].copy()

# Convert the numeric columns; pd.to_numeric raises on unparseable values,
# which surfaces dirty data instead of hiding it.
all_data['Quantity Ordered'] = pd.to_numeric(all_data['Quantity Ordered'])
all_data['Price Each'] = pd.to_numeric(all_data['Price Each'])

# Sales amount per row = quantity * unit price.
all_data['Sales'] = all_data['Quantity Ordered'] * all_data['Price Each']

## Extract City Information

In [None]:
def get_city(address):
    """Build a "City (ST)" label from a comma-separated address string.

    The city is the second comma-separated field (whitespace-stripped) and
    the state is the first whitespace-separated token of the third field,
    e.g. "917 1st St, Dallas, TX 75001" -> "Dallas (TX)".

    Args:
        address: Address text. Non-string values (such as NaN or None) are
            tolerated.

    Returns:
        The "City (ST)" label, or None when `address` is not a string or
        does not contain the expected fields.
    """
    try:
        parts = address.split(',')
        city = parts[1].strip()
        state = parts[2].split()[0]
    except (AttributeError, IndexError):
        # AttributeError: value has no .split (NaN / None / number).
        # IndexError: fewer than three fields, or an empty third field.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return None
    return f"{city} ({state})"

# Derive a "City (ST)" label for every row from its purchase address.
all_data['City'] = all_data['Purchase Address'].map(get_city)

## A/B Test

In [None]:
# Significance threshold for the two-sided test.
ALPHA = 0.05

# Create groups: the per-row sales amounts for each city. Missing values are
# dropped because ttest_ind propagates NaN by default, which would turn the
# statistic and p-value into NaN.
group_a = all_data.loc[all_data['City'] == 'San Francisco (CA)', 'Sales'].dropna()
group_b = all_data.loc[all_data['City'] == 'New York City (NY)', 'Sales'].dropna()

# A variance estimate needs at least two observations per group; fail loudly
# instead of reporting a meaningless result.
if len(group_a) < 2 or len(group_b) < 2:
    raise ValueError(
        "Need at least two sales records per city to run the t-test "
        f"(San Francisco: {len(group_a)}, New York City: {len(group_b)})."
    )

# Perform t-test (equal_var=False selects Welch's t-test, which does not
# assume the two cities have equal sales variance).
t_stat, p_val = stats.ttest_ind(group_a, group_b, equal_var=False)

# Output results
print("Average Sales - San Francisco:", round(group_a.mean(), 2))
print("Average Sales - New York City:", round(group_b.mean(), 2))
print("T-statistic:", round(t_stat, 3))
# Three significant digits, so very small p-values are not shown as 0.0.
print(f"P-value: {p_val:.3g}")

# Interpret. Significance only says the difference is unlikely to be chance;
# this comparison of two cities cannot attribute it to any particular cause.
if pd.isna(p_val):
    print("Test could not be computed (p-value is NaN); check the groups for zero variance.")
elif p_val < ALPHA:
    print("Result is statistically significant: the difference in average sales between the two cities is unlikely to be due to chance alone.")
else:
    print("Result is NOT statistically significant: observed difference may be due to chance.")