# In [18]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Pin the global NumPy RNG so every run reproduces the same synthetic sample.
# NOTE: the order of the random draws below determines the generated values;
# do not reorder them.
np.random.seed(42)

# Number of simulated survey respondents
n_respondents = 1000

# Overall satisfaction: continuous draw from the 1-7 range
satisfaction = np.random.uniform(1, 7, n_respondents)

# Sequential observation IDs: 1, 2, ..., n_respondents
obs_id = np.arange(n_respondents) + 1

# Gender flag (1 = female, 0 = male), each respondent drawn with p = 0.5
gender = np.random.binomial(n=1, p=0.5, size=n_respondents)

# Integer age from 18 to 70 inclusive (randint's upper bound is exclusive)
age = np.random.randint(low=18, high=71, size=n_respondents)

# Involvement level on an integer 1-5 scale
involvement = np.random.randint(low=1, high=6, size=n_respondents)

# Purchase indicator: 1 with probability 0.7, otherwise 0
actual_purchase = np.random.binomial(n=1, p=0.7, size=n_respondents)

# Purchase amount: a $0-$2000 draw for everyone, zeroed out for non-purchasers
amount_purchase = np.random.uniform(0, 2000, n_respondents) * actual_purchase

# Education level on an integer 1-4 scale
education_level = np.random.randint(low=1, high=5, size=n_respondents)

# Income: one of five bracket labels, each equally likely
income_brackets = ['0-25k', '25k-50k', '50k-75k', '75k-100k', '100k+']
income = np.random.choice(income_brackets, size=n_respondents)

# Gather the demographic / behavioural columns into a single DataFrame
additional_demo_df = pd.DataFrame(
    dict(
        Obs=obs_id,
        Female=gender,
        Age=age,
        Involve=involvement,
        Actual_Purchase=actual_purchase,
        Amount_Purchase=amount_purchase,
        Education_Level=education_level,
        Income_Bracket=income,
    )
)

# Function to generate dependent variables with targeted relationships
def generate_dependent_variable(satisfaction, coefficient, noise_std, scale=(1, 7)):
    """Return scores linearly driven by *satisfaction* plus Gaussian noise.

    Each score is ``satisfaction * coefficient + noise`` where the noise is
    drawn from the global NumPy RNG as N(0, noise_std), and the result is
    clipped to the closed interval ``[scale[0], scale[1]]``.

    Parameters
    ----------
    satisfaction : array-like
        Driver values; the noise is drawn with the same shape.
    coefficient : float
        Multiplier applied to ``satisfaction``.
    noise_std : float
        Standard deviation of the additive Gaussian noise.
    scale : tuple of (low, high), optional
        Lower and upper clipping bounds. Defaults to ``(1, 7)``.

    Returns
    -------
    Clipped scores with the same shape as ``satisfaction``.
    """
    # Size the noise from the input itself rather than a module-level
    # respondent count, so the function works for any input length.
    noise = np.random.normal(0, noise_std, size=np.shape(satisfaction))
    variable_scores = satisfaction * coefficient + noise
    return np.clip(variable_scores, scale[0], scale[1])

# Loyalty outcomes driven by Satisfaction: Repurchase gets the larger
# coefficient and the smaller noise, Recommend the weaker link.
# NOTE: the order of the calls below fixes the order of RNG draws.
recommend = generate_dependent_variable(satisfaction, coefficient=0.7, noise_std=0.8)
repurchase = generate_dependent_variable(satisfaction, coefficient=1.1, noise_std=0.5)

# Return / Staff / Contact: smaller coefficients (0.3-0.4) with noise sd 1.0
return_scores = generate_dependent_variable(satisfaction, coefficient=0.3, noise_std=1.0)
staff_scores = generate_dependent_variable(satisfaction, coefficient=0.4, noise_std=1.0)
contact_scores = generate_dependent_variable(satisfaction, coefficient=0.4, noise_std=1.0)

# Price / Store / Checkout / Merchandise: drawn uniformly from the 1-7 range,
# with no dependence on Satisfaction
price_scores = np.random.uniform(low=1, high=7, size=n_respondents)
store_scores = np.random.uniform(low=1, high=7, size=n_respondents)
checkout_scores = np.random.uniform(low=1, high=7, size=n_respondents)
merchandise_scores = np.random.uniform(low=1, high=7, size=n_respondents)

# Assemble the main score table, one column per construct
df = pd.DataFrame(
    np.column_stack([
        satisfaction,
        recommend,
        repurchase,
        return_scores,
        staff_scores,
        contact_scores,
        price_scores,
        store_scores,
        checkout_scores,
        merchandise_scores,
    ]),
    columns=[
        'Satisfaction',
        'Recommend',
        'Repurchase',
        'Return',
        'Staff',
        'Contact',
        'Price',
        'Store',
        'Checkout',
        'Merchandise',
    ],
)


def generate_questions_improved(main_variable, num_questions=6, target_qs=(2, 3, 5), seed=42):
    """Generate noisy survey-question scores derived from one main variable.

    Each question ``Q1..Q{num_questions}`` is ``0.5 * main_variable`` plus
    Gaussian noise, clipped to the 1-7 range. Questions whose number is in
    ``target_qs`` get a smaller noise standard deviation (0.5 instead of 1),
    so they track the main variable more closely.

    Parameters
    ----------
    main_variable : array-like
        Scores of the main variable; flattened to 1-D before use.
    num_questions : int, optional
        Number of question columns to generate. Defaults to 6.
    target_qs : iterable of int, optional
        Question numbers that receive the lower noise level.
    seed : int or None, optional
        Seed for the private random generator. The default (42) reproduces
        the values obtained by calling ``np.random.seed(42)`` before drawing.

    Returns
    -------
    pandas.DataFrame
        One column per question (``'Q1'``, ``'Q2'``, ...), one row per element
        of ``main_variable``.
    """
    # A private RandomState yields the same legacy stream as reseeding the
    # global generator, but leaves the caller's global RNG state untouched.
    rng = np.random.RandomState(seed)

    main_variable_flat = np.asarray(main_variable).ravel()
    # Size the noise from the input itself, not from a module-level constant.
    n_rows = main_variable_flat.shape[0]

    questions = {}
    for q in range(1, num_questions + 1):
        # Second argument of normal() is a standard deviation, not a variance.
        noise_std = 0.5 if q in target_qs else 1
        noise = rng.normal(0, noise_std, size=n_rows)

        # Simplified linear relationship plus noise, kept on the 1-7 scale
        q_scores = np.clip(0.5 * main_variable_flat + noise, 1, 7)
        questions[f'Q{q}'] = q_scores

    return pd.DataFrame(questions)


# Expand every main variable into six question columns.
# 'Return', 'Staff' and 'Contact' are expected to have been produced by
# `generate_dependent_variable` before this point.
main_variables = ['Return', 'Staff', 'Contact', 'Price', 'Store', 'Checkout', 'Merchandise']

# Build one prefixed question frame per variable, then attach them all at once
question_blocks = []
for var in main_variables:
    qs_df = generate_questions_improved(
        df[[var]].to_numpy(),  # column vector of this variable's scores
        num_questions=6,
        target_qs=[2, 3, 5],
    )
    question_blocks.append(qs_df.add_prefix(f'{var}_'))
df = pd.concat([df, *question_blocks], axis=1)

# Side-by-side join of the score/question table with the demographics table
full_df = pd.concat([df, additional_demo_df], axis=1)

# Destination workbook (adjust to your local file system when running locally)
file_path = "C://Users//Jamie Humphries//OneDrive - ionscout.com//biased_Datasurvey_data2.xlsx"
try:
    full_df.to_excel(file_path, index=False)
    # Report success only after the write has completed
    print("The full dataset file has been created successfully.")
except Exception as e:
    # Best-effort export: report the failure instead of aborting the run
    print("An error occurred:", e)

# Output: The full dataset file has been created successfully.
