## Import of Needed Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

## Set the number of respondents and the random seed used for all generated values

In [None]:
# Sample size: how many simulated survey respondents to generate
n_respondents = 1000

# Fix NumPy's global random state so every run produces the same dataset
np.random.seed(seed=42)

## Set the demographic and other responses
#### All are randomly assigned, using either a biased (unequal-probability) or an unbiased (uniform) distribution

In [None]:
# Observation ID, which doubles as the respondent ID.
# Sequential starting from 1; the only column here that is not random.
obs_id = np.arange(1, n_respondents + 1)

# Overall Satisfaction: continuous score drawn uniformly between 1 and 7
satisfaction = np.random.uniform(1, 7, n_respondents)

# Gender flag (0 = male, 1 = female), roughly a 50/50 split.
# Binary variable: n is the number of trials per respondent, p the
# probability of a 1. Change p to skew the distribution.
gender = np.random.binomial(n=1, p=0.5, size=n_respondents)

# Age in whole years, 18 through 70 (the upper bound 71 is exclusive)
age = np.random.randint(low=18, high=71, size=n_respondents)

# Involvement level on an integer 1 to 5 scale
involvement = np.random.randint(low=1, high=6, size=n_respondents)

# Actual Purchase flag (1 = purchased), with a 70% probability of purchase.
# Binary variable: change p to alter the share of purchasers.
actual_purchase = np.random.binomial(n=1, p=0.7, size=n_respondents)

# Amount of Purchase: a uniform draw between $0 and $2000, multiplied by the
# purchase flag so that respondents who did not purchase get exactly 0
amount_purchase = np.random.uniform(low=0, high=2000, size=n_respondents) * actual_purchase

# Education Level, an ordinal 1 to 4 scale
# 1: High School, 2: Bachelors, 3: Masters, 4: PhD
# Change low/high to change the range (high is exclusive)
education_level = np.random.randint(low=1, high=5, size=n_respondents)

# Income, a categorical variable split into 5 brackets.
# Each respondent gets one label, every label being equally likely;
# edit the list to change the available brackets.
income_brackets = ['0-25k', '25k-50k', '50k-75k', '75k-100k', '100k+']
income = np.random.choice(income_brackets, size=n_respondents)

# Demographic DataFrame, kept separate from the survey scores to keep the
# code clean; it is joined onto the final dataset before saving.
# To change the demographics, edit only this section, and make sure every
# column listed here has a matching variable assignment above.
additional_demo_df = pd.DataFrame(
    data={
        'Obs': obs_id,
        'Female': gender,
        'Age': age,
        'Involve': involvement,
        'Actual_Purchase': actual_purchase,
        'Amount_Purchase': amount_purchase,
        'Education_Level': education_level,
        'Income_Bracket': income,
    }
)


## Generate the variables for the first-level responses
##### In this example, Recommend and Repurchase are influenced by Satisfaction
##### Return, Staff, Contact, Price, Store, Checkout and Merchandise are the variables that influence Satisfaction
##### Here you can adjust which variables influence the next level and assign biased variable data
##### to simulate significant parameter estimates.

In [None]:
# Function to generate dependent variables with targeted relationships
# This function will be used to generate dependent variables
# The function takes in the independent variable, a coefficient, and a noise level
# The function returns the dependent variable
# The function also clips the scores to the specified range

def generate_dependent_variable(satisfaction, coefficient, noise_std, scale=(1, 7)):
    """Return scores linearly driven by ``satisfaction`` plus Gaussian noise.

    Each score is ``satisfaction * coefficient + noise``, clipped to ``scale``.

    Args:
        satisfaction: Array-like of driver scores (any shape).
        coefficient: Multiplier applied to ``satisfaction``.
        noise_std: Standard deviation of the zero-mean normal noise.
        scale: ``(lowest, highest)`` bounds the result is clipped to.

    Returns:
        Scores with the same shape as ``satisfaction``.

    Note:
        Noise is drawn from NumPy's global random state, so results depend
        on the current seed and on how many draws were made before the call.
    """
    # Size the noise from the input itself instead of the notebook-level
    # `n_respondents`, so the function works for any sample size.
    noise = np.random.normal(0, noise_std, np.shape(satisfaction))
    # np.multiply (rather than `*`) also accepts plain Python sequences.
    variable_scores = np.multiply(satisfaction, coefficient) + noise
    return np.clip(variable_scores, scale[0], scale[1])

# Dependent variables: Recommend and Repurchase are driven by Satisfaction.
# The two numbers after `satisfaction` are the coefficient (strength of the
# relationship) and the noise level (standard deviation of the random noise).
# Adjust them to see the impact on the dependent variables, and rename the
# variables to suit your needs; just keep each change matched by a
# corresponding variable assignment and DataFrame column below.
recommend = generate_dependent_variable(satisfaction, coefficient=0.7, noise_std=0.8)
repurchase = generate_dependent_variable(satisfaction, coefficient=1.1, noise_std=0.5)

# Independent variables for Overall Satisfaction.
# Biased variables (Return, Staff, Contact) are generated from Satisfaction
# with a coefficient and a noise level, so they are related to it.
# Unbiased variables (Price, Store, Checkout, Merchandise) are plain uniform
# draws between 1 and 7 that do not use Satisfaction at all.
# Move a name from one group to the other, change the coefficients and noise
# levels, or add more variables to simulate datasets with different
# relationships between the independent and dependent variables.
return_scores = generate_dependent_variable(satisfaction, coefficient=0.3, noise_std=1.0)
staff_scores = generate_dependent_variable(satisfaction, coefficient=0.4, noise_std=1.0)
contact_scores = generate_dependent_variable(satisfaction, coefficient=0.4, noise_std=1.0)
price_scores = np.random.uniform(low=1, high=7, size=n_respondents)
store_scores = np.random.uniform(low=1, high=7, size=n_respondents)
checkout_scores = np.random.uniform(low=1, high=7, size=n_respondents)
merchandise_scores = np.random.uniform(low=1, high=7, size=n_respondents)

# Compile the survey dataset.
# Every column must point at the matching variable generated above.
df = pd.DataFrame(
    data={
        'Satisfaction': satisfaction,
        'Recommend': recommend,
        'Repurchase': repurchase,
        'Return': return_scores,
        'Staff': staff_scores,
        'Contact': contact_scores,
        'Price': price_scores,
        'Store': store_scores,
        'Checkout': checkout_scores,
        'Merchandise': merchandise_scores,
    }
)

## Function to create the third order of variables
#### These variables influence the section-specific satisfaction scores, which in turn influence Satisfaction

In [None]:
# This function generates the results of the individual survey questions that
# would be specific to each main variable (the variables that in turn influence satisfaction)
# The function takes in the main variable, the number of questions to generate, and target_qs
# target_qs is a list of the questions that are targeted to be biased (less noise) and have higher parameter estimates

def generate_questions_improved(main_variable, num_questions=6, target_qs=(2, 3, 5), seed=42):
    """Generate noisy question-level scores derived from one main variable.

    Every question score is ``0.5 * main_variable + noise``, clipped to the
    1-7 range. Questions listed in ``target_qs`` get less noise, so they
    track the main variable more closely.

    Args:
        main_variable: Array-like of main-variable scores; any shape, it is
            flattened to one value per respondent.
        num_questions: Number of question columns to create (``Q1``..``Qn``).
        target_qs: 1-based question numbers that receive the lower noise
            level (standard deviation 0.5 instead of 1).
        seed: Value used to reseed NumPy's global random state before
            drawing noise. With the default, every call restarts the same
            random sequence, so separate calls for different main variables
            reuse identical noise draws. Pass ``None`` to leave the random
            state untouched and get independent noise per call.

    Returns:
        A DataFrame with one row per respondent and columns ``Q1``..``Qn``.
    """
    if seed is not None:
        np.random.seed(seed)  # For reproducibility

    # np.ravel accepts lists as well as arrays and yields a flat vector.
    main_variable_flat = np.ravel(main_variable)

    questions = {}
    for q in range(1, num_questions + 1):
        # np.random.normal takes a standard deviation (not a variance);
        # targeted questions get the smaller spread.
        noise_std = 0.5 if q in target_qs else 1

        # Size the noise from the input rather than the notebook-level
        # `n_respondents`, so any number of respondents is supported.
        noise = np.random.normal(0, noise_std, size=main_variable_flat.shape)

        # Simplified linear relationship + noise, kept within 1 to 7
        questions[f'Q{q}'] = np.clip(0.5 * main_variable_flat + noise, 1, 7)

    return pd.DataFrame(questions)

# Main variables
# Assumes `generate_dependent_variable` has already been used to create 'Return', 'Staff' and 'Contact'

# Build the question-level results for every main variable and attach them to df.
# You can change the names and the number of main variables to suit your needs.
# Six question results are generated per main variable; target_qs lists which
# of them are biased, counting the questions left to right starting from 1.
main_variables = ['Return', 'Staff', 'Contact', 'Price', 'Store', 'Checkout', 'Merchandise']
question_frames = []
for var in main_variables:
    qs_df = generate_questions_improved(df[var].values.reshape(-1, 1), num_questions=6, target_qs=[2, 3, 5])
    # Prefix the columns with the main variable's name, e.g. 'Return_Q1'
    question_frames.append(qs_df.add_prefix(f'{var}_'))

# Bring the main variables and all question results together in one step
df = pd.concat([df, *question_frames], axis=1)

## Brings everything together and saves the file to the directory of your choice

In [None]:
# Join the survey data and the demographic data side by side
full_df = pd.concat([df, additional_demo_df], axis=1)

# File path (Adjust based on your local file system when running locally)
file_path = "C://Users//Jamie Humphries//OneDrive - ionscout.com//biased_Datasurvey_data2.xlsx"

try:
    # Write the combined dataset without the row index column
    full_df.to_excel(file_path, index=False)
except Exception as e:
    # Report the problem instead of stopping the notebook
    print("An error occurred:", e)
else:
    # Only reached when the file was written without an exception
    print("The full dataset file has been created successfully.")