In [8]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy import stats
from sklearn.preprocessing import LabelEncoder

# Load the CSV file while skipping the first few rows with metadata
file_path = './data.csv'  # Replace with your file path

# Skip metadata rows and handle empty columns
data = pd.read_csv(file_path, skiprows=3)

# Remove completely empty columns
data = data.dropna(axis=1, how='all')

# Display the first few rows and column names to inspect
print(data.head())
print(data.columns)


   Response ID Response Status                 IP Address  \
0          NaN             NaN                        NaN   
1   22080410.0         Started            212.186.128.243   
2   22080584.0         Started  2a09:bac2:f1:1c46::2d1:69   
3   22080588.0       Completed             178.165.203.12   
4   22080597.0       Completed             178.165.203.12   

  Timestamp (mm/dd/yyyy)  Duplicate  Time Taken to Complete (Seconds)  \
0                    NaN        NaN                               NaN   
1    10/16/2024 20:27:23        0.0                               6.0   
2    10/16/2024 20:44:51        0.0                              90.0   
3    10/16/2024 20:48:00        0.0                             184.0   
4    10/16/2024 20:52:15        0.0                             107.0   

   Seq. Number Country Code Region  Are you a local or tourist?  ...  \
0          NaN          NaN    NaN                          NaN  ...   
1          1.0           AT      9                

In [16]:
# Hypothesis 1: Viewing live footage significantly influences the choice of ski resort
# Assuming 'Has live footage ever led you to choose one ski resort over another?' is a binary variable
contingency_table_h1 = pd.crosstab(data['Has live footage ever led you to choose one ski resort over another?'], 
                                    data['Are you a local or tourist?'])
chi2_h1, p_h1, dof_h1, expected_h1 = stats.chi2_contingency(contingency_table_h1)

# Hypothesis 2: Frequent visitors to ski resorts are more likely to rely on live footage
# Example mapping for frequency
data['Visit Frequency'] = data['How often do you visit ski resorts?'].map({'Rarely': 1, 'Occasionally': 2, 'Frequently': 3})
corr_h2 = data[['Visit Frequency', 'Do you look at live footage of ski resorts before planning your visit?']].corr()

# Hypothesis 3: Specific factors seen on live footage have a stronger impact on ski resort choice
# Count factors affecting decision
factors = ['Wheater conditions', 'Snow quality', 'Crowdedness of the resort', 'Condition of slopes', 'Waiting times at lifts']
h3_results = data[factors].sum()

# Hypothesis 4: Ski resort visitors who find live footage accurate are more satisfied with their overall experience
satisfaction_corr_h4 = data[['After visiting how often has your experience matched your expectations set by live footage?', 
                              'Overall how satisfied were you with your visit to resorts where you viewed live footage beforehand?']].corr()



# Hypothesis 6: Visitors who find live footage to be reliable are more likely to revisit the ski resort
h6_corr = data[['Are you more likely to revisit a ski resort if they provide reliable and satisfactory live footage?', 
                 'After visiting how often has your experience matched your expectations set by live footage?']].corr()

# Hypothesis 7: Locals are more likely to base their decision on live footage than tourists
h7_results = data.groupby('Are you a local or tourist?')['Has live footage ever led you to choose one ski resort over another?'].value_counts(normalize=True)

# Output the results
print("H1 Results (Chi-squared):", chi2_h1, p_h1)
print("H2 Correlation (Visit Frequency vs. Live Footage Usage):", corr_h2)
print("H3 Factor Counts:", h3_results)
print("H4 Satisfaction Correlation:", satisfaction_corr_h4)

print("H6 Correlation (Reliability vs. Revisiting):", h6_corr)
print("H7 Local vs. Tourist Decision Influence:", h7_results)

H1 Results (Chi-squared): 0.025438884001219086 0.8732783318226793
H2 Correlation (Visit Frequency vs. Live Footage Usage):                                                     Visit Frequency  \
Visit Frequency                                                 NaN   
Do you look at live footage of ski resorts befo...              NaN   

                                                    Do you look at live footage of ski resorts before planning your visit?  
Visit Frequency                                                                                   NaN                       
Do you look at live footage of ski resorts befo...                                                1.0                       
H3 Factor Counts: Series([], dtype: float64)
H4 Satisfaction Correlation:                                                     After visiting how often has your experience matched your expectations set by live footage?  \
After visiting how often has your experience ma...                 

  h3_results = data[factors].sum()


In [27]:
# Convert 'What is your age?' to numeric
data['What is your age?'] = pd.to_numeric(data['What is your age?'], errors='coerce')

# Create age groups
data['Age Group'] = pd.cut(data['What is your age?'], 
                           bins=[0, 25, 45, 65, np.inf], 
                           labels=['Young', 'Middle-Aged', 'Old', 'Senior'], 
                           right=False)

# Create binary column for live footage usage
data['Live Footage Usage'] = data['Do you look at live footage of ski resorts before planning your visit?'].map({'Ja': 1, 'Nein': 0})

# Drop rows with NaN values in 'Age Group' or 'Live Footage Usage'
filtered_data = data.dropna(subset=['Age Group', 'Live Footage Usage'])

In [28]:
# Check the original data shape
print("Original data shape:", data.shape)

# Check the filtered data shape
print("Filtered data shape:", filtered_data.shape)

# Display the first few rows of the filtered data
print(filtered_data.head())


Original data shape: (168, 46)
Filtered data shape: (0, 46)
Empty DataFrame
Columns: [Response ID, Response Status, IP Address, Timestamp (mm/dd/yyyy), Duplicate, Time Taken to Complete (Seconds), Seq. Number, Country Code, Region, Are you a local or tourist?, Which category of skier/snowboarder are you, Unnamed: 19, Unnamed: 20, Unnamed: 21, Unnamed: 22, How often do you visit ski resorts?, Which activities do you usually do when visiting a ski resort?, Unnamed: 25, Unnamed: 26, What do you consider when choosing a ski resort?, Unnamed: 28, Unnamed: 29, Unnamed: 30, Do you look at live footage of ski resorts before planning your visit?, Wheater conditions, Snow quality, Crowdedness of the resort, Condition of slopes, Waiting times at lifts, Has live footage ever led you to choose one ski resort over another?, What factors seen on live footage would impact your travel desicions / ski resort choice?, Unnamed: 39, Unnamed: 40, Unnamed: 41, Unnamed: 42, Did you ever not go to a ski resort

In [21]:
# Check unique values in age column
print("Unique values in 'What is your age?':", data['What is your age?'].unique())

# Check unique values in live footage usage column
print("Unique values in 'Do you look at live footage of ski resorts before planning your visit?':", 
      data['Do you look at live footage of ski resorts before planning your visit?'].unique())


Unique values in 'What is your age?': [nan  3.  5.  6.  4.  2.  1.  7.]
Unique values in 'Do you look at live footage of ski resorts before planning your visit?': [nan  1.  2.  3.  4.  5.]


In [22]:
# Review the age grouping logic
data['Age Group'] = pd.cut(data['What is your age?'], 
                           bins=[0, 25, 35, 45, 55, 65, np.inf], 
                           labels=['<25', '25-35', '35-45', '45-55', '55-65', '65+'], 
                           right=False)

# Check how many rows were categorized into each age group
print("Age Group Counts:\n", data['Age Group'].value_counts())


Age Group Counts:
 <25      156
25-35      0
35-45      0
45-55      0
55-65      0
65+        0
Name: Age Group, dtype: int64


In [23]:
# Check the original responses for live footage usage
print("Responses for live footage usage:", data['Do you look at live footage of ski resorts before planning your visit?'].unique())

# Apply mapping
data['Live Footage Usage'] = data['Do you look at live footage of ski resorts before planning your visit?'].map({'Ja': 1, 'Nein': 0})

# Check the mapping results
print("Live Footage Usage counts:\n", data['Live Footage Usage'].value_counts())


Responses for live footage usage: [nan  1.  2.  3.  4.  5.]
Live Footage Usage counts:
 Series([], Name: Live Footage Usage, dtype: int64)


In [24]:
# Recreate filtered data with valid entries
filtered_data = data.dropna(subset=['Age Group', 'Live Footage Usage'])

# Check the shape and some rows of the filtered data again
print("Filtered Data Shape After Rechecking:", filtered_data.shape)
print(filtered_data[['Age Group', 'Live Footage Usage']].head())


Filtered Data Shape After Rechecking: (0, 46)
Empty DataFrame
Columns: [Age Group, Live Footage Usage]
Index: []


In [25]:
# Step 1: Map the age values to proper age groups
age_mapping = {
    1: '<20',
    2: '20-29',
    3: '30-39',
    4: '40-49',
    5: '50-59',
    6: '60-69',
    7: '70+'
}

# Step 2: Map the live footage responses
live_footage_mapping = {
    1: 1,  # Yes
    2: 0,  # No
    # Add more mappings if needed based on responses 3, 4, 5
}

# Apply mappings
data['Age Group'] = data['What is your age?'].map(age_mapping)
data['Live Footage Usage'] = data['Do you look at live footage of ski resorts before planning your visit?'].map(live_footage_mapping)

# Check the mappings
print("Mapped Age Group Counts:\n", data['Age Group'].value_counts())
print("Mapped Live Footage Usage Counts:\n", data['Live Footage Usage'].value_counts())

# Filter the data again
filtered_data = data.dropna(subset=['Age Group', 'Live Footage Usage'])

# Check the shape and some rows of the filtered data
print("Filtered Data Shape After Mappings:", filtered_data.shape)
print(filtered_data[['Age Group', 'Live Footage Usage']].head())


Mapped Age Group Counts:
 20-29    69
30-39    46
40-49    20
50-59    11
60-69     5
<20       4
70+       1
Name: Age Group, dtype: int64
Mapped Live Footage Usage Counts:
 1.0    48
0.0    45
Name: Live Footage Usage, dtype: int64
Filtered Data Shape After Mappings: (91, 46)
  Age Group  Live Footage Usage
3     30-39                 1.0
4     30-39                 0.0
5     50-59                 1.0
7     60-69                 0.0
8     40-49                 1.0


In [26]:
# Create a contingency table
contingency_table = pd.crosstab(filtered_data['Age Group'], filtered_data['Live Footage Usage'])

# Perform the Chi-square test
chi2_stat, p_value, dof, expected = stats.chi2_contingency(contingency_table)

# Print results
print("Chi-Squared Statistic:", chi2_stat)
print("P-Value:", p_value)
print("Degrees of Freedom:", dof)
print("Expected Frequencies:\n", expected)

# Interpret the result
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis: There is a significant relationship between age group and live footage usage.")
else:
    print("Fail to reject the null hypothesis: There is no significant relationship between age group and live footage usage.")


Chi-Squared Statistic: 5.335219068247454
P-Value: 0.5015920154126792
Degrees of Freedom: 6
Expected Frequencies:
 [[16.06593407 17.93406593]
 [14.17582418 15.82417582]
 [ 5.67032967  6.32967033]
 [ 3.30769231  3.69230769]
 [ 1.41758242  1.58241758]
 [ 0.47252747  0.52747253]
 [ 1.89010989  2.10989011]]
Fail to reject the null hypothesis: There is no significant relationship between age group and live footage usage.


In [30]:
# create overview of all the hypotheses and print them
# Hypothesis 1: Viewing live footage significantly influences the choice of ski resort
# Hypothesis 2: Frequent visitors to ski resorts are more likely to rely on live footage
# Hypothesis 3: Specific factors seen on live footage have a stronger impact on ski resort choice
# Hypothesis 4: Ski resort visitors who find live footage accurate are more satisfied with their overall experience
# Hypothesis 5: Age groups have different preferences for live footage usage
# Hypothesis 6: Visitors who find live footage to be reliable are more likely to revisit the ski resort
# Hypothesis 7: Locals are more likely to base their decision on live footage than tourists
# Hypothesis 8: The frequency of ski resort visits is correlated with the reliance on live footage

# Print the results of all hypotheses
print("H1 Results (Chi-squared):", chi2_h1, p_h1)
print("H2 Correlation (Visit Frequency vs. Live Footage Usage):", corr_h2)
print("H3 Factor Counts:", h3_results)
print("H4 Satisfaction Correlation:", satisfaction_corr_h4)
print("H5 Results (Chi-squared):", chi2_stat, p_value)  # Add results for H5
print("H6 Correlation (Reliability vs. Revisiting):", h6_corr)
print("H7 Local vs. Tourist Decision Influence:", h7_results)

H1 Results (Chi-squared): 0.025438884001219086 0.8732783318226793
H2 Correlation (Visit Frequency vs. Live Footage Usage):                                                     Visit Frequency  \
Visit Frequency                                                 NaN   
Do you look at live footage of ski resorts befo...              NaN   

                                                    Do you look at live footage of ski resorts before planning your visit?  
Visit Frequency                                                                                   NaN                       
Do you look at live footage of ski resorts befo...                                                1.0                       
H3 Factor Counts: Series([], dtype: float64)
H4 Satisfaction Correlation:                                                     After visiting how often has your experience matched your expectations set by live footage?  \
After visiting how often has your experience ma...                 

In [37]:
import pandas as pd

# Sample hypothesis results based on your data
chi2_h1, p_h1 = 0.0254, 0.8733
corr_h2 = pd.DataFrame({
    'Visit Frequency': [None, None],
    'Live Footage Usage': [None, 1.0]
})  # Replace with actual correlation if available

h3_results = pd.Series([], dtype='float64')  # Replace with actual results if available

satisfaction_corr_h4 = pd.DataFrame({
    'After visiting how often has your experience matched your expectations set by live footage?': [1.0, 0.5048],
    'Overall satisfaction with visit': [0.5048, 1.0]
})

chi2_h5, p_h5 = 5.3352, 0.5016

h6_corr = pd.DataFrame({
    'Are you more likely to revisit a ski resort if they provide reliable and satisfactory live footage?': [1.0, 0.3552],
    'After visiting how often has your experience matched your expectations set by live footage?': [0.3552, 1.0]
})

h7_results = pd.DataFrame({
    'Are you a local or tourist?': [1.0, 2.0],
    'Has live footage ever led you to choose one ski resort over another?': [0.2947, 0.7302]
})

# Create a function to write results to a Markdown file
def write_results_to_md(filename):
    with open(filename, 'w') as f:
        f.write("# Hypothesis Results Document\n\n")

        f.write("## Hypothesis 1:\n")
        f.write("**Viewing live footage significantly influences the choice of ski resort.**\n")
        f.write("This hypothesis tests whether the availability of live footage impacts the decision-making process of potential visitors.\n")
        f.write(f"**Results (Chi-squared):**\n- Chi-squared Statistic: **{chi2_h1}**\n- p-value: **{p_h1}**\n")
        f.write("A p-value greater than 0.05 suggests that there is not enough evidence to reject the null hypothesis, indicating that live footage may not significantly influence choice.\n")
        f.write("**Conclusion:** The findings indicate that live footage does not play a significant role in influencing the choice of ski resort among visitors, suggesting that other factors may be more decisive in their decision-making process.\n\n")

        f.write("## Hypothesis 2:\n")
        f.write("**Frequent visitors to ski resorts are more likely to rely on live footage.**\n")
        f.write("This hypothesis examines the correlation between how often respondents visit ski resorts and their usage of live footage to aid in their decision-making.\n")
        f.write("**Correlation (Visit Frequency vs. Live Footage Usage):**\n")
        f.write(corr_h2.to_markdown(index=True) + "\n")
        f.write("**Conclusion:** Further analysis is needed to understand the nature of this correlation. While frequent visitors may engage with live footage, the exact impact on decision-making requires additional research.\n\n")

        f.write("## Hypothesis 3:\n")
        f.write("**Specific factors seen on live footage have a stronger impact on ski resort choice.**\n")
        f.write("This hypothesis focuses on identifying which factors from live footage most influence the choice of ski resort.\n")
        f.write("**Factor Counts:**\n")
        f.write(f"- Results:\n  {h3_results}\n")
        f.write("**Conclusion:** More data is required to determine the specific factors influencing choice from live footage, as current results are inconclusive.\n\n")

        f.write("## Hypothesis 4:\n")
        f.write("**Ski resort visitors who find live footage accurate are more satisfied with their overall experience.**\n")
        f.write("This hypothesis investigates the relationship between the accuracy of live footage and visitor satisfaction after their visit.\n")
        f.write("**Satisfaction Correlation:**\n")
        f.write(satisfaction_corr_h4.to_markdown(index=True) + "\n")
        f.write("**Conclusion:** There appears to be a positive correlation between satisfaction and the accuracy of live footage. This suggests that enhancing the quality and reliability of live footage could improve visitor satisfaction.\n")

        f.write("## Hypothesis 5:\n")
        f.write("**Age groups have different preferences for live footage usage.**\n")
        f.write("This hypothesis analyzes whether different age demographics exhibit varying levels of preference for using live footage when choosing a ski resort.\n")
        f.write("**Results (Chi-squared):**\n")
        f.write(f"- Chi-squared Statistic: **{chi2_h5}**\n- p-value: **{p_h5}**\n")
        f.write("A p-value greater than 0.05 indicates insufficient evidence to reject the null hypothesis.\n")
        f.write("**Conclusion:** The results suggest that age does not significantly influence preferences for live footage usage, but further exploration is needed to understand the nuances across different age groups.\n\n")

        f.write("## Hypothesis 6:\n")
        f.write("**Visitors who find live footage to be reliable are more likely to revisit the ski resort.**\n")
        f.write("This hypothesis tests whether the reliability of live footage correlates with a visitor's likelihood of returning to the ski resort.\n")
        f.write("**Correlation (Reliability vs. Revisiting):**\n")
        f.write(h6_corr.to_markdown(index=True) + "\n")
        f.write("**Conclusion:** Reliability of live footage does seem to play a role in the likelihood of revisiting. Enhancing the quality and consistency of live footage could encourage repeat visits.\n")

        f.write("## Hypothesis 7:\n")
        f.write("**Locals are more likely to base their decision on live footage than tourists.**\n")
        f.write("This hypothesis looks at how the residency status of visitors (local vs. tourist) affects their reliance on live footage when choosing ski resorts.\n")
        f.write("**Decision Influence:**\n")
        f.write(h7_results.to_markdown(index=False) + "\n")
        f.write("**Conclusion:** The results indicate that locals might rely more on live footage when making their choices compared to tourists. This could reflect a greater familiarity with the local conditions showcased in the footage.\n")

# Call the function to write results to a Markdown file
write_results_to_md('hypothesis_results.md')

# Optionally, print results to console
print("# Hypothesis Results Document\n")

print("## Hypothesis 1:")
print("**Viewing live footage significantly influences the choice of ski resort.**")
print("This hypothesis tests whether the availability of live footage impacts the decision-making process of potential visitors.")
print(f"**Results (Chi-squared):**")
print(f"- Chi-squared Statistic: **{chi2_h1}**")
print(f"- p-value: **{p_h1}**")
print("A p-value greater than 0.05 suggests that there is not enough evidence to reject the null hypothesis, indicating that live footage may not significantly influence choice.")
print("**Conclusion:** The findings indicate that live footage does not play a significant role in influencing the choice of ski resort among visitors, suggesting that other factors may be more decisive in their decision-making process.\n")

print("## Hypothesis 2:")
print("**Frequent visitors to ski resorts are more likely to rely on live footage.**")
print("This hypothesis examines the correlation between how often respondents visit ski resorts and their usage of live footage to aid in their decision-making.")
print("**Correlation (Visit Frequency vs. Live Footage Usage):**")
print(corr_h2.to_markdown(index=True))
print("**Conclusion:** Further analysis is needed to understand the nature of this correlation. While frequent visitors may engage with live footage, the exact impact on decision-making requires additional research.\n")

print("## Hypothesis 3:")
print("**Specific factors seen on live footage have a stronger impact on ski resort choice.**")
print("This hypothesis focuses on identifying which factors from live footage most influence the choice of ski resort.")
print("**Factor Counts:**")
print(h3_results)
print("**Conclusion:** More data is required to determine the specific factors influencing choice from live footage, as current results are inconclusive.\n")

print("## Hypothesis 4:")
print("**Ski resort visitors who find live footage accurate are more satisfied with their overall experience.**")
print("This hypothesis investigates the relationship between the accuracy of live footage and visitor satisfaction after their visit.")
print("**Satisfaction Correlation:**")
print(satisfaction_corr_h4.to_markdown(index=True))
print("**Conclusion:** There appears to be a positive correlation between satisfaction and the accuracy of live footage. This suggests that enhancing the quality and reliability of live footage could improve visitor satisfaction.\n")

print("## Hypothesis 5:")
print("**Age groups have different preferences for live footage usage.**")
print("This hypothesis analyzes whether different age demographics exhibit varying levels of preference for using live footage when choosing a ski resort.")
print("**Results (Chi-squared):**")
print(f"- Chi-squared Statistic: **{chi2_h5}**")
print(f"- p-value: **{p_h5}**")
print("A p-value greater than 0.05 indicates insufficient evidence to reject the null hypothesis.")
print("**Conclusion:** The results suggest that age does not significantly influence preferences for live footage usage, but further exploration is needed to understand the nuances across different age groups.\n")

print("## Hypothesis 6:")
print("**Visitors who find live footage to be reliable are more likely to revisit the ski resort.**")
print("This hypothesis tests whether the reliability of live footage correlates with a visitor's likelihood of returning to the ski resort.")
print("**Correlation (Reliability vs. Revisiting):**")
print(h6_corr.to_markdown(index=True))
print("**Conclusion:** Reliability of live footage does seem to play a role in the likelihood of revisiting. Enhancing the quality and consistency of live footage could encourage repeat visits.\n")

print("## Hypothesis 7:")
print("**Locals are more likely to base their decision on live footage than tourists.**")
print("This hypothesis looks at how the residency status of visitors (local vs. tourist) affects their reliance on live footage when choosing ski resorts.")
print("**Decision Influence:**")
print(h7_results.to_markdown(index=False))
print("**Conclusion:** The results indicate that locals might rely more on live footage when making their choices compared to tourists. This could reflect a greater familiarity with the local conditions showcased in the footage.\n")


# Hypothesis Results Document

## Hypothesis 1:
**Viewing live footage significantly influences the choice of ski resort.**
This hypothesis tests whether the availability of live footage impacts the decision-making process of potential visitors.
**Results (Chi-squared):**
- Chi-squared Statistic: **0.0254**
- p-value: **0.8733**
A p-value greater than 0.05 suggests that there is not enough evidence to reject the null hypothesis, indicating that live footage may not significantly influence choice.
**Conclusion:** The findings indicate that live footage does not play a significant role in influencing the choice of ski resort among visitors, suggesting that other factors may be more decisive in their decision-making process.

## Hypothesis 2:
**Frequent visitors to ski resorts are more likely to rely on live footage.**
This hypothesis examines the correlation between how often respondents visit ski resorts and their usage of live footage to aid in their decision-making.
**Correlation (Vis