In [None]:
# `google.colab` supplies the browser upload dialog; presumably only importable inside Google Colab.
from google.colab import files
uploaded = files.upload()  # Upload a file from your local system to the Google Colab environment

import pandas as pd
# Load the study data into `df`, the DataFrame the later cells read and add columns to.
# NOTE(review): the path below is a fixed literal and `uploaded` is never consulted, so the
# uploaded file must be stored under exactly this name for the read to succeed — confirm.
df = pd.read_csv('ASD_Traits_Study_Data.csv')

In [None]:
import pandas as pd  # Import pandas for data manipulation and analysis
import numpy as np  # Import numpy for numerical operations and handling arrays
import seaborn as sns  # Import seaborn for creating statistical data visualizations
import matplotlib.pyplot as plt  # Import matplotlib for general plotting
from scipy.stats import chi2_contingency  # Import chi-squared test function for categorical data analysis

In [None]:
!pip install --upgrade psutil  # Upgrade the psutil library for system monitoring and resource management
!pip install dython  # Install the dython library for advanced correlation analysis and other data analysis utilities

In [None]:
# Prepare the data behind the ASD-traits pie chart.

# Readable names for the numeric codes stored in the 'ASD_traits' column
label_mapping = {1: 'Yes (ASD)', 2: 'No (Non-ASD)'}

# Translate every row's code to its label, then tally how often each label occurs
asd_traits_labelled = df['ASD_traits'].map(label_mapping)
asd_traits_counts = asd_traits_labelled.value_counts()
labels = asd_traits_counts.index  # Wedge captions, in the same order as the counts

# Open the 8x8-inch figure the pie will be drawn into
plt.figure(figsize=(8, 8))

# Define a function to display both absolute values and percentages in the pie chart
def func(pct, allvalues):
    """Format one pie-wedge caption as ``"<count> (<percent>%)"``.

    Args:
        pct: The wedge's share of the whole, in percent (0-100).
        allvalues: The counts behind all wedges; their sum is the total.

    Returns:
        A string holding the wedge's count and its percentage to one decimal,
        e.g. ``"7 (70.0%)"``.
    """
    # The count is reconstructed from a floating-point percentage, so the product can
    # land a hair below the true integer (6.999999 instead of 7). Round to the nearest
    # integer instead of truncating, otherwise the caption shows one too few.
    absolute = int(round(pct / 100. * sum(allvalues)))
    return f"{absolute} ({pct:.1f}%)"  # Return the formatted string with both values

# Render the pie chart; each wedge caption (count plus percentage) is produced by func
wedge_caption = lambda pct: func(pct, asd_traits_counts)

plt.pie(
    asd_traits_counts,
    labels=labels,                          # One caption per category
    autopct=wedge_caption,                  # Count and percentage inside each wedge
    startangle=140,                         # Rotate the first wedge's starting angle
    colors=sns.color_palette("pastel"),     # Pastel wedge colours
    wedgeprops={'edgecolor': 'black'},      # Black outline around every wedge
    textprops={'fontsize': 14},             # Font size of all text on the chart
)

plt.show()  # Display the pie chart

In [None]:
import matplotlib.pyplot as plt  # Import matplotlib for plotting
import seaborn as sns  # Import seaborn for enhanced visualizations
import pandas as pd  # Import pandas for data manipulation and analysis
from scipy.stats import chi2_contingency  # Import chi-squared test function for categorical data analysis
import numpy as np  # Import numpy for numerical computations

# Set the DPI (dots per inch) for the figures to ensure high-resolution plots.
# This is a global matplotlib setting: it applies to every figure created after this
# line, not only to the ones in this cell. At 1000 dpi a 12x10-inch figure is rendered
# at 12000x10000 pixels.
plt.rcParams['figure.dpi'] = 1000

# Function to calculate Cramer's V correlation coefficient for two categorical variables
def cramers_v(var1, var2):
    """Cramér's V association between two categorical variables.

    V = sqrt(chi2 / (n * min(r - 1, k - 1))), where chi2 is the Pearson chi-squared
    statistic of the r-by-k contingency table and n is the number of observations.

    Args:
        var1: First categorical variable (anything ``pd.crosstab`` accepts, e.g. a Series).
        var2: Second categorical variable, aligned with ``var1``.

    Returns:
        V rounded to 3 decimals, or NaN when the table has fewer than two rows or
        columns (one variable is constant or there is no data), where V is undefined.
    """
    contingency = pd.crosstab(var1, var2)  # Create a contingency table
    r, k = contingency.shape  # Get the dimensions of the contingency table

    # With a single row or column the denominator below is zero; report "undefined"
    # explicitly instead of dividing by zero.
    if min(r, k) < 2:
        return np.nan

    # correction=False: by default scipy applies Yates' continuity correction to 2x2
    # tables, which shrinks chi2 and makes V fall short of 1 even for identical binary
    # variables. The coefficient is defined on the uncorrected statistic.
    chi2 = chi2_contingency(contingency, correction=False)[0]
    n = contingency.sum().sum()  # Get the total number of observations
    return round(np.sqrt(chi2 / (n * min(r - 1, k - 1))), 3)  # Calculate and return Cramer's V

# Function to calculate Phi correlation coefficient for two categorical variables
def phi_correlation(var1, var2):
    """Phi coefficient, sqrt(chi2 / n), between two categorical variables.

    Phi is the association measure for two binary variables (a 2x2 table), where it
    lies in [0, 1]. The same formula is evaluated for larger tables as well, but the
    result is then not bounded by 1.

    Args:
        var1: First categorical variable (anything ``pd.crosstab`` accepts, e.g. a Series).
        var2: Second categorical variable, aligned with ``var1``.

    Returns:
        Phi rounded to 3 decimals, or NaN when there are no observations.
    """
    contingency = pd.crosstab(var1, var2)  # Create a contingency table

    # No observations at all: nothing to test, so the coefficient is undefined.
    if contingency.size == 0:
        return np.nan

    # correction=False: by default scipy applies Yates' continuity correction to 2x2
    # tables, which shrinks chi2 so that phi no longer reaches 1 for identical binary
    # variables. Phi is defined on the uncorrected statistic.
    chi2 = chi2_contingency(contingency, correction=False)[0]
    n = contingency.sum().sum()  # Get the total number of observations
    return round(np.sqrt(chi2 / n), 3)  # Calculate and return Phi correlation coefficient

# List of categorical variables for correlation analysis
categorical_columns = ['Gender', 'Ethnicity', 'Family_mem_with_ASD', 'Rater', 'ASD_traits',
                       'SRS', 'CARS', 'A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'AQ10']

# Square float matrices for both coefficients. They start as identity matrices because a
# variable's association with itself is fixed at 1.00; every off-diagonal cell is filled
# in by the loop below.
cramers_v_matrix = pd.DataFrame(np.eye(len(categorical_columns)),
                                index=categorical_columns, columns=categorical_columns)
phi_matrix = cramers_v_matrix.copy()

# Both coefficients are symmetric in their arguments, so each unordered pair is computed
# once and mirrored across the diagonal. Values are kept as floats rounded to two
# decimals (the precision the heatmaps show).
# NOTE(review): phi is evaluated for every pair, including variables with more than two
# levels, where sqrt(chi2 / n) is not bounded by 1 — confirm that is intended.
for position, col1 in enumerate(categorical_columns):
    for col2 in categorical_columns[position + 1:]:
        v_value = round(float(cramers_v(df[col1], df[col2])), 2)  # Calculate Cramer's V
        phi_value = round(float(phi_correlation(df[col1], df[col2])), 2)  # Calculate Phi coefficient

        cramers_v_matrix.loc[col1, col2] = v_value
        cramers_v_matrix.loc[col2, col1] = v_value
        phi_matrix.loc[col1, col2] = phi_value
        phi_matrix.loc[col2, col1] = phi_value

# Visualization: Heatmap for Cramer's V correlation matrix
plt.figure(figsize=(12, 10))
sns.heatmap(cramers_v_matrix, annot=True, cmap='Pastel1')  # Plot heatmap with annotations
plt.show()

# Visualization: Heatmap for Phi correlation matrix
plt.figure(figsize=(12, 10))
sns.heatmap(phi_matrix, annot=True, cmap='Pastel1')  # Plot heatmap with annotations
plt.show()

In [None]:
import pandas as pd  # Import pandas for data manipulation

# List of binary variables
binary_variables = ['Gender', 'Family_mem_with_ASD', 'A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10']

# Mapping dictionaries for descriptive labels
gender_mapping = {1: 'Boy', 2: 'Girl'}
family_asd_mapping = {1: 'Yes', 2: 'No'}

# Apply mapping to the dataset for readability
df['Gender_mapped'] = df['Gender'].map(gender_mapping)  # Map 'Gender' to descriptive labels
df['Family_mem_with_ASD_mapped'] = df['Family_mem_with_ASD'].map(family_asd_mapping)  # Map 'Family_mem_with_ASD'

# Raw codes reported in the "Value 1" and "Value 0" columns, per variable.
# 'Gender' and 'Family_mem_with_ASD' are coded 1/2 (see the mappings above: 1 = Boy / Yes,
# 2 = Girl / No); every other variable is looked up as 1/0.
# Looking counts up by raw code fixes the 'Family_mem_with_ASD' row, which used to be
# searched for 'Boy'/'Girl' labels it never contains and therefore always came out as 0.
# NOTE(review): A1-A10 are assumed to be coded 1/0 — confirm against the data dictionary.
default_codes = (1, 0)
value_codes = {'Gender': (1, 2), 'Family_mem_with_ASD': (1, 2)}

# Split the rows once by outcome: ASD = Yes (1) and ASD = No (2)
asd_yes_rows = df[df['ASD_traits'] == 1]
asd_no_rows = df[df['ASD_traits'] == 2]

# Initialize an empty list to store frequency summary data
summary_data = []

# Calculate frequencies for each binary variable
for var in binary_variables:
    yes_asd = asd_yes_rows[var].value_counts()  # For ASD Yes (1)
    no_asd = asd_no_rows[var].value_counts()    # For ASD No (2)

    code_one, code_zero = value_codes.get(var, default_codes)

    # A code that never occurs in a group is reported as 0, so no cell is left missing
    summary_data.append({
        'Variable': var,
        'Count for Value 1 - Yes (ASD)': yes_asd.get(code_one, 0),
        'Count for Value 1 - No (Non-ASD)': no_asd.get(code_one, 0),
        'Count for Value 0 - Yes (ASD)': yes_asd.get(code_zero, 0),
        'Count for Value 0 - No (Non-ASD)': no_asd.get(code_zero, 0)
    })

# Convert the list of summary data to a DataFrame (one row per variable)
summary_table = pd.DataFrame(summary_data)

# Print the summary table
print(summary_table)

# Optionally save the summary table to an Excel file
summary_table.to_excel('binary_variable_summary_with_mapped_values.xlsx', index=False)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Map 'ASD_traits' values to descriptive labels
label_mapping = {1: 'Yes (ASD)', 2: 'No (Non-ASD)'}
df['ASD_traits_mapped'] = df['ASD_traits'].map(label_mapping)

# Define mappings for other variables
gender_mapping = {1: 'Male', 2: 'Female'}
family_mapping = {1: 'Yes', 2: 'No'}
rater_mapping = {1: 'Family Member', 2: 'Healthcare Professional', 3: 'Others'}

# Apply mappings for readability (these overwrite any '*_mapped' columns already on df)
df['Gender_mapped'] = df['Gender'].map(gender_mapping)
df['Family_mem_with_ASD_mapped'] = df['Family_mem_with_ASD'].map(family_mapping)
df['Rater_mapped'] = df['Rater'].map(rater_mapping)

# Set graph style and font size
sns.set(style="whitegrid", font_scale=1.1)

# Set DPI for high-quality visualization (global setting; 20x12 inches -> 20000x12000 pixels)
plt.rcParams['figure.dpi'] = 1000

# Configure figure size
plt.figure(figsize=(20, 12))

# Variables to visualize
variables = ['Gender_mapped', 'Family_mem_with_ASD_mapped', 'Rater_mapped', 'SRS', 'CARS', 'AQ10']

# Score variables whose x tick labels are drawn at a fixed font size
score_variables = ('SRS', 'CARS', 'AQ10')

# Create one count plot per variable in a 2x3 layout, bars split by ASD-trait class
for i, var in enumerate(variables):
    ax = plt.subplot(2, 3, i + 1)  # 2x3 grid layout
    sns.countplot(
        data=df,
        x=var,
        hue='ASD_traits_mapped',
        palette="bright",
        dodge=True,
        edgecolor='black',
        ax=ax
    )

    if var in score_variables:
        # Keep the tick labels seaborn derives from the values actually present in the
        # data. Overwriting them with a fixed 1..10 / 1..4 sequence mislabels bars as
        # soon as a score is absent or the scale starts elsewhere (e.g. at 0).
        ax.tick_params(axis='x', labelsize=11)

    # Set titles and axis labels with proper font sizes
    display_name = var.replace("_mapped", "")
    ax.set_title(f'{display_name} by ASD Traits', fontsize=14)
    ax.set_xlabel(display_name, fontsize=12)
    ax.set_ylabel('Count', fontsize=12)

    # Write each bar's count just above it, horizontally centred on the bar
    for p in ax.patches:
        height = p.get_height()
        if height > 0:
            ax.text(p.get_x() + p.get_width() / 2., height + 1, f'{int(height)}',
                    ha='center', fontsize=10, color='black')

    # Adjust legend formatting
    ax.legend(title="", fontsize=10)

# Adjust layout for better spacing
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def _label_bar_counts(ax):
    """Write the integer height of every non-empty bar on ``ax`` just above that bar."""
    for bar in ax.patches:
        bar_height = bar.get_height()
        if bar_height > 0:
            ax.annotate(f'{int(bar_height)}',
                        (bar.get_x() + bar.get_width() / 2., bar_height),
                        ha='center', va='baseline', fontsize=10, color='black',
                        xytext=(0, 5), textcoords='offset points')


# Readable class labels for the numeric 'ASD_traits' codes
label_mapping = {1: 'Yes (ASD)', 2: 'No (Non-ASD)'}
df['ASD_traits_mapped'] = df['ASD_traits'].map(label_mapping)

# Readable names for the 'Ethnicity' codes; code n is the n-th name in this list (from 1)
ethnicity_mapping = dict(enumerate([
    'Asian',
    'Black',
    'Hispanic',
    'Latino',
    'Middle Eastern',
    'Mixed',
    'Native Indian',
    'Others',
    'Pacific Islander',
    'South Asian',
    'White European',
], start=1))
df['Ethnicity_mapped'] = df['Ethnicity'].map(ethnicity_mapping)

# Plot appearance: seaborn grid style, global high-resolution setting, 14x8-inch canvas
sns.set(style="whitegrid")
plt.rcParams['figure.dpi'] = 1000
plt.figure(figsize=(14, 8))

# Top panel: number of participants per age, split by ASD-trait class
plt.subplot(2, 1, 1)
ax1 = sns.countplot(x='Age_Years', hue='ASD_traits_mapped', data=df, palette="Set2", edgecolor="black")
ax1.set_title('Age Distribution by ASD Traits', fontsize=14)
ax1.set_xlabel('Age (Years)', fontsize=12)
ax1.set_ylabel('Count', fontsize=12)
ax1.legend(title="", loc='upper right')
_label_bar_counts(ax1)

# Bottom panel: number of participants per ethnicity, split by ASD-trait class
plt.subplot(2, 1, 2)
ax2 = sns.countplot(x='Ethnicity_mapped', hue='ASD_traits_mapped', data=df, palette="Set3", edgecolor="black")
ax2.set_title('Ethnicity Distribution by ASD Traits', fontsize=14)
ax2.set_xlabel('Ethnicity', fontsize=12)
ax2.set_ylabel('Count', fontsize=12)

# Slant the ethnicity names so they do not run into each other
plt.xticks(rotation=30, ha='right')

ax2.legend(title="", loc='upper right')
_label_bar_counts(ax2)

# Tidy the spacing between the two panels and render
plt.tight_layout()
plt.show()

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Map 'ASD_traits' values to descriptive labels
label_mapping = {1: 'Yes (ASD)', 2: 'No (Non-ASD)'}
df['ASD_traits_mapped'] = df['ASD_traits'].map(label_mapping)

# Inputs for PCA: the numeric columns, WITHOUT the outcome column itself. Leaving
# 'ASD_traits' in would let the label the plot is coloured by shape the components
# it is plotted on. (The '*_mapped' text columns are already excluded by dtype.)
# NOTE(review): the features are not standardised, so columns with larger numeric ranges
# carry more weight in the components — confirm that is intended.
pca_features = (
    df.select_dtypes(include=[float, int])
      .drop(columns=['ASD_traits'], errors='ignore')
)

# Apply PCA with 2 components
pca = PCA(n_components=2)
pca_components = pca.fit_transform(pca_features)

# Create a DataFrame for PCA results; reuse df's index so the labels assigned on the
# next line line up row by row whatever index df carries
pca_df = pd.DataFrame(data=pca_components, columns=['PC1', 'PC2'], index=df.index)
pca_df['ASD_traits_mapped'] = df['ASD_traits_mapped']

# Plot PCA results
plt.figure(figsize=(9, 7.5))  # Set figure size
sns.scatterplot(
    x='PC1',
    y='PC2',
    hue='ASD_traits_mapped',
    data=pca_df,
    palette='Set2',
    s=100,  # Adjust marker size
    edgecolor='black'  # Add border to markers
)

# Add axis labels and an untitled legend
plt.xlabel('Principal Component 1 (PC1)', fontsize=14)
plt.ylabel('Principal Component 2 (PC2)', fontsize=14)
plt.legend(title='', fontsize=12)  # Adjust legend font size

# Add grid for better readability
plt.grid(alpha=0.3)

# Ensure proper layout
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import pandas as pd

# Re-read the raw file, replacing `df`: the frame then holds only the CSV's own columns,
# so the feature matrix below contains none of the columns added by earlier cells.
df = pd.read_csv('ASD_Traits_Study_Data.csv')

# Readable class labels (used only for colouring the plot)
label_mapping = {1: 'Yes (ASD)', 2: 'No (Non-ASD)'}
df['ASD_traits_mapped'] = df['ASD_traits'].map(label_mapping)

# Target is the raw 'ASD_traits' code; features are every other column of the file
y = df['ASD_traits']
X = df.drop(columns=['ASD_traits', 'ASD_traits_mapped'])

# LDA yields at most (number of classes - 1) discriminants, and never more than the
# number of features. Use two when that many exist, otherwise one.
n_classes = np.unique(y).size
max_components = min(len(X.columns), n_classes - 1)
n_components = 1 if max_components < 2 else 2

# Fit LDA and project the observations onto the discriminant axes
lda = LDA(n_components=n_components)
lda_results = lda.fit_transform(X, y)

plt.figure(figsize=(9, 7.5))  # Set figure size
class_labels = df['ASD_traits_mapped']
first_discriminant = lda_results[:, 0]

if n_components == 1:
    # Single discriminant: one strip of points per class along LD1
    sns.stripplot(
        x=first_discriminant,
        hue=class_labels,
        palette='Set2',
        size=10,  # Marker size
        jitter=0.25,  # Spread points so they overlap less
        dodge=True,
        linewidth=1,  # Outline width of each point
        edgecolor='black'  # Outline colour
    )
    plt.xlabel('Linear Discriminant 1 (LD1)', fontsize=14)
    plt.ylabel('')  # The vertical axis carries no quantity in the 1D view
else:
    # Two discriminants (only reachable with three or more classes): LD1 against LD2
    sns.scatterplot(
        x=first_discriminant,
        y=lda_results[:, 1],
        hue=class_labels,
        palette='Set2',
        s=200,  # Marker size
        edgecolor='black'  # Outline colour
    )
    plt.xlabel('Linear Discriminant 1 (LD1)', fontsize=14)
    plt.ylabel('Linear Discriminant 2 (LD2)', fontsize=14)

# Legend without a title
plt.legend(title='')

# Light grid for readability
plt.grid(alpha=0.3)

# Tidy the layout and render
plt.tight_layout()
plt.show()