In [1]:
import pandas as pd
import requests
import gzip
import io

In [2]:
# Location of the gzip-compressed CSV to download
# (supplementary file of GEO series GSE174369).
url = (
    "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE174nnn/GSE174369/suppl/"
    "GSE174369_PerMeth.AllRegions.csv.gz"
)

In [3]:
# Fetch the gzipped file from the URL.
# A timeout is set so a stalled connection raises instead of hanging the kernel
# forever. It bounds the wait for the connection and for each chunk of data,
# not the total download time, so a large file still downloads fine.
response = requests.get(url, timeout=60)

In [9]:
# Abort with a clear error if the download failed; otherwise `df` would never
# be defined and every later cell would fail with a confusing NameError.
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch data. Status code: {response.status_code}")

# Decompress the gzipped payload in memory and load the CSV into a DataFrame
with gzip.GzipFile(fileobj=io.BytesIO(response.content)) as f:
    df = pd.read_csv(f)

# Display the first few rows of the DataFrame
print(df.head())
# Print the column names
print("Column names:", df.columns.tolist())

           Unnamed: 0  Day_0_Control_3_R1  Day_0_Control_3_R2  \
0    chr1.15750.16253           90.340909           79.000000   
1    chr1.17349.17853           92.834891           79.646018   
2    chr1.19016.20044           79.591837           85.106383   
3    chr1.29253.29642            0.211864            0.000000   
4  chr1.128751.129717           35.526316           27.027027   

   Day_0_Control_4_R1  Day_0_Control_4_R2  Day_0_Control_1_R1  \
0           90.000000           89.204545           82.304527   
1           78.761062           93.536122           89.577465   
2           60.869565           89.147287           82.978723   
3            0.000000            0.543478            0.526316   
4           39.226519           31.543624           31.496063   

   Day_0_Control_1_R2  Day_0_Control_2_R1  Day_0_Control_2_R2  \
0           80.295567           78.571429           50.000000   
1           91.586538           85.795455           90.000000   
2           88.095238  

In [6]:
# Display basic information about the dataset.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line afterwards.
df.info()

# Display summary statistics
print(df.describe())

# Count missing values in each column
missing_values = df.isnull().sum()
print("Missing values per column:\n", missing_values)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267882 entries, 0 to 267881
Data columns (total 49 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Unnamed: 0           267882 non-null  object 
 1   Day_0_Control_3_R1   267882 non-null  float64
 2   Day_0_Control_3_R2   267882 non-null  float64
 3   Day_0_Control_4_R1   267882 non-null  float64
 4   Day_0_Control_4_R2   267882 non-null  float64
 5   Day_0_Control_1_R1   267882 non-null  float64
 6   Day_0_Control_1_R2   267882 non-null  float64
 7   Day_0_Control_2_R1   267882 non-null  float64
 8   Day_0_Control_2_R2   267882 non-null  float64
 9   Day_0_FSHD2_3_R1     267882 non-null  float64
 10  Day_0_FSHD2_3_R2     267882 non-null  float64
 11  Day_0_FSHD2_2_R1     267882 non-null  float64
 12  Day_0_FSHD2_2_R2     267882 non-null  float64
 13  Day_0_FSHD2_4_R1     267882 non-null  float64
 14  Day_0_FSHD2_4_R2     267882 non-null  float64
 15  Day_0_FSHD2_1_R1 

In [10]:
# Give the unnamed first column (the region identifiers) the name 'region'
df = df.rename(columns={'Unnamed: 0': 'region'})


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Reshape the wide table (one column per sample) into long format: one row per
# (region, sample) pair. Nothing earlier in the notebook defines `df_melted`,
# so it is built here to keep the notebook runnable from a fresh kernel.
df_melted = df.melt(
    id_vars='region',
    var_name='sample',
    value_name='methylation_level',
)

# Fill missing methylation values with the column mean. Only the numeric column
# is touched: averaging the whole frame would also try to average the string
# columns ('region', 'sample'), which raises a TypeError on pandas >= 2.0.
df_melted['methylation_level'] = df_melted['methylation_level'].fillna(
    df_melted['methylation_level'].mean()
)

# Normalize methylation levels to z-scores (subtract the mean, divide by the
# standard deviation), computed after the fill above
methylation = df_melted['methylation_level']
df_melted['normalized_methylation'] = (methylation - methylation.mean()) / methylation.std()

# Display the first few rows of the preprocessed DataFrame
print(df_melted.head())



               region              sample  methylation_level
0    chr1.15750.16253  Day_0_Control_3_R1          90.340909
1    chr1.17349.17853  Day_0_Control_3_R1          92.834891
2    chr1.19016.20044  Day_0_Control_3_R1          79.591837
3    chr1.29253.29642  Day_0_Control_3_R1           0.211864
4  chr1.128751.129717  Day_0_Control_3_R1          35.526316


In [12]:
# Derive the condition from the sample name with vectorized string operations
# instead of a per-row Python lambda: names containing 'Control' are labelled
# 'Control', every other sample is labelled 'FSHD2'.
is_control = df_melted['sample'].str.contains('Control', regex=False)
df_melted['condition'] = is_control.map({True: 'Control', False: 'FSHD2'})

# The time point is the second underscore-separated field of the sample name
# (e.g. 'Day_0_Control_3_R1' -> '0'); it is kept as a string.
df_melted['time_point'] = df_melted['sample'].str.split('_').str[1]

# Display the first few rows to verify
print(df_melted.head())


               region              sample  methylation_level condition  \
0    chr1.15750.16253  Day_0_Control_3_R1          90.340909   Control   
1    chr1.17349.17853  Day_0_Control_3_R1          92.834891   Control   
2    chr1.19016.20044  Day_0_Control_3_R1          79.591837   Control   
3    chr1.29253.29642  Day_0_Control_3_R1           0.211864   Control   
4  chr1.128751.129717  Day_0_Control_3_R1          35.526316   Control   

  time_point  
0          0  
1          0  
2          0  
3          0  
4          0  


In [None]:
# Draw a reproducible 10% random subset so the plots below render quickly
df_sample = df_melted.sample(frac=0.1, random_state=42)

# Histogram (with a KDE overlay) of the methylation levels in the subset
fig_hist, ax_hist = plt.subplots(figsize=(10, 6))
sns.histplot(df_sample['methylation_level'], bins=50, kde=True, ax=ax_hist)
ax_hist.set_title('Distribution of Methylation Levels')
ax_hist.set_xlabel('Methylation Level')
ax_hist.set_ylabel('Frequency')
plt.show()

# Box plot comparing methylation levels between the sample conditions
fig_box, ax_box = plt.subplots(figsize=(10, 6))
sns.boxplot(x='condition', y='methylation_level', data=df_sample, ax=ax_box)
ax_box.set_title('Methylation Levels by Condition')
ax_box.set_xlabel('Condition')
ax_box.set_ylabel('Methylation Level')
plt.show()


In [None]:
# Store the condition labels with a categorical dtype
# (a no-op if the column is already categorical)
df_melted['condition'] = df_melted['condition'].astype(pd.CategoricalDtype())

# Print describe()'s summary statistics for the long-format table
summary_stats = df_melted.describe()
print(summary_stats)
