Some typical NumPy 'random' functions:

In [None]:
import numpy as np

# Tour of commonly used functions in NumPy's legacy `np.random` module.

# Uniform random floats in the half-open interval [0, 1)
print(np.random.rand(3, 2))  # 3 x 2 matrix of uniform random floats

# Random floats from the standard normal distribution (mean = 0, std = 1)
print(np.random.randn(5))  # 5 samples

# Random integers within a range: the lower bound is inclusive, the upper bound is exclusive
print(np.random.randint(1, 100, 10))  # 10 integers from 1 up to (but not including) 100

# Randomly select elements from a given list (with replacement by default)
print(np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9], size=3))  # 3 elements picked from the list

# Random sample from a normal distribution with a chosen mean and standard deviation
print(np.random.normal(loc=0, scale=1, size=10))  # 10 samples; loc is the mean, scale is the standard deviation

# Seed the generator so the random numbers drawn AFTER this call are the same
# on every run. Any integer works; 42 is just a convention.
# seed() returns None, so it is called for its side effect rather than printed.
np.random.seed(42)

AIM #1: Generate a very large dataset
1. Generate a dataset of 1 million random data items between 1 and 100 using only pandas
2. Generate a dataset of 1 million random data items between 1 and 100 using only NumPy
3. Calculate the time it takes for both the above operations. 
    3.1. Import the 'time' module, and use the time() function to calculate current time
    3.2. Which one is faster and why?

In [None]:
import pandas as pd
import numpy as np
import time

# AIM #1: time the creation of 1 million random integers in [1, 100] as a
# pandas Series versus a plain NumPy array.
#
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring short durations, whereas
# time.time() can be coarse and may jump if the system clock is adjusted.

# Step 1: pandas. The random values themselves still come from NumPy here;
# the measured cost is generation plus wrapping the array in a Series.
start_time_pandas = time.perf_counter()
pandas_data = pd.Series(np.random.randint(1, 101, size=1_000_000))
end_time_pandas = time.perf_counter()

# Time taken for pandas
pandas_time = end_time_pandas - start_time_pandas
print(f"Time taken using Pandas: {pandas_time} seconds")


# Step 2: NumPy only. randint's upper bound is exclusive, hence 101.
start_time_numpy = time.perf_counter()
numpy_data = np.random.randint(1, 101, size=1_000_000)
end_time_numpy = time.perf_counter()

# Time taken for NumPy
numpy_time = end_time_numpy - start_time_numpy
print(f"Time taken using NumPy: {numpy_time} seconds")

# Step 3.2: Comparison of the two single-run measurements
if numpy_time < pandas_time:
    print("NumPy is faster than Pandas for this operation.")
else:
    print("Pandas is faster than NumPy for this operation.")


AIM #2: Basic statistics
For the given dataset on sleep health and lifestyle, do the following
1. Using only pandas, load the dataset, calculate mean 'Sleep Duration', 'Systolic Blood Pressure', 'Diastolic Blood Pressure', 'Heart Rate' and 'Daily Steps'.
2. Do the same as in Step 1 using only NumPy
3. Using only pandas, first calculate correlation (across only the numerical variables), and then separate correlation between...
    Sleep duration and Age
    Sleep duration and Heart rate
    Sleep duration and Daily steps
4. Using only NumPy, do the same as Step 3
5. Using pandas only, calculate standard deviation for 'Sleep Duration'. 
6. Using NumPy only, calculate standard deviation for 'Sleep Duration'. 
7. Calculate the time difference between using pandas and NumPy, right from the step of loading the dataset to the final standard deviation step. 
    7.1. Which one is faster and why?

In [None]:
import pandas as pd
import numpy as np
import time

# AIM #2: compute the same basic statistics with pandas and with NumPy, and
# time each pipeline from loading the CSV to the final standard deviation.
#
# time.perf_counter() is used for the measurements because it is a monotonic,
# high-resolution clock; time.time() can be coarse and is affected by system
# clock adjustments.

# ------------------- Pandas Implementation ------------------- #

# Loading the dataset using Pandas
start_time_pandas = time.perf_counter()
df = pd.read_csv('sleep_health.csv')

# Step 1: Calculate mean using Pandas
mean_sleep_duration_pandas = df['Sleep Duration'].mean()
mean_systolic_bp_pandas = df['Systolic Blood Pressure'].mean()
mean_diastolic_bp_pandas = df['Diastolic Blood Pressure'].mean()
mean_heart_rate_pandas = df['Heart Rate'].mean()
mean_daily_steps_pandas = df['Daily Steps'].mean()

# Display mean values
print("Pandas Mean Calculations:")
print(f"Sleep Duration: {mean_sleep_duration_pandas}")
print(f"Systolic Blood Pressure: {mean_systolic_bp_pandas}")
print(f"Diastolic Blood Pressure: {mean_diastolic_bp_pandas}")
print(f"Heart Rate: {mean_heart_rate_pandas}")
print(f"Daily Steps: {mean_daily_steps_pandas}")

# Step 3: Calculate correlation using Pandas.
# Restrict to numeric columns explicitly: on pandas >= 2.0, DataFrame.corr()
# raises when the frame still contains text columns.
corr_matrix = df.select_dtypes(include='number').corr()
print("\nPandas Correlation Matrix:")
print(corr_matrix)

# Correlation between specific pairs
corr_sleep_age = df['Sleep Duration'].corr(df['Age'])
corr_sleep_heart_rate = df['Sleep Duration'].corr(df['Heart Rate'])
corr_sleep_steps = df['Sleep Duration'].corr(df['Daily Steps'])

print("\nPandas Correlations:")
print(f"Correlation between Sleep Duration and Age: {corr_sleep_age}")
print(f"Correlation between Sleep Duration and Heart Rate: {corr_sleep_heart_rate}")
print(f"Correlation between Sleep Duration and Daily Steps: {corr_sleep_steps}")

# Step 5: Calculate standard deviation for 'Sleep Duration' using Pandas
# (Series.std() is the sample standard deviation, ddof=1)
std_sleep_duration_pandas = df['Sleep Duration'].std()
print(f"\nPandas Standard Deviation of Sleep Duration: {std_sleep_duration_pandas}")

end_time_pandas = time.perf_counter()
pandas_time = end_time_pandas - start_time_pandas
print(f"\nTime taken using Pandas: {pandas_time} seconds")


# ------------------- NumPy Implementation ------------------- #

# Loading the dataset using NumPy
start_time_numpy = time.perf_counter()
# names=True takes the field names from the header row, so columns are selected
# by name instead of by a guessed position. genfromtxt replaces spaces in header
# names with underscores ('Sleep Duration' -> 'Sleep_Duration').
# NOTE(review): with the default float dtype, non-numeric cells are expected to
# load as NaN - confirm the selected columns are fully numeric in the CSV.
data = np.genfromtxt('sleep_health.csv', delimiter=',', names=True)

# Extract columns for NumPy operations
sleep_duration = data['Sleep_Duration']
systolic_bp = data['Systolic_Blood_Pressure']
diastolic_bp = data['Diastolic_Blood_Pressure']
heart_rate = data['Heart_Rate']
daily_steps = data['Daily_Steps']
age = data['Age']

# Step 2: Calculate mean using NumPy
mean_sleep_duration_numpy = np.mean(sleep_duration)
mean_systolic_bp_numpy = np.mean(systolic_bp)
mean_diastolic_bp_numpy = np.mean(diastolic_bp)
mean_heart_rate_numpy = np.mean(heart_rate)
mean_daily_steps_numpy = np.mean(daily_steps)

print("\nNumPy Mean Calculations:")
print(f"Sleep Duration: {mean_sleep_duration_numpy}")
print(f"Systolic Blood Pressure: {mean_systolic_bp_numpy}")
print(f"Diastolic Blood Pressure: {mean_diastolic_bp_numpy}")
print(f"Heart Rate: {mean_heart_rate_numpy}")
print(f"Daily Steps: {mean_daily_steps_numpy}")

# Step 4: Calculate correlation using NumPy.
# corrcoef returns a 2 x 2 matrix for two inputs; [0, 1] is the cross term.
corr_sleep_age_numpy = np.corrcoef(sleep_duration, age)[0, 1]
corr_sleep_heart_rate_numpy = np.corrcoef(sleep_duration, heart_rate)[0, 1]
corr_sleep_steps_numpy = np.corrcoef(sleep_duration, daily_steps)[0, 1]

print("\nNumPy Correlations:")
print(f"Correlation between Sleep Duration and Age: {corr_sleep_age_numpy}")
print(f"Correlation between Sleep Duration and Heart Rate: {corr_sleep_heart_rate_numpy}")
print(f"Correlation between Sleep Duration and Daily Steps: {corr_sleep_steps_numpy}")

# Step 6: Calculate standard deviation for 'Sleep Duration' using NumPy.
# ddof=1 gives the sample standard deviation, matching pandas' Series.std();
# np.std's default (ddof=0) is the population figure and would not agree.
std_sleep_duration_numpy = np.std(sleep_duration, ddof=1)
print(f"\nNumPy Standard Deviation of Sleep Duration: {std_sleep_duration_numpy}")

end_time_numpy = time.perf_counter()
numpy_time = end_time_numpy - start_time_numpy
print(f"\nTime taken using NumPy: {numpy_time} seconds")


# ------------------- Time Comparison ------------------- #
if numpy_time < pandas_time:
    print("\nNumPy is faster than Pandas for this operation.")
else:
    print("\nPandas is faster than NumPy for this operation.")




AIM #3: Use suitable plots to visualize the data

1. Using only pandas (and matplotlib/seaborn if necessary) plot the distribution for
    1.1. Age
    1.2. Sleep Duration
    1.3. Quality of Sleep
    1.4. Physical Activity Level
    1.5. Stress Level
    1.6. Heart Rate
2. Using only NumPy, do the same as Step 1. You will need matplotlib for this
3. Using only pandas, use the appropriate plot to
    3.1. See the distribution of 'Sleep Duration' based on 'Quality of Sleep'
    3.2. See the distribution of 'Sleep Duration' based on 'Stress Level'
    3.3. See the distribution of 'Sleep Duration' based on 'Physical Activity Level'
    3.4. See the distribution of 'Sleep Duration' based on 'Occupation'
    3.5. See the distribution of 'Sleep Duration' based on 'BMI'
4. Using only NumPy, do the same as Step 3. You will need matplotlib for this
5. Using only pandas, use a suitable plot to see the relation between
    5.1. Age and Sleep Duration
    5.2. Sleep Duration and Heart Rate
    5.3. Heart Rate and Daily Steps
    5.4. Sleep Duration and Daily Steps
6. Using only NumPy, do the same as Step 5. You will need matplotlib for this 
7. Find the time difference between plotting using only pandas, and plotting using NumPy

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

# -------------------- Step 1: Plot Distribution using Pandas -------------------- #
# Read the CSV into a DataFrame, then draw one histogram (with a KDE overlay)
# per column, in the order listed in the task (1.1 - 1.6).
df = pd.read_csv('sleep_health.csv')

for column in (
    'Age',                      # 1.1
    'Sleep Duration',           # 1.2
    'Quality of Sleep',         # 1.3
    'Physical Activity Level',  # 1.4
    'Stress Level',             # 1.5
    'Heart Rate',               # 1.6
):
    plt.figure(figsize=(10, 6))
    sns.histplot(df[column], kde=True)
    plt.title(f'Distribution of {column} (Pandas)')
    plt.show()


# -------------------- Step 2: Plot Distribution using NumPy -------------------- #
# names=True takes the field names from the CSV header row, so each column is
# selected by name rather than by an assumed position. genfromtxt replaces
# spaces in header names with underscores ('Sleep Duration' -> 'Sleep_Duration').
# NOTE(review): with the default float dtype, non-numeric cells are expected to
# load as NaN - confirm the selected columns are fully numeric in the CSV.
data = np.genfromtxt('sleep_health.csv', delimiter=',', names=True)

# Extract necessary columns
age = data['Age']
sleep_duration = data['Sleep_Duration']
quality_of_sleep = data['Quality_of_Sleep']
physical_activity = data['Physical_Activity_Level']
stress_level = data['Stress_Level']
heart_rate = data['Heart_Rate']

# One 20-bin histogram per variable, in task order (2.1 - 2.6)
for label, values in (
    ('Age', age),
    ('Sleep Duration', sleep_duration),
    ('Quality of Sleep', quality_of_sleep),
    ('Physical Activity Level', physical_activity),
    ('Stress Level', stress_level),
    ('Heart Rate', heart_rate),
):
    plt.figure(figsize=(10, 6))
    plt.hist(values, bins=20, edgecolor='k', alpha=0.7)
    plt.title(f'Distribution of {label} (NumPy)')
    plt.show()


# -------------------- Step 3: Distribution Based on Categorical Data -------------------- #
# Pandas: one box plot of 'Sleep Duration' per grouping column, in task order
# (3.1 - 3.5). Each plot gets its own figure.
for group_column in (
    'Quality of Sleep',         # 3.1
    'Stress Level',             # 3.2
    'Physical Activity Level',  # 3.3
    'Occupation',               # 3.4
    'BMI',                      # 3.5
):
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=group_column, y='Sleep Duration', data=df)
    plt.title(f'Sleep Duration vs {group_column} (Pandas)')
    plt.show()


# -------------------- Step 5 (task numbering): Relationships Between Variables -------------------- #
# Pandas: one scatter plot per (x, y) column pair, in task order (5.1 - 5.4).
for x_column, y_column in (
    ('Age', 'Sleep Duration'),          # 5.1
    ('Sleep Duration', 'Heart Rate'),   # 5.2
    ('Heart Rate', 'Daily Steps'),      # 5.3
    ('Sleep Duration', 'Daily Steps'),  # 5.4
):
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=x_column, y=y_column, data=df)
    plt.title(f'{x_column} vs {y_column} (Pandas)')
    plt.show()


# -------------------- Step 7 (task numbering): Time Difference Between Pandas and NumPy -------------------- #
# Times one representative plot (Age vs Sleep Duration) drawn from the
# DataFrame via seaborn, and the same plot drawn from the NumPy arrays via
# matplotlib.
#
# time.perf_counter() is used because it is a monotonic, high-resolution clock
# meant for measuring short durations; time.time() can be coarse and is
# affected by system clock adjustments.
# NOTE(review): each timed span includes plt.show(); outside an inline notebook
# backend that call may wait for the window to be closed - confirm the backend
# before trusting the numbers.
start_time_pandas = time.perf_counter()

# Pandas-backed plot
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Age', y='Sleep Duration', data=df)
plt.title('Age vs Sleep Duration (Pandas)')
plt.show()

end_time_pandas = time.perf_counter()
pandas_time_taken = end_time_pandas - start_time_pandas

start_time_numpy = time.perf_counter()

# NumPy-backed plot (uses the 'age' and 'sleep_duration' arrays extracted earlier)
plt.figure(figsize=(10, 6))
plt.scatter(age, sleep_duration)
plt.title('Age vs Sleep Duration (NumPy)')
plt.show()

end_time_numpy = time.perf_counter()
numpy_time_taken = end_time_numpy - start_time_numpy

# Print time difference
print(f"\nTime taken using Pandas: {pandas_time_taken} seconds")
print(f"Time taken using NumPy: {numpy_time_taken} seconds")

if pandas_time_taken < numpy_time_taken:
    print("\nPandas is faster for plotting.")
else:
    print("\nNumPy is faster for plotting.")


AIM #4: Other possible plotting

1. Think of other possible plots to show some interesting distribution and relations. Do this using both pandas and NumPy

