In [5]:
import csv
from datetime import datetime, timedelta
import random
import math

def generate_weather_data(num_days=365, filename='weather_data.csv', seed=None):
    """Generate synthetic daily weather readings and write them to a CSV file.

    Each row holds a date (consecutive days starting 2023-01-01), a
    temperature built from a base of 20, a sine-shaped annual cycle of
    amplitude 10 and uniform noise in [-3, 3] (rounded to 2 decimals), and an
    integer humidity drawn uniformly from 40..90.

    Args:
        num_days: Number of consecutive days to generate.
        filename: Path of the CSV file to write; an existing file is
            overwritten.
        seed: Optional seed for reproducible output. When given, a private
            random generator is used so the global ``random`` state is left
            untouched. When None, the global ``random`` module is used.

    Returns:
        None. The data is written to ``filename`` with the header
        ``date,temperature,humidity``.
    """
    # A private generator keeps seeded runs reproducible without side effects
    rng = random if seed is None else random.Random(seed)

    start_date = datetime(2023, 1, 1)
    base_temp = 20  # Base temperature

    data = []
    for i in range(num_days):
        date = start_date + timedelta(days=i)
        # Annual cycle: one full sine period every 365 days
        seasonal_variation = 10 * math.sin(2 * math.pi * i / 365)
        # Daily random variation
        random_variation = rng.uniform(-3, 3)
        temperature = base_temp + seasonal_variation + random_variation

        data.append({
            'date': date.strftime('%Y-%m-%d'),
            'temperature': round(temperature, 2),
            'humidity': rng.randint(40, 90)
        })

    # newline='' lets the csv module control line endings itself
    with open(filename, 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['date', 'temperature', 'humidity'])
        writer.writeheader()
        writer.writerows(data)

# Write the default dataset when this code runs as the main module.
if __name__ == '__main__':
    generate_weather_data()


In [6]:
import csv

def read_csv_data(filename):
    """Read temperature and humidity columns from a CSV file.

    The first row is treated as a header and skipped. For every other row,
    column 1 is parsed as the temperature and column 2 as the humidity. Rows
    that are too short or hold non-numeric values are reported on stdout and
    skipped, so both returned lists always have the same length.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        tuple: ``(temperatures, humidity_values)``, two lists of floats.
        Both are empty when the file has no data rows (or is empty).
    """
    temperatures = []
    humidity_values = []

    # newline='' lets the csv module interpret line endings itself
    with open(filename, newline='') as f:
        reader = csv.reader(f)
        # The None default keeps an empty file from raising StopIteration
        next(reader, None)  # Skip the header row
        for row in reader:
            try:
                temp = float(row[1])
                humidity = float(row[2])
            except (ValueError, IndexError):
                print(f'Skipping invalid row: {row}')
                continue
            # Append only after both values parsed, keeping the lists aligned
            temperatures.append(temp)
            humidity_values.append(humidity)

    return temperatures, humidity_values

def calculate_mean(numbers):
    """Return the arithmetic mean of ``numbers``.

    Args:
        numbers: A sized collection of numeric values.

    Returns:
        The sum of the values divided by their count.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    return sum(numbers) / len(numbers)

def calculate_median(numbers):
    """Return the median of ``numbers`` without modifying the input.

    For an odd count the middle value of the sorted data is returned; for an
    even count the two middle values are averaged.

    Args:
        numbers: A non-empty iterable of numeric values (list, tuple, ...).

    Returns:
        The median value.

    Raises:
        IndexError: If ``numbers`` is empty.
    """
    # Sort a copy so the caller's data keeps its original order
    ordered = sorted(numbers)
    N = len(ordered)
    middle = N // 2
    if N % 2 == 0:
        # Even count: average the two central values
        return (ordered[middle - 1] + ordered[middle]) / 2
    # Odd count: the single central value
    return ordered[middle]

# Small hand-made sample used to sanity-check the helper functions
test_list = [15.5, 17.2, 16.8, 15.9, 18.1]

# Calculate the mean and median of the test list
mean = calculate_mean(test_list)
median = calculate_median(test_list)

# Display the calculated mean and median of the test list
print(f"Test Data Mean: {mean}")
print(f"Test Data Median: {median}")

# Read the readings back from the CSV file written by generate_weather_data()
temperatures, humidity_values = read_csv_data('weather_data.csv')

# Calculate the mean and median of each CSV column and print the results
print(f"Temperature Mean: {calculate_mean(temperatures)}")
print(f"Temperature Median: {calculate_median(temperatures)}")
print(f"Humidity Mean: {calculate_mean(humidity_values)}")
print(f"Humidity Median: {calculate_median(humidity_values)}")

Test Data Mean: 16.7
Test Data Median: 16.8


In [None]:
"""
Correlation Analyzer Class
Based on Chapter 3 of "Doing Math with Python"
"""

import numpy as np
import matplotlib.pyplot as plt
from typing import List, Tuple, Union, Optional

class CorrelationAnalyzer:
    """Analyze the linear (Pearson) correlation between two datasets.

    The coefficient is computed on demand and cached on the instance in
    ``self.correlation``; it is not refreshed if ``x`` or ``y`` are changed
    afterwards.
    """

    def __init__(self, x: List[float], y: List[float]):
        """
        Initialize the CorrelationAnalyzer with two datasets

        Args:
            x: First dataset
            y: Second dataset

        Raises:
            ValueError: If datasets have different lengths
        """
        if len(x) != len(y):
            raise ValueError("Both datasets must have the same length")

        self.x = x
        self.y = y
        # Cached coefficient; stays None until calculate_correlation() runs
        self.correlation = None

    def calculate_correlation(self) -> float:
        """
        Calculate the Pearson correlation coefficient between the datasets
        using the formula from page 76-77 of the book.

        The result is also stored in ``self.correlation``. When the
        coefficient is undefined (empty data, or either dataset has no
        variance) the result is 0.0.

        Returns:
            float: The correlation coefficient
        """
        n = len(self.x)

        # Plain sums and sum of pairwise products
        sum_x = sum(self.x)
        sum_y = sum(self.y)
        sum_prod_x_y = sum(xi * yi for xi, yi in zip(self.x, self.y))

        # Sums of squares
        x_squared_sum = sum(xi ** 2 for xi in self.x)
        y_squared_sum = sum(yi ** 2 for yi in self.y)

        # Pieces of the formula
        numerator = n * sum_prod_x_y - sum_x * sum_y
        denominator_term1 = n * x_squared_sum - sum_x ** 2
        denominator_term2 = n * y_squared_sum - sum_y ** 2
        radicand = denominator_term1 * denominator_term2

        # A zero radicand means division by zero; a negative one can only
        # come from floating-point rounding and would yield a complex root.
        # Either way store the fallback so later methods never see None.
        if radicand <= 0:
            self.correlation = 0.0
        else:
            self.correlation = numerator / radicand ** 0.5
        return self.correlation

    def interpret_correlation(self) -> str:
        """
        Provide an interpretation of the correlation coefficient

        Computes the coefficient first if it has not been calculated yet.

        Returns:
            str: Interpretation of the correlation strength
        """
        if self.correlation is None:
            self.calculate_correlation()

        corr = abs(self.correlation)

        # Anything this close to zero is reported without strength/direction
        if corr < 0.1:
            return f"There is virtually no correlation (r={self.correlation:.4f})"

        if corr > 0.9:
            strength = "very strong"
        elif corr > 0.7:
            strength = "strong"
        elif corr > 0.5:
            strength = "moderate"
        elif corr > 0.3:
            strength = "weak"
        else:
            strength = "very weak or no"

        direction = "positive" if self.correlation > 0 else "negative"

        return f"There is a {strength} {direction} correlation (r={self.correlation:.4f})"

    def create_scatter_plot(self, title: Optional[str] = None,
                            xlabel: Optional[str] = None,
                            ylabel: Optional[str] = None,
                            add_trendline: bool = False) -> None:
        """
        Create a scatter plot of the datasets

        Args:
            title: Optional title for the plot
            xlabel: Optional label for x-axis
            ylabel: Optional label for y-axis
            add_trendline: Whether to add a trend line to the plot
        """
        # The default title and the annotation both format the coefficient,
        # so make sure it exists before any text is built
        if self.correlation is None:
            self.calculate_correlation()

        plt.figure(figsize=(10, 6))
        plt.scatter(self.x, self.y, color='blue', alpha=0.7)

        if title:
            plt.title(title)
        else:
            plt.title(f'Scatter Plot (r={self.correlation:.4f})')

        plt.xlabel(xlabel if xlabel else 'X')
        plt.ylabel(ylabel if ylabel else 'Y')
        plt.grid(True, alpha=0.3)

        if add_trendline:
            # Least-squares straight line through the points
            z = np.polyfit(self.x, self.y, 1)
            p = np.poly1d(z)

            # Draw the line across the observed x range
            x_line = np.linspace(min(self.x), max(self.x), 100)
            plt.plot(x_line, p(x_line), "r--", alpha=0.8)

            # Add formula text
            equation = f'y = {z[0]:.4f}x + {z[1]:.4f}'
            plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction',
                         bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))

        # Add correlation text
        corr_text = f'r = {self.correlation:.4f}'
        plt.annotate(corr_text, xy=(0.05, 0.90), xycoords='axes fraction',
                     bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))

        plt.tight_layout()
        plt.show()
		
# Datasets for analysis (defined first so they exist before they are used)
dataset_1 = [10, 20, 30, 40, 50]
dataset_2 = [12, 21, 35, 45, 55]

# 1. Create a CorrelationAnalyzer instance
# Prefer the standalone module when it is available; otherwise fall back to
# the CorrelationAnalyzer class defined earlier in this cell.
try:
    from correlation_analyzer import CorrelationAnalyzer
except ImportError:
    pass

analyzer = CorrelationAnalyzer(dataset_1, dataset_2)

# 2. Calculate the correlation
correlation = analyzer.calculate_correlation()

# 3. Print the interpretation
print(analyzer.interpret_correlation())