## Project 7
# Korey Sansone
## Project Repository
### https://github.com/korey-byte/datafun-07-ml




## Import Libraries

In [30]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Fetch seaborn's 'iris' sample dataset as a DataFrame
df = sns.load_dataset('iris')

# Preview the leading rows as plain text
iris_preview = df.head()
print(iris_preview)

## Part 1 - Chart a Straight Line

### Celsius vs Fahrenheit Data

This section shows the relationship between Celsius and Fahrenheit using the equation:

$$
F = \frac{9}{5}C + 32
$$

Where:
- $m = \frac{9}{5}$ is the slope (rise over run).
- $b = 32$ is the y-intercept (the point where the line crosses the y-axis).

In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt

# Celsius sample points: -30 through 100 inclusive, spaced 10 degrees apart
celsius = pd.Series(range(-30, 101, 10))

# Apply F = (9/5)*C + 32 to every sample at once
fahrenheit = (9/5) * celsius + 32

# Pair the two scales column-wise so the plot can address them by name
temp_df = pd.DataFrame({'Celsius': celsius, 'Fahrenheit': fahrenheit})

# Draw the conversion line on an explicit 8x5 inch figure/axes pair
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(temp_df['Celsius'], temp_df['Fahrenheit'], marker='o')

# Title, axis labels and grid
ax.set_title('Celsius vs Fahrenheit')
ax.set_xlabel('Celsius')
ax.set_ylabel('Fahrenheit')
ax.grid(True)

# Render the figure
plt.show()

## Part 2 - Prediction
This project involves building a linear regression model to predict the average high temperature in NYC for January using historical data.


In [33]:
# Add imports here
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Show floating-point values with two digits after the decimal point
pd.options.display.precision = 2

### Section 1 - Data Acquisition

In [34]:
# Load the NYC January average-high data from a CSV file into a DataFrame.
# Prefer the copy inside the project folder so the notebook also runs on other
# machines (assumes the notebook runs from the repository root — TODO confirm);
# fall back to the original absolute location when that file is not found.
ch10_csv_relative = Path('IntroToPython/examples/ch10/ave_hi_nyc2_jan_1895-2018.csv')
ch10_csv_absolute = Path('/Users/koreysansone/Library/CloudStorage/OneDrive-nwmissouri.edu/Python Files/datafun-07-ml/IntroToPython/examples/ch10/ave_hi_nyc2_jan_1895-2018.csv')
nyc_df = pd.read_csv(ch10_csv_relative if ch10_csv_relative.exists() else ch10_csv_absolute)

### Section 2 - Data Inspection

In [None]:
# View the first and last few rows of the dataset.
# A notebook cell only displays its final expression, so calling head() and
# then tail() on separate lines would show the tail alone; stacking the two
# previews into one frame makes both visible.
pd.concat([nyc_df.head(), nyc_df.tail()])

In [None]:
# Check the existing columns to see how many there are and what they're named
print(nyc_df.columns)

In [37]:
# Keep only 'Date' and 'Value' (dropping 'Anomaly', which the linear regression
# does not use) and rename them to 'Year' and 'Average_High_Temp'.
# rename() returns a new frame, so the conversion below does not assign into a
# slice of the original frame (which can trigger SettingWithCopyWarning).
nyc_df = nyc_df[['Date', 'Value']].rename(
    columns={'Date': 'Year', 'Value': 'Average_High_Temp'}
)

# Convert 'Year' to datetime format
nyc_df['Year'] = pd.to_datetime(nyc_df['Year'], format='%Y')

# Verify the DataFrame after changes
nyc_df.head()

### Section 3 - Data Cleaning

In [38]:
# Apply the final column labels positionally, then make sure 'Year' holds datetimes
cleaned_labels = ['Year', 'Average_High_Temp']
nyc_df.columns = cleaned_labels
nyc_df['Year'] = pd.to_datetime(nyc_df['Year'], format='%Y')

### Section 4 - Descriptive Statistics

In [None]:
# Summarise the cleaned frame with pandas' describe() and show the result
nyc_summary = nyc_df.describe()
nyc_summary

### Section 5 - Build the Model

In [None]:
# Fit a straight line with scipy.stats.linregress: the calendar year is the
# predictor and the average high temperature is the response
years = nyc_df['Year'].dt.year
avg_highs = nyc_df['Average_High_Temp']
slope, intercept, r_value, p_value, std_err = stats.linregress(years, avg_highs)

# Report the fitted parameters and the squared correlation coefficient
print(f"Slope: {slope:.2f}, Intercept: {intercept:.2f}, R-squared: {r_value**2:.2f}")

### Section 6 - Predict

In [None]:
# Evaluate the fitted line at 2024 to estimate that January's average high
year = 2024
predicted_temp_2024 = intercept + slope * year
print(f"Predicted Average High Temperature for January 2024: {predicted_temp_2024:.2f}")

### Section 7 - Visualizations

In [None]:
# Scatter of the yearly values with seaborn's regression line drawn in red,
# placed on an explicit 10x6 inch figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    x=nyc_df['Year'].dt.year,
    y=nyc_df['Average_High_Temp'],
    line_kws={"color": "red"},
    ax=ax,
)

# Title, axis labels and grid
ax.set_title('NYC January Average High Temperature Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Average High Temperature (°F)')
ax.grid(True)
plt.show()

## Part 3 - Prediction

In [None]:
pip install scikit-learn

In [None]:
pip show scikit-learn

In [None]:
pip install scikit-learn pandas

In [None]:
pip show scikit-learn

In [None]:
pip show pandas

In [None]:
# Confirm which scikit-learn release the kernel imports
import sklearn
sklearn_version = sklearn.__version__
print(sklearn_version)

In [49]:
# Add imports for data handling, model building, and visualizations
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import seaborn as sns
import matplotlib.pyplot as plt

In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import seaborn as sns
import matplotlib.pyplot as plt

### Section 1 - Build the Model

In [None]:
# List the column labels of df.
# NOTE(review): when the notebook is run top to bottom, df at this point is
# still the Iris frame loaded near the top; the NYC file is read into df in
# the next cell.
column_labels = df.columns
print("Columns in the DataFrame:", column_labels)

In [None]:
# Assuming you have already loaded the DataFrame
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Load the data into df.
# Prefer the copy inside the project folder so the notebook also runs on other
# machines (assumes the notebook runs from the repository root — TODO confirm);
# fall back to the original absolute location when that file is not found.
ch15_csv_relative = Path('IntroToPython/examples/ch15/ave_hi_nyc_jan_1895-2018.csv')
ch15_csv_absolute = Path('/Users/koreysansone/Library/CloudStorage/OneDrive-nwmissouri.edu/Python Files/datafun-07-ml/IntroToPython/examples/ch15/ave_hi_nyc_jan_1895-2018.csv')
df = pd.read_csv(ch15_csv_relative if ch15_csv_relative.exists() else ch15_csv_absolute)

# Check and clean the column names
print("Original columns in the DataFrame:", df.columns)
df.columns = df.columns.str.strip()  # Clean whitespace
print("Cleaned columns:", df.columns)

# The later cells of this notebook read 'Date' and 'Value' from this same
# file, so a check for 'Year'/'Average_High_Temp' alone would always fall
# through to the error branch and never build the model. Derive 'Year' from
# the first four characters of 'Date' when it is missing, and use 'Value' as
# the target unless an 'Average_High_Temp' column is already present.
if 'Year' not in df.columns and 'Date' in df.columns:
    df['Year'] = df['Date'].astype(str).str[:4].astype(int)
target_column = 'Average_High_Temp' if 'Average_High_Temp' in df.columns else 'Value'

# Check for required columns
if 'Year' in df.columns and target_column in df.columns:
    # Define X and y for the regression
    X = df[['Year']]
    y = df[target_column]

    # Split the data into training and testing sets (80% training, 20% testing);
    # random_state is fixed so the split is the same on every run
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Check the shape of each set
    print("Training set shape:", X_train.shape)
    print("Testing set shape:", X_test.shape)

    # Create the Linear Regression model and fit it on the training data
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Get the coefficient of the single 'Year' feature and the intercept
    m = model.coef_[0]
    b = model.intercept_
    print(f"Slope (m): {m}")
    print(f"Intercept (b): {b}")
else:
    print("Error: Required columns are not present in the DataFrame.")
    print("Available columns:", df.columns)

In [None]:
# Preview the leading rows of df as plain text
df_preview = df.head()
print(df_preview)

### Section 2 - Test the Model

In [54]:
# Load the NYC January average-high data into df.
# Prefer the copy inside the project folder so the notebook also runs on other
# machines (assumes the notebook runs from the repository root — TODO confirm);
# fall back to the original absolute location when that file is not found.
ch15_csv_relative = Path('IntroToPython/examples/ch15/ave_hi_nyc_jan_1895-2018.csv')
ch15_csv_absolute = Path('/Users/koreysansone/Library/CloudStorage/OneDrive-nwmissouri.edu/Python Files/datafun-07-ml/IntroToPython/examples/ch15/ave_hi_nyc_jan_1895-2018.csv')
df = pd.read_csv(ch15_csv_relative if ch15_csv_relative.exists() else ch15_csv_absolute)

In [None]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Step 2: Load the data into df.
# Prefer the copy inside the project folder so the notebook also runs on other
# machines (assumes the notebook runs from the repository root — TODO confirm);
# fall back to the original absolute location when that file is not found.
ch15_csv_relative = Path('IntroToPython/examples/ch15/ave_hi_nyc_jan_1895-2018.csv')
ch15_csv_absolute = Path('/Users/koreysansone/Library/CloudStorage/OneDrive-nwmissouri.edu/Python Files/datafun-07-ml/IntroToPython/examples/ch15/ave_hi_nyc_jan_1895-2018.csv')
df = pd.read_csv(ch15_csv_relative if ch15_csv_relative.exists() else ch15_csv_absolute)

# Step 3: Clean column names by stripping whitespace
df.columns = df.columns.str.strip()

# Step 4: Print the first few rows as a quick check of the loaded columns
print(df.head())

# Step 5: Extract the year from the first four characters of the 'Date' column
df['Year'] = df['Date'].astype(str).str[:4].astype(int)

# Step 6: Define X and y for the regression ('Year' predicts 'Value')
X = df[['Year']]
y = df['Value']

# Step 7: Split the data into training and testing sets (20% held out for
# testing; random_state is fixed so the split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 8: Create and fit the Linear Regression model on the training rows
model = LinearRegression()
model.fit(X_train, y_train)

# Step 9: Test the model by predicting on the testing set
y_pred = model.predict(X_test)

# Step 10: Evaluate the predictions with Mean Squared Error (MSE) and R^2 score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Step 11: Print the evaluation metrics
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

In [None]:
# Bring in the scoring helpers used below
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the held-out rows and score those predictions
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report both metrics, one per line
print(f"Mean Squared Error: {mse}", f"R^2 Score: {r2}", sep="\n")

### Section 3 - Predict

In [None]:
# Single-row feature frame for 2024 using the 'Year' column name, then ask the
# fitted model for its estimate of that January's average high
year_2024 = pd.DataFrame({'Year': [2024]})
predicted_temp_2024 = model.predict(year_2024)

print(f"Predicted average high temperature in January 2024: {predicted_temp_2024[0]:.2f}°F")

### Section 4 - Visualizations

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Observed values and the model's fitted line on one explicit 10x6 inch axes
fig, ax = plt.subplots(figsize=(10, 6))

# Blue points: the values read from the file
sns.scatterplot(x=df['Year'], y=df['Value'], label='Original Data', color='blue', ax=ax)

# Red line: the model's prediction for every year in the frame
fitted_values = model.predict(df[['Year']])
sns.lineplot(x=df['Year'], y=fitted_values, label='Best Fit Line', color='red', ax=ax)

# Title, axis labels, grid and legend
ax.set_title('Average High Temperature in January (NYC)')
ax.set_xlabel('Year')
ax.set_ylabel('Average High Temp (°F)')
ax.grid(True)
ax.legend()

# Render the figure
plt.show()

### Section 5 - Error Handling

If you encounter any issues or warnings while running the notebook, be sure to:

1. Read the error message carefully: Most warnings and errors will provide insight into what’s wrong.
2. Check data integrity: Ensure the data is clean and has the correct shape before feeding it into the model.
3. Try rerunning cells: Restarting the kernel and rerunning the notebook cells in order from the top often resolves order-related issues.
