In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Seed NumPy's global generator so every run draws the same sample
np.random.seed(42)

# Draw 100 integer ages in 16..75 (randint's upper bound is exclusive)
ages = np.random.randint(low=16, high=76, size=100)

# Rates fall one unit per year of age, perturbed by Gaussian noise
# with mean 0 and standard deviation 10
noise = np.random.normal(loc=0, scale=10, size=100)
rates = 100 - ages + noise

# Assemble the two columns into a dataframe
df_linear = pd.DataFrame(dict(ages=ages, rates=rates))

# Persist the sample to disk without the index column
df_linear.to_csv('linear_data.csv', index=False)

# Preview the first rows
df_linear.head()

# Read the saved sample back from disk
df_linear = pd.read_csv('linear_data.csv')

# Preview the first rows as loaded
df_linear.head()

# Keep only the rows whose rate is zero or positive
has_non_negative_rate = df_linear['rates'].ge(0)
df_linear = df_linear.loc[has_non_negative_rate]

# Discard any row that has a missing value in any column
df_linear = df_linear.dropna(axis=0, how='any')

# Preview the first rows after cleaning
df_linear.head()

# Per-column quartiles and interquartile range
Q1 = df_linear.quantile(0.25)
Q3 = df_linear.quantile(0.75)
IQR = Q3 - Q1

# Tukey fences: values beyond 1.5 * IQR outside the quartiles count as outliers
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Flag a row when at least one of its columns falls outside the fences,
# then keep only the unflagged rows
below_fence = df_linear < lower_fence
above_fence = df_linear > upper_fence
is_outlier_row = (below_fence | above_fence).any(axis=1)
df_linear_no_outliers = df_linear[~is_outlier_row]

# Preview the first rows of the filtered dataframe
df_linear_no_outliers.head()

# Side-by-side scatter plots of rates against ages:
# df_linear on the left, df_linear_no_outliers on the right
plt.figure(figsize=(10, 5))

scatter_panels = [
    (df_linear, 'Original Data'),
    (df_linear_no_outliers, 'Data Without Outliers'),
]
for scatter_position, (scatter_frame, scatter_title) in enumerate(scatter_panels, start=1):
    plt.subplot(1, 2, scatter_position)
    plt.scatter(scatter_frame['ages'], scatter_frame['rates'])
    plt.title(scatter_title)
    plt.xlabel('Ages')
    plt.ylabel('Rates')

plt.tight_layout()
plt.show()

# Side-by-side box-and-whisker plots of the rates column:
# df_linear on the left, df_linear_no_outliers on the right
plt.figure(figsize=(10, 5))

box_panels = [
    (df_linear, 'Original Data'),
    (df_linear_no_outliers, 'Data Without Outliers'),
]
for box_position, (box_frame, box_title) in enumerate(box_panels, start=1):
    plt.subplot(1, 2, box_position)
    box_frame.boxplot(column=['rates'])
    plt.title(box_title)

plt.tight_layout()
plt.show()

# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Feature matrix (ages, kept two-dimensional) and target vector (rates)
X = df_linear_no_outliers[['ages']]
y = df_linear_no_outliers['rates']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build the linear regression model and fit it on the training portion
regr = LinearRegression()
regr.fit(X_train, y_train)

# Predict rates for the held-out ages
y_pred = regr.predict(X_test)

# Score the predictions against the held-out targets
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

# Report the fitted slope and the test-set metrics
print('Coefficients:', regr.coef_)
print('Mean squared error:', test_mse)
print('Coefficient of determination (R^2 score):', test_r2)

# Overlay the fitted line on the held-out points
plt.scatter(X_test, y_test, color='black')
plt.plot(X_test, y_pred, color='blue', linewidth=3)
plt.title('Linear Regression Model')
plt.xlabel('Ages')
plt.ylabel('Rates')
plt.show()

# Plot residuals (actual minus predicted) against the predicted values
residuals = y_test - y_pred
plt.scatter(y_pred, residuals, color='black')

# Zero-error reference line. The x-axis shows the predictions, so the line
# must span the range of y_pred; spanning the range of y_test (as before)
# makes it extend past, or stop short of, the plotted points.
plt.hlines(y=0, xmin=y_pred.min(), xmax=y_pred.max(), color='blue')
plt.title('Residuals')
plt.xlabel('Predicted')
plt.ylabel('Residuals')
plt.show()



## Model Evaluation

The linear regression model has a coefficient of -0.9446, meaning that each additional year of age lowers the predicted rate by about 0.94. This negative slope aligns with our expectations, since the data were generated with rates that decrease linearly with age.

The mean squared error of the model is 75.25. This is a measure of the average squared difference between the actual and predicted values, with lower values indicating a better fit of the model to the data.

The coefficient of determination (R^2 score) is 0.793. This score is at most 1 (on held-out data it can even be negative, when the model predicts worse than simply using the mean) and represents the proportion of the variance in the dependent variable that is predictable from the independent variable(s). A score of 0.793 indicates that approximately 79.3% of the variability in rates can be explained by age, which suggests a strong relationship.

The scatter plot of the test data and the predicted regression line shows a clear negative linear relationship, which is what we would expect given the negative coefficient of the model.

The residuals plot shows how the prediction errors (residuals) are distributed. Ideally, we would like to see a random distribution of residuals around the horizontal axis. In this case, the residuals appear to be randomly distributed around zero, suggesting that a linear model is appropriate for this data.