### Dados Kaggle
#### https://www.kaggle.com/datasets/abhishek14398/salary-dataset-simple-linear-regression/data

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

## Importando Dados

In [None]:
# Load the dataset into a DataFrame (the first CSV column becomes the index) and preview it
df = pd.read_csv("dados/Salary_dataset.csv", index_col=0)
df.head()

## Explorando Dados

In [None]:
# Summary statistics of the DataFrame columns
df.describe()

In [None]:
# Mean of the target variable (Salary). It acts as the naive baseline
# prediction: the value one would predict for every row without using any feature.
# The column is read straight from df because no `target` name is defined
# earlier in the notebook.
vlr_mean_predict = df['Salary'].mean()
vlr_mean_predict

In [None]:
# Simulated SSE (sum of squared errors) for the mean-only baseline:
# each observed salary is compared with the same predicted value
# (vlr_mean_predict) and the squared differences are added up.
squared_error = (vlr_mean_predict - df['Salary']).pow(2)
SSE = squared_error.sum()
print('Soma dos Quadrados dos Erros (SSE):  %01.f' % SSE)

In [None]:
# Histogram of the squared errors of the mean-only baseline.
# Original author's reading: errors concentrated in the lowest bin would
# suggest a normalized target variable; that does not appear to be the case here.
hist_plot = squared_error.plot.hist()

In [None]:
# Scatter plot of salary against years of experience, to inspect how the
# two variables relate.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='YearsExperience', y='Salary', s=100, color='b')

# Decorate the current axes: grid, axis labels and title.
plt.grid(True)
plt.ylabel('Salary')
plt.xlabel('Years of Experience')
plt.title('Scatter Plot: Years of Experience vs Salary')

plt.show()

In [None]:
# Same scatter, now with a fitted regression line drawn in red to make the
# linear relationship visible.
plt.figure(figsize=(10, 6))
sns.regplot(
    data=df,
    x='YearsExperience',
    y='Salary',
    scatter_kws=dict(s=100),
    line_kws=dict(color='red'),
)

# Decorate the current axes: grid, axis labels and title.
plt.grid(True)
plt.ylabel('Salary')
plt.xlabel('Years of Experience')
plt.title('Regression Plot: Years of Experience vs Salary')

plt.show()

## Modelo Regressão Linear Scikit-Learn

In [None]:
# Linear regression estimator; fit_intercept=True makes it estimate the intercept term too
modelo = LinearRegression(fit_intercept = True)

In [None]:
# Target: the Salary column, kept as a pandas Series.
y = df['Salary']
# Feature matrix: the single YearsExperience column reshaped into a
# 2-D array with one column (one row per observation).
X = df['YearsExperience'].to_numpy().reshape(-1, 1)

In [None]:
# Train/test split: 20% of the rows are held out for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit the regression on the training partition only
modelo.fit(X_train, y_train)

In [None]:
# Fitted parameters, one per line: the coefficient array first, then the intercept.
print(modelo.coef_, modelo.intercept_, sep='\n')

In [None]:
# Predict on both partitions so training and testing errors can be compared.
y_train_pred, y_test_pred = map(modelo.predict, (X_train, X_test))

In [None]:
# Model performance on the training partition: mean squared error and R².
train_mse = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
train_r2 = r2_score(y_true=y_train, y_pred=y_train_pred)

# Same two metrics on the held-out test partition.
test_mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
test_r2 = r2_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
# Report the four metrics, one per line.
print(
    f"Training MSE: {train_mse}",
    f"Testing MSE: {test_mse}",
    f"Training R²: {train_r2}",
    f"Testing R²: {test_r2}",
    sep="\n",
)

In [None]:
# Actual vs. predicted salaries for both partitions, with the y = x line
# (perfect prediction) drawn as a visual reference.
train_results = pd.DataFrame({'Actual': y_train, 'Predicted': y_train_pred, 'Dataset': 'Training'})
test_results = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred, 'Dataset': 'Testing'})
combined_results = pd.concat([train_results, test_results])

# Bounds of the reference line come from every plotted value (actual and
# predicted, training and testing), so no point can fall beyond the line.
# The previous bounds left out the test-set predictions.
plotted_values = combined_results[['Actual', 'Predicted']].to_numpy()
diagonal = [plotted_values.min() - 1, plotted_values.max() + 1]

plt.figure(figsize=(10, 6))
sns.scatterplot(x='Actual', y='Predicted', hue='Dataset', data=combined_results, s=100, alpha=0.8)
plt.plot(diagonal, diagonal, 'r--')

plt.xlabel('Actual Salary')
plt.ylabel('Predicted Salary')
plt.title('Actual vs Predicted Salary (Training & Testing)')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# 5-fold cross-validation over the full dataset, using the estimator created
# above (`modelo`; there is no `model` name in this notebook).
# scikit-learn scorers follow a "higher is better" convention, so
# 'neg_mean_squared_error' returns negated MSE values; the sign is flipped
# back when reporting.
cv_scores = cross_val_score(modelo, X, y, cv=5, scoring='neg_mean_squared_error')

print("Cross-validation MSE scores:", -cv_scores)
print("Mean cross-validation MSE:", -cv_scores.mean())

In [None]:
# cv_scores was computed with scoring='neg_mean_squared_error', i.e. it holds
# negated MSE values. Flip the sign so the chart and the legend show the
# actual (positive) MSE that the title and axis label announce.
mse_per_fold = -cv_scores
# Fold numbers start at 1 and follow however many folds were evaluated.
folds = np.arange(1, len(mse_per_fold) + 1)

plt.figure(figsize=(8, 6))
plt.plot(folds, mse_per_fold, marker='o', color='b', label='MSE por Fold')

# Horizontal dashed line marking the mean MSE across folds.
plt.axhline(y=mse_per_fold.mean(), color='r', linestyle='--', label=f'Média MSE: {mse_per_fold.mean():.2f}')
plt.title('Cross-Validation: MSE por Fold')
plt.xlabel('Fold')
plt.ylabel('Erro Quadrático Médio (MSE)')
plt.legend()
plt.grid(True)

plt.show()