# Module 03: EDA

In [None]:
# packages
import numpy as np 
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots
from sklearn.model_selection import train_test_split 
from ISLP import load_data

# Seed for reproducibility; passed as random_state to train_test_split further down
seed = 2323

### We'll use the _Hitters_ data from ISLP for this activity. The metadata for _Hitters_ can be found [here](https://intro-stat-learning.github.io/ISLP/datasets/Hitters.html).

In [None]:
# Load the Hitters dataset through ISLP's load_data helper
Hitters = load_data('Hitters')

### Determine the number of rows and columns in the dataset by returning its "shape" attribute

In [None]:


# The shape attribute reports the dataset's dimensions (rows, columns)
Hitters.shape

### Determine whether each feature is numeric or categorical by returning the "dtype" attribute for each column

In [None]:
# Print every column name next to its dtype so numeric and categorical
# features can be told apart at a glance.
for col in Hitters.columns:
    print(f"{col}: {Hitters[col].dtype}")

### Before doing any other analyses, let's create training and test sets.

In [None]:
# Hold out 40% of the rows as the test set; rows are shuffled with a fixed
# seed so the same split is produced on every run.
split_options = dict(test_size=0.40, shuffle=True, random_state=seed)
Train, Test = train_test_split(Hitters, **split_options)

### Based on the metadata, what is the difference between the 6 columns starting with 'C' and the 6 related columns that don't?

#fillin Type your answer here

The 6 columns starting with 'C' are total career numbers, while the related columns without 'C' are statistics for the most recent season.

### On the training set, create pairwise scatterplots for each of these 6 columns with the 'Salary' variable.

In [None]:

# Columns (all prefixed with 'C') that will each be plotted against Salary
career_columns = ['CAtBat', 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks']
subset = Train[career_columns]

# Prepare an empty 2 x 3 grid of axes, one panel per column
fig, axes = subplots(2, 3, figsize=(15, 10))

# Copy the helper function
def range_to_grid(i, Ncol):
    """Convert a flat, row-major panel index into (row, column) grid coordinates.

    Parameters
    ----------
    i : int
        Zero-based position of the panel when counting left-to-right,
        top-to-bottom.
    Ncol : int
        Number of columns in the grid.

    Returns
    -------
    tuple of (int, int)
        ``(row, column)`` such that ``row * Ncol + column == i``.

    Notes
    -----
    Uses ``divmod`` instead of materialising two lookup lists of length
    ``Ncol**2`` on every call. As a consequence the index is no longer capped
    at ``Ncol**2 - 1``, so grids with more rows than columns work too.
    Negative ``i`` follows Python floor-division rules (negative row offset).
    """
    return divmod(i, Ncol)

# Plot the variables: one scatter panel per column of `subset`, with Salary on the y-axis
n_grid_cols = axes.shape[1]  # read the column count from the grid instead of hard-coding it
for j, col_name in enumerate(subset.columns):
    # Resolve the panel position once per iteration rather than on every axes access
    grid_row, grid_col = range_to_grid(j, n_grid_cols)
    ax = axes[grid_row, grid_col]
    ax.plot(subset[col_name], Train['Salary'], 'o')
    ax.set_xlabel(col_name)
    ax.set_ylabel('Salary')

plt.tight_layout()
plt.show()

### Use the "describe" method to determine the mean, standard deviation, and five-number summary of all numeric variables in the training subset of _Hitters_.

In [None]:
# Summary statistics for the numeric columns of the training set
Train.describe()

### It looks like the mean and median of 'AtBat' are nearly equal. This _might_ suggest that this variable is normally distributed. Create a histogram of 'AtBat' to check this hypothesis.

In [None]:
# Histogram of AtBat on the training set, to check visually whether the
# distribution looks roughly normal.
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
hist_ax.hist(Train['AtBat'], bins=30, edgecolor='black', alpha=0.7)
hist_ax.set_xlabel('AtBat')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of AtBat in Training Set')
hist_ax.grid(True, alpha=0.3)
plt.show()

### Let's standardize the AtBat feature (i.e., normalize by z-scores). We'll create a new column in the training data called 'AtBat_st' to represent this.

In [None]:
# z-score: centre AtBat on its training-set mean and scale by its
# training-set standard deviation (as returned by pandas' .std()).
atbat = Train['AtBat']
atbat_mean = atbat.mean()
atbat_std = atbat.std()
Train['AtBat_st'] = (atbat - atbat_mean) / atbat_std

### How many rows have an 'AtBat' value within the first standard deviation?

Hint: the built-in `len` function returns the number of rows of a DataFrame.

In [None]:
# |z| <= 1 is equivalent to -1 <= z <= 1 (both ends inclusive)
is_within_one_std = Train['AtBat_st'].abs() <= 1
within_one_std = len(Train[is_within_one_std])
print(f"Rows within one standard deviation: {within_one_std}")

### Going back to the results of the 'describe' method, how can you tell that the 'Salary' variable has missing values?

#fillin Type your answer here.

The 'describe' method shows the count for Salary as 131, which is less than the total number of rows in the training set. This discrepancy in the count indicates missing values.

### Describe a situation where a variable could have missing values but this would not be reflected in the results of the 'describe' method.

#fillin Type your answer here.

If missing values are coded as a placeholder value like 0, -99, 999, or "NA" instead of proper NaN/None values, the 'describe' method would include these placeholder values in its count and calculations. The method would show the full count of rows but the statistics would be skewed by these coded missing values, hiding the fact that they represent missing data.

### On the training data, create separate boxplots of the 'AtBat' variable for when 'Salary' is populated or missing.

In [None]:
# Side-by-side boxplots of AtBat, split by whether Salary is populated
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

salary_present = Train['Salary'].notna()
panels = [
    (salary_present, 'AtBat Distribution - Salary Present'),   # left panel
    (~salary_present, 'AtBat Distribution - Salary Missing'),  # right panel
]

for ax, (row_mask, panel_title) in zip(axes, panels):
    Train[row_mask].boxplot(column='AtBat', ax=ax)
    ax.set_title(panel_title)
    ax.set_ylabel('AtBat')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### Create a correlation matrix for all numeric features in the training set

In [None]:
# Pairwise correlations between the numeric columns of the training set.
# NOTE: Train already carries the derived 'AtBat_st' column at this point; it is
# numeric, so it is included here alongside the original features.
numeric_cols = Train.select_dtypes(include=[np.number]).columns
correlation_matrix = Train[numeric_cols].corr()

# Display the correlation matrix. A bare expression is only rendered when it is
# the last statement of a cell, so display() (available by default in
# IPython/Jupyter kernels) is called explicitly here.
display(correlation_matrix)

# Optional: Visualize the correlation matrix with a heatmap.
# vmin/vmax pin the colour scale to the full correlation range so that the
# neutral colour of the diverging 'coolwarm' map corresponds to zero correlation.
plt.figure(figsize=(12, 10))
plt.imshow(correlation_matrix, cmap='coolwarm', aspect='auto', vmin=-1, vmax=1)
plt.colorbar()
plt.xticks(range(len(correlation_matrix.columns)), correlation_matrix.columns, rotation=90)
plt.yticks(range(len(correlation_matrix.columns)), correlation_matrix.columns)
plt.title('Correlation Matrix of Numeric Features (Training Set)')
plt.tight_layout()
plt.show()

### Propose two different ways of imputing the missing values of Salary while taking advantage of the information given in the boxplots or the correlation matrix.

#fillin Type your answer here.

  

1. By AtBat groups: Boxplots show missing Salary players have fewer AtBats. Group players by AtBat quartiles, fill missing Salary with median Salary from their quartile.  

2. By regression: Correlation matrix shows Salary links to CRBI (0.61), CRuns (0.60), and CHits (0.58). Predict missing Salary using a linear model with these features.

### For our last exercise, we'll explore Hits and Walks relative to AtBat totals. 
- Use the sum function to calculate the totals of each of these three variables for the 1986 season (on the training set). 
- Create a pie chart which shows total hits, total walks, and remaining total (neither) as percents of the At Bats total (on the training set). 

In [None]:
# Training-set totals for the three variables
season_totals = Train[['Hits', 'Walks', 'AtBat']].sum()
TotHits = season_totals['Hits']
TotWalks = season_totals['Walks']
TotAtBat = season_totals['AtBat']

# Slices: hits, walks, and whatever remains of the AtBat total
Labels = ['Hits', 'Walks', 'Neither']
TotNeither = TotAtBat - TotHits - TotWalks
Totals = [TotHits, TotWalks, TotNeither]

# Create a pie chart
slice_colors = ['#ff9999', '#66b3ff', '#99ff99']
pie_fig, pie_ax = plt.subplots(figsize=(8, 8))
pie_ax.pie(Totals, labels=Labels, autopct='%1.1f%%', startangle=90,
           colors=slice_colors)
pie_ax.set_title('Distribution of AtBat Outcomes (1986 Training Set)', fontsize=14)
pie_ax.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()

In [None]:
# pie chart


### The previous two cells gave us totals across all players. For each player in the training set, calculate the Hits as a percent of AtBat and store it in a new variable called 'AVG'

In [None]:
# Per-player ratio of Hits to AtBat, kept as a proportion (not multiplied by 100)
Train['AVG'] = Train['Hits'] / Train['AtBat']

### Using 0.25 and 0.31 as the split points, create a new variable with three bins: high, medium, and low. 

In [None]:
# Split points for the three AVG bins
LOW_CUTOFF = 0.25
HIGH_CUTOFF = 0.31

# Start with every player in the middle bin, then overwrite both tails.
# Players exactly on a split point (and any player whose AVG is NaN) stay 'medium'.
# .loc with a boolean mask assigns directly on Train; the chained form
# Train['AVG_bin'][mask] = ... may write to a temporary copy instead.
Train['AVG_bin'] = 'medium'
Train.loc[Train['AVG'] < LOW_CUTOFF, 'AVG_bin'] = 'low'
Train.loc[Train['AVG'] > HIGH_CUTOFF, 'AVG_bin'] = 'high'

### Create a bar chart that displays the number of players in each of the low, medium, and high categories (for the training data).

In [None]:
# Number of training-set players in each AVG_bin category
Train['AVG_bin'].value_counts()

Notice that the order of the bars will be medium, low, high. That's counterintuitive. We can reorder these quickly. 

In [None]:
# Desired left-to-right order of the categories
indexMap = ['low', 'medium', 'high']

# Count once, then reindex into the desired order. fill_value=0 keeps a
# category that has no players as a zero instead of raising KeyError.
avg_bin_counts = Train['AVG_bin'].value_counts()
reordered_list = avg_bin_counts.reindex(indexMap, fill_value=0).tolist()

In [None]:

# Count the players per category once and reuse the result below
avg_bin_counts = Train['AVG_bin'].value_counts()
print(avg_bin_counts)

# Put the categories in their natural order; fill_value=0 means an empty
# category is drawn as a zero-height bar instead of raising KeyError.
indexMap = ['low', 'medium', 'high']
reordered_list = avg_bin_counts.reindex(indexMap, fill_value=0).tolist()

# Bar chart of the number of players in each AVG category
bar_fig, bar_ax = plt.subplots(figsize=(8, 6))
bar_ax.bar(indexMap, reordered_list, color=['#ff7f7f', '#ffff7f', '#7fff7f'], edgecolor='black')
bar_ax.set_title("1986 AVG (Training Set)", fontsize=14)
bar_ax.set_ylabel("Number of Players")
bar_ax.set_xlabel("AVG Category")

plt.show()

### Did we use the depth method or width method for creating these bins? Explain.

#fillin Type your answer here.

We used the Width method. We used fixed cutoff values (0.25 and 0.31) to define bins, not quantiles. Depth method would create bins with equal numbers of players using percentiles.