# Chapter 2 Exercises

## Set Up

In [None]:
# Base URLs for files in the course repository. Both end with "/" because file
# names are appended with plain string concatenation (e.g. data_root + "College.csv").
data_root = "https://raw.githubusercontent.com/kmerkl22/k-s-ISL/main/Data/"
# Same raw.githubusercontent.com form as data_root: a ".../raw/tree/main/..." path
# mixes GitHub's "raw" and "tree" URL styles and does not point at a file.
image_root = "https://raw.githubusercontent.com/kmerkl22/k-s-ISL/main/Images/"

In [None]:
# import numpy
import numpy as np

# import pyplot, subplots, images
from matplotlib import pyplot as plt
from matplotlib.pyplot import subplots
from matplotlib import image as img

#import pandas
import pandas as pd

In [None]:
# NumPy random Generator created with a fixed seed (3)
rng = np.random.default_rng(seed=3)

## Exercise 8

We are working with the College data set. First we open it in Google Sheets, where it looks something like this:

In [None]:
# Download the screenshot of the College data in Google Sheets and show it.
# `import PIL` / `import urllib` on their own do not load the `PIL.Image` and
# `urllib.request` submodules, so import exactly the pieces that are used.
import io
import urllib.request

from PIL import Image

url = image_root + "CollegesinGS.png"
# Read the whole response inside a `with` block so the HTTP connection is always
# closed, and hand Pillow an in-memory, seekable buffer.
with urllib.request.urlopen(url) as response:
    image = Image.open(io.BytesIO(response.read()))

# Draw on an explicit Axes rather than pyplot's implicit "current" one
fig, ax = subplots()
ax.imshow(image)
ax.set_title("Colleges in Google Sheets")
ax.axis('off')
plt.show()

a) We now want to read in our data and name the loaded data "college". We will use pandas (pd) to load the data in.

In [None]:
# Build the CSV location once (it is reused when re-reading the file below),
# then load it into a DataFrame.
college_path = f"{data_root}College.csv"
college = pd.read_csv(college_path)

b) Next we look at the data

In [None]:
# Display the loaded DataFrame
college

In [None]:
# Show only the first 25 rows
college.head(25)

We don't really like this unnamed column, so we can try reading the data into pandas a few different ways:

In [None]:
# Re-read the same CSV, this time using its first column (position 0) as the row index
college2 = pd.read_csv(college_path, index_col=0)
college2


For college2 we simply told pandas to use the first column (0) as the index

In [None]:
# Alternative to index_col: rename the 'Unnamed: 0' column to 'College'
# (columns= is the keyword form of passing the mapping with axis=1),
# then make that column the row index.
college3 = (
    college
    .rename(columns={'Unnamed: 0': 'College'})
    .set_index('College')
)
college3

For college3 we rename the first column (the college names) to "College" and also set it as the index. This is closer to what we want, so let's save it as our 'college' dataframe.

In [None]:
# Rebind `college` to the renamed/indexed frame. This is a plain assignment, not a
# copy, so `college` and `college3` now refer to the same DataFrame object.
college = college3

c) The describe() method produces a numerical summary of each column in a data frame.

In [None]:
# Numerical summary of the data frame's columns
college.describe()

d)  We want to use the pd.plotting.scatter_matrix() function to produce a
scatterplot matrix of the columns [Top10perc, Apps, Enroll].
First let's look at a list of columns in college:

In [None]:
# List the column labels so the ones to plot can be picked by name
college.columns

Selecting the 3 desired columns we make a scatterplot:

In [None]:
# Scatterplot matrix of three columns selected by name; the trailing ';' keeps the
# cell from echoing the call's return value
pd.plotting.scatter_matrix(college[['Top10perc', 'Apps', 'Enroll']]);

In [None]:
# Peek at the column at integer position 3 (positions count from 0, index excluded)
college.iloc[:, 3]

Let's also try selecting them by column number

In [None]:
# Same kind of plot, but selecting the columns by integer position instead of by name
pd.plotting.scatter_matrix(college.iloc[:,[1,3,4]]);

e) Use the boxplot() method of college to produce side-by-side
boxplots of Outstate versus Private.

In [None]:
# Side-by-side boxplots of Outstate, one box per value of Private, drawn on an
# explicitly created 8x8 figure
fig, ax = subplots(figsize=(8, 8))
college.boxplot(column='Outstate', by='Private', ax=ax)

f) Create a new qualitative variable, called Elite, by binning the
Top10perc variable into two groups based on whether or not the
proportion of students coming from the top 10% of their high
school classes exceeds 50%

In [None]:
# Bin Top10perc (a percentage) into 'No' for values up to 50 and 'Yes' above 50.
# pd.cut intervals are open on the left by default, so without include_lowest=True
# a value of exactly 0 would fall outside the first bin and become NaN, not 'No'.
college['Elite'] = pd.cut(college['Top10perc'],
                          bins=[0, 50, 100],
                          labels=['No', 'Yes'],
                          include_lowest=True)


Count how many colleges fall into each Elite category:

In [None]:
# Number of rows in each Elite category
college['Elite'].value_counts()

In [None]:
# Inspect the new Elite column itself
college['Elite']

Make a boxplot of Elite and Outstate

In [None]:
# Side-by-side boxplots of Outstate, one box per Elite category, on an 8x8 figure
fig, ax = subplots(figsize=(8, 8))
college.boxplot(column='Outstate', by='Elite', ax=ax)

g) Use the plot.hist() method of college to produce some histograms with differing numbers of bins for a few of the quantitative variables. The command plt.subplots(2, 2) may be useful: it will divide the plot window into four regions so that four
plots can be made simultaneously. By changing the arguments
you can divide the screen up in other combinations.

In [None]:
# First attempt: a single hist() call on the whole DataFrame, with default settings
college.plot.hist()

Plot each using histogram bins and 2x2 subplot

In [None]:
# Histograms of four quantitative variables, each with a different number of bins,
# laid out on a 2x2 grid of explicitly created Axes.
hist_bins = {'Accept': 4, 'Apps': 6, 'Grad.Rate': 7, 'Room.Board': 5}

fig, axes = plt.subplots(2, 2)
for ax, (column, bins) in zip(axes.flat, hist_bins.items()):
    college[column].plot.hist(bins=bins, ax=ax)
    # Title each panel with its variable and bin count; otherwise the four
    # histograms cannot be told apart.
    ax.set_title(f"{column} ({bins} bins)")

fig.subplots_adjust(hspace=1)  # Add space between plots


h) Continue to explore the data.

## Exercise 9

Read in the data

In [None]:
# Load Auto.csv from the data folder and display it
Auto = pd.read_csv(data_root + "Auto.csv")
Auto

Look at mpg and displacement data

In [None]:
# Inspect the mpg column
Auto['mpg']

In [None]:
# Range of mpg as a (max, min) tuple
np.max(Auto['mpg']), np.min(Auto['mpg'])

In [None]:
# Range of displacement as a (max, min) tuple
np.max(Auto['displacement']), np.min(Auto['displacement'])

In [None]:
# Mean and standard deviation of mpg as a (mean, std) tuple
np.mean(Auto['mpg']), np.std(Auto['mpg'])

Read in auto.data file

In [None]:
# Re-load the data from the whitespace-delimited Auto.data file, treating '?'
# entries as missing values. sep=r'\s+' replaces delim_whitespace=True, which is
# deprecated in recent pandas releases.
Auto = pd.read_csv(data_root + "Auto.data",
                   na_values=['?'],
                   sep=r'\s+')


In [None]:

# Sum of the horsepower column
Auto['horsepower'].sum()


In [None]:


# Exercise 9: numerical summaries and plots for the Auto data.
#
# Everything below lives in ONE notebook cell. Within a single cell only the last
# expression is echoed, and pyplot calls keep drawing on the current Axes, so the
# summaries are printed explicitly and every plot is given its own Axes.


def _summarize(frame, columns):
    """Return max, min, mean and std for `columns` of `frame`, one row per column.

    The standard deviation uses ddof=0 so it matches np.std (pandas' own default
    is ddof=1). The pandas reductions skip missing values.
    """
    selected = frame[columns]
    return pd.DataFrame({
        'max': selected.max(),
        'min': selected.min(),
        'mean': selected.mean(),
        'std': selected.std(ddof=0),
    })


quantitative = ['mpg', 'displacement', 'horsepower', 'weight', 'acceleration']

print("All observations:")
print(_summarize(Auto, quantitative))

# Drop the rows labelled 10..85 (the end of range() is exclusive) and summarise
# what is left.
# NOTE(review): if the intent is "the 10th through 85th observations" counted
# from 1, the zero-based labels would be range(9, 85) - confirm against the exercise.
data = Auto.drop(labels=range(10, 86), axis=0)

print("\nAfter dropping rows 10-85:")
print(_summarize(data, quantitative))

# Box plot of mpg with one box per distinct horsepower value
fig, ax = subplots(figsize=(8, 8))
Auto.boxplot('mpg', by='horsepower', ax=ax)

# Scatter plots as (x column, y column) pairs; each pair gets its own panel so
# the point clouds and axis labels no longer overwrite one another.
scatter_pairs = [
    ('mpg', 'horsepower'),
    ('mpg', 'displacement'),
    ('mpg', 'weight'),
    ('mpg', 'acceleration'),
    ('displacement', 'horsepower'),
    ('weight', 'horsepower'),
    ('mpg', 'year'),
    ('mpg', 'cylinders'),
    ('mpg', 'origin'),
]
# 'mpg' is shown in upper case; every other column uses its capitalised name
axis_labels = {'mpg': 'MPG'}

fig, axes = subplots(3, 3, figsize=(15, 15))
for ax, (x_col, y_col) in zip(axes.flat, scatter_pairs):
    ax.scatter(Auto[x_col], Auto[y_col])
    ax.set_xlabel(axis_labels.get(x_col, x_col.capitalize()))
    ax.set_ylabel(axis_labels.get(y_col, y_col.capitalize()))
fig.tight_layout();

## Exercise 10

a) Load Boston data set

In [None]:
from ISLP import load_data

In [None]:
# Load the Boston data set through ISLP's load_data helper
Boston = load_data('Boston')

In [None]:
# Check what kind of object load_data returned
type(Boston)

Boston is of the correct type, dataframe

In [None]:
# Display the data
Boston

Description from [site](https://islp.readthedocs.io/en/latest/datasets/Boston.html):

A data set containing housing values in 506 suburbs of Boston.
     
- `crim`: per capita crime rate by town.

- `zn`: proportion of residential land zoned for lots over 25,000
          sq.ft.

- `indus`: proportion of non-retail business acres per town.

- `chas`: Charles River dummy variable (= 1 if tract bounds river; 0
          otherwise).

- `nox`: nitrogen oxides concentration (parts per 10 million).

- `rm`: average number of rooms per dwelling.

- `age`: proportion of owner-occupied units built prior to 1940.

- `dis`: weighted mean of distances to five Boston employment
          centres.

- `rad`: index of accessibility to radial highways.

- `tax`: full-value property-tax rate per $10,000.

- `ptratio`: pupil-teacher ratio by town.

- `lstat`: lower status of the population (percent).

- `medv`: median value of owner-occupied homes in $1000s.

b) To get the rows and columns I'll look at the shape

In [None]:
# (number of rows, number of columns)
Boston.shape

We have 506 rows representing towns in the Boston area and 13 columns representing their features.

c) Pairwise scatterplots

Look at columns

In [None]:
# List the column labels
Boston.columns

Make a scatterplot of a few

In [None]:
# Scatterplot matrix of the columns at integer positions 0, 3, 4 and 9
pd.plotting.scatter_matrix(Boston.iloc[:,[0,3,4,9]]);

d) Predictors associated with per capita crime rate?


From above, it looks like there may be a correlation with nox. Let's try a few more.

In [None]:
# Scatterplot matrix of the columns at positions 0 through 6
pd.plotting.scatter_matrix(Boston.iloc[:,[0,1,2,3,4,5,6]]);

In [None]:
# Column 0 again, now paired with the remaining columns at positions 7 through 12
pd.plotting.scatter_matrix(Boston.iloc[:,[0,7,8,9,10,11,12]]);

Potential: nox, rm, age, lstat, medv

In [None]:
# crim against nox; the trailing ';' suppresses the echoed return value, matching
# the other scatter_matrix cells in this notebook
pd.plotting.scatter_matrix(Boston[['crim', 'nox']]);

In [None]:
# crim against rm; the trailing ';' suppresses the echoed return value, matching
# the other scatter_matrix cells in this notebook
pd.plotting.scatter_matrix(Boston[['crim', 'rm']]);

In [None]:
# crim against lstat; the trailing ';' suppresses the echoed return value, matching
# the other scatter_matrix cells in this notebook
pd.plotting.scatter_matrix(Boston[['crim', 'lstat']]);

In [None]:
# crim against medv; the trailing ';' suppresses the echoed return value, matching
# the other scatter_matrix cells in this notebook
pd.plotting.scatter_matrix(Boston[['crim', 'medv']]);

e) Which suburbs have particularly high crime rates? Tax rates? Pupil-teacher ratios? Comment on the range of each predictor

Make boxplots to look for outliers

In [None]:
# One box plot per variable asked about in (e): crime rate, tax rate and
# pupil-teacher ratio. Each variable gets its own panel because their scales differ.
fig, axes = subplots(1, 3, figsize=(12, 6))
for ax, column in zip(axes, ['crim', 'tax', 'ptratio']):
    Boston.boxplot(column=column, ax=ax)
fig.tight_layout();

f)