In [None]:
### Basic Visualizations

### Scatter Plot

# scatter() takes the horizontal (X) values first and the vertical (Y) values second;
# they are passed by keyword here to make the roles explicit.
plt.scatter(x=df['displacement'], y=df['mpg'], color='b')
plt.xlabel("Engine Displacement")
plt.ylabel("MPG")
plt.show()


# Note: from here on the columns are read with the pandas attribute shortcut
# (df.displacement) instead of the bracket lookup (df['displacement']).
#
# A low alpha makes every marker translucent, so points that stack on top of
# each other build up into darker colors.
plt.scatter(x=df.displacement, y=df.mpg, color='b', alpha=.25)
plt.xlabel("Engine Displacement")
plt.ylabel("MPG")
plt.show()

####-------------- Side-by-Side Scatter Plots

# Configure the overall plotting canvas shared by both panels.
# The size is in inches: 10 inches across by 5 inches tall.
plt.figure(figsize=(10, 5))

# plt.subplot(rows, columns, number) sets up a rows x columns grid of panels and
# makes panel `number` (counted from 1) the target of the plotting calls that follow.

# Panel 1 of the 1x2 grid: fully opaque markers.
# When a dataset has overlapping points they can be difficult, if not impossible,
# to tell apart, which can hide valuable information.
plt.subplot(1, 2, 1)
plt.scatter(df.displacement, df.mpg, color='b')  # X value is first, Y is second
plt.xlabel("Engine Displacement")
plt.ylabel("MPG")

# Panel 2 of the 1x2 grid: same data with a lowered alpha, so overlapping points
# become darker and reveal where the data is dense.
plt.subplot(1, 2, 2)
plt.scatter(df.displacement, df.mpg, color='b', alpha=.25)
plt.xlabel("Engine Displacement")
plt.ylabel("MPG")

# Render the two-panel figure here. Without this call the figure stays current,
# and plotting commands that follow in the same cell are drawn into panel 2.
plt.show()


##### ----- With Trend Line

# Open a dedicated figure so the trend plot is not drawn into whatever axes
# happen to be current (for example the last panel of a preceding subplot grid).
plt.figure()

# First plot the raw data as translucent blue circles
plt.plot(df.displacement, df.mpg, 'bo', alpha=.25)

# Fit and overplot a 2nd order polynomial curve for the trend line
params = np.polyfit(df.displacement, df.mpg, 2)  # Least-squares coefficients of a 2nd order polynomial
# Generate 20 evenly spaced points from the min displacement up to 500.
# NOTE(review): 500 is a fixed upper bound; wherever it exceeds the largest
# displacement in df, the curve is an extrapolation beyond the data.
xp = np.linspace(df.displacement.min(), 500, 20)
yp = np.polyval(params, xp)  # Evaluate the polynomial at the xp points to get the y-axis values
plt.plot(xp, yp, 'k')        # Plot the fitted curve as a black line

# Overplot an error band:
# sig is the standard deviation of the residuals, i.e. the differences between
# the observed MPG values and the polynomial curve at the same displacement.
sig = np.std(df.mpg - np.polyval(params, df.displacement))
# Shade the area +/- one standard deviation above/below the line
plt.fill_between(xp, yp - sig, yp + sig, color='k', alpha=0.1)

plt.xlabel("Engine Displacement")
plt.ylabel("MPG")

# Render the figure explicitly; this also keeps the cell from echoing the
# Text object returned by plt.ylabel() as its output.
plt.show()

In [None]:
### Box Plots

# Silence FutureWarning messages. This only hides the warnings; it has no other
# effect on the code below.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# Extract the acceleration values for each engine cylinder count.
# Each selection is its own pandas Series (not a data frame) carrying its own
# descriptive statistics.
cylinder_counts = (3, 4, 5, 6, 8)
acceleration_groups = [df.acceleration[df.cylinders == count] for count in cylinder_counts]

# Keep the individual per-cylinder names available as well.
three_cyl, four_cyl, five_cyl, six_cyl, eight_cyl = acceleration_groups

plt.figure(figsize=(5, 5))  # Set the plot size

# One box per cylinder group, drawn left to right in the order listed above.
plt.boxplot(acceleration_groups)

plt.ylabel("Car Acceleration in Seconds")

# Boxes sit at x positions 1..5; label each with its cylinder count.
box_positions = list(range(1, len(cylinder_counts) + 1))
box_labels = [f"{count} Cyl" for count in cylinder_counts]
plt.xticks(box_positions, box_labels)

plt.ylim(5, 25)
plt.show()

In [None]:
####### Correlation

# Pull each column out

X = df['weight']  # Note this is a pandas Series
Y = df['mpg']     # Note this is a pandas Series

# Link X and Y as records using a Python dictionary;
# the dictionary keys become the column names of the new DataFrame.
weight_mpg_columns = {'weight': X, 'mpg': Y}

# Construct a DataFrame from the dictionary
X_Y = pd.DataFrame(weight_mpg_columns)

# Preview.
# Only the last expression of a notebook cell is displayed automatically, so
# intermediate results in this cell are printed explicitly.
print(X_Y.head())


# pandas Series .var()
# pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.var.html

print('Variance of Weight = ', round(X.var(), 3))
print('Variance of MPG = ', round(Y.var(), 3))

# pandas Series .cov( otherSeries )
# pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.cov.html
# X is weight and Y is mpg, so X.cov(Y) is the (weight, mpg) covariance.
# Covariance is symmetric, so both lines print the same value.

print('Covariance of (weight,mpg) = ', round(X.cov(Y), 3))
print('Covariance of (mpg,weight) = ', round(Y.cov(X), 3))

# pandas DataFrame .cov()
# pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.cov.html

print('Covariance Matrix')
print(X_Y.cov())


# Covariance matrix of the full data set.
# Restricted to numeric columns: on pandas >= 2.0, DataFrame.cov() raises if the
# frame contains non-numeric columns instead of silently dropping them.
# NOTE(review): assumes df may hold non-numeric columns - confirm against the loader.
print(df.select_dtypes(include='number').cov())


# Configure plotting dimensions for multiple plots.
# The size is in inches: 14 inches across and 14 inches high.
plt.figure(figsize=(14, 14))

# One entry per panel of the 2x2 grid, listed in plot-number order (1 to 4):
# (x column, y column, marker color, x-axis label, y-axis label)
# NOTE(review): the weight axis is labelled "tons" - confirm the unit of df['weight'].
panel_specs = [
    ('displacement', 'acceleration', 'b', "Displacement", "Acceleration"),
    ('displacement', 'horsepower', 'r', "Displacement", "Horsepower"),
    ('weight', 'mpg', 'g', "Weight (tons)", "MPG"),
    ('weight', 'horsepower', 'k', "Weight (tons)", "Horsepower"),
]

for plot_number, (x_col, y_col, marker_color, x_label, y_label) in enumerate(panel_specs, start=1):
    # In a 2x2 grid (m,n,number), select the panel to draw into
    plt.subplot(2, 2, plot_number)
    plt.scatter(df[x_col], df[y_col], color=marker_color, alpha=0.25)
    plt.xlabel(x_label)
    plt.ylabel(y_label)

plt.show()


#### -------------------------- Correlation ------------------------------------ ####
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.corr.html

# Pairwise correlation matrix of the data set.
# Printed explicitly because only the last expression of a notebook cell is
# displayed automatically. Restricted to numeric columns: on pandas >= 2.0,
# DataFrame.corr() raises if the frame contains non-numeric columns.
# NOTE(review): assumes df may hold non-numeric columns - confirm against the loader.
print(df.select_dtypes(include='number').corr())

# Two panels side by side on a 10 x 5 inch canvas.
plt.figure(figsize=(10, 5))

# Left panel: horsepower against weight.
# NOTE(review): the weight axis is labelled "tons" - confirm the unit of df['weight'].
plt.subplot(1, 2, 1)
plt.scatter(df['weight'], df['horsepower'], color='k', alpha=0.25)
plt.xlabel("Weight (tons)")
plt.ylabel("Horsepower")

# Right panel: horsepower against itself. Both axes use the same column, so
# every point falls on the diagonal.
plt.subplot(1, 2, 2)
plt.scatter(df['horsepower'], df['horsepower'], color='k', alpha=0.25)
plt.xlabel("Horsepower")
plt.ylabel("Horsepower")

plt.show()


# Weight against MPG, with MPG on the X axis.
plt.scatter(df['mpg'], df['weight'], color='k', alpha=0.25)
plt.xlabel("MPG")
plt.ylabel("Weight")
plt.show()

In [None]:
##### Regression https://indigo.sgn.missouri.edu/user/kkc.0669/notebooks/Archive/PSDS2200OP3-4_kkc.0669/Day5/labs/Regression.ipynb

from pandas.plotting import scatter_matrix

# Print the built-in documentation for scatter_matrix
help(scatter_matrix)


# Create a scatter matrix for just the first 6 columns of the data set
first_six_columns = df.iloc[:, :6]

scatter_matrix(
    first_six_columns,
    figsize=(14, 14),  # Set plot size
    diagonal='hist',
)
plt.show()

In [None]:
#### Correlation 
### Econ_2010: IC.FRM.BRIB.ZS (P: 1, R: .923310), 